def __init__(self, **kwargs):
    """
    Build the estimator from keyword arguments.

    Each keyword must be one of the parameter names recognised by this estimator. ``model_id`` is
    stored directly in ``self._id`` and ``self._parms``; every other recognised name is assigned
    through ``setattr``. A ``Lambda`` keyword is renamed to ``lambda_`` before the names are checked.

    :param kwargs: parameter values keyed by parameter name.
    :raises H2OValueError: if a keyword is not a recognised parameter name.
    """
    super(H2OGradientBoostingEstimator, self).__init__()
    self._parms = {}
    accepted = {
        "model_id", "training_frame", "validation_frame", "nfolds",
        "keep_cross_validation_models", "keep_cross_validation_predictions",
        "keep_cross_validation_fold_assignment", "score_each_iteration", "score_tree_interval",
        "fold_assignment", "fold_column", "response_column", "ignored_columns",
        "ignore_const_cols", "offset_column", "weights_column", "balance_classes",
        "class_sampling_factors", "max_after_balance_size", "max_confusion_matrix_size",
        "max_hit_ratio_k", "ntrees", "max_depth", "min_rows", "nbins", "nbins_top_level",
        "nbins_cats", "r2_stopping", "stopping_rounds", "stopping_metric",
        "stopping_tolerance", "max_runtime_secs", "seed", "build_tree_one_node",
        "learn_rate", "learn_rate_annealing", "distribution", "quantile_alpha",
        "tweedie_power", "huber_alpha", "checkpoint", "sample_rate",
        "sample_rate_per_class", "col_sample_rate", "col_sample_rate_change_per_level",
        "col_sample_rate_per_tree", "min_split_improvement", "histogram_type",
        "max_abs_leafnode_pred", "pred_noise_bandwidth", "categorical_encoding",
        "calibrate_model", "calibration_frame", "custom_metric_func",
        "export_checkpoints_dir", "monotone_constraints",
    }

    # The capitalised spelling is treated as an alias; the renamed key is validated like any other.
    if "Lambda" in kwargs:
        kwargs["lambda_"] = kwargs.pop("Lambda")

    for key, value in kwargs.items():
        if key == "model_id":
            self._id = value
            self._parms["model_id"] = value
            continue
        if key not in accepted:
            raise H2OValueError("Unknown parameter %s = %r" % (key, value))
        # Going through setattr(...) invokes the type-checking of the argument
        setattr(self, key, value)
def __init__(self, **kwargs):
    """
    Build the estimator from keyword arguments.

    Each keyword must be one of the parameter names recognised by this estimator. ``model_id`` is
    stored directly in ``self._id`` and ``self._parms``; every other recognised name is assigned
    through ``setattr``. A ``Lambda`` keyword is renamed to ``lambda_`` before the names are checked.

    :param kwargs: parameter values keyed by parameter name.
    :raises H2OValueError: if a keyword is not a recognised parameter name.
    """
    super(H2ODeepWaterEstimator, self).__init__()
    self._parms = {}
    accepted = {
        "model_id", "checkpoint", "autoencoder", "training_frame", "validation_frame",
        "nfolds", "balance_classes", "max_after_balance_size", "class_sampling_factors",
        "keep_cross_validation_predictions", "keep_cross_validation_fold_assignment",
        "fold_assignment", "fold_column", "response_column", "offset_column",
        "weights_column", "ignored_columns", "score_each_iteration", "categorical_encoding",
        "overwrite_with_best_model", "epochs", "train_samples_per_iteration",
        "target_ratio_comm_to_comp", "seed", "standardize", "learning_rate",
        "learning_rate_annealing", "momentum_start", "momentum_ramp", "momentum_stable",
        "distribution", "score_interval", "score_training_samples",
        "score_validation_samples", "score_duty_cycle", "classification_stop",
        "regression_stop", "stopping_rounds", "stopping_metric", "stopping_tolerance",
        "max_runtime_secs", "ignore_const_cols", "shuffle_training_data", "mini_batch_size",
        "clip_gradient", "network", "backend", "image_shape", "channels", "sparse", "gpu",
        "device_id", "network_definition_file", "network_parameters_file",
        "mean_image_file", "export_native_parameters_prefix", "activation", "hidden",
        "input_dropout_ratio", "hidden_dropout_ratios", "problem_type",
    }

    # The capitalised spelling is treated as an alias; the renamed key is validated like any other.
    if "Lambda" in kwargs:
        kwargs["lambda_"] = kwargs.pop("Lambda")

    for key, value in kwargs.items():
        if key == "model_id":
            self._id = value
            self._parms["model_id"] = value
            continue
        if key not in accepted:
            raise H2OValueError("Unknown parameter %s = %r" % (key, value))
        # Going through setattr(...) invokes the type-checking of the argument
        setattr(self, key, value)
def __init__(self, **kwargs):
    """
    Build the estimator from keyword arguments.

    Each keyword must be one of the parameter names recognised by this estimator. ``model_id`` is
    stored directly in ``self._id`` and ``self._parms``. ``pre_trained`` is assigned, then
    ``_determine_vec_size()`` is called and ``vec_size`` is re-assigned from its own current value.
    Every other recognised name is assigned through ``setattr``. A ``Lambda`` keyword is renamed to
    ``lambda_`` before the names are checked.

    :param kwargs: parameter values keyed by parameter name.
    :raises H2OValueError: if a keyword is not a recognised parameter name.
    """
    super(H2OWord2vecEstimator, self).__init__()
    self._parms = {}
    accepted = {
        "model_id", "training_frame", "min_word_freq", "word_model", "norm_model",
        "vec_size", "window_size", "sent_sample_rate", "init_learning_rate", "epochs",
        "pre_trained", "max_runtime_secs", "export_checkpoints_dir",
    }

    # The capitalised spelling is treated as an alias; the renamed key is validated like any other.
    if "Lambda" in kwargs:
        kwargs["lambda_"] = kwargs.pop("Lambda")

    for key, value in kwargs.items():
        if key == "model_id":
            self._id = value
            self._parms["model_id"] = value
            continue
        if key == "pre_trained":
            self.pre_trained = value
            self._determine_vec_size()
            # Push the value back through the vec_size attribute assignment.
            self.vec_size = self.vec_size
            continue
        if key not in accepted:
            raise H2OValueError("Unknown parameter %s = %r" % (key, value))
        # Going through setattr(...) invokes the type-checking of the argument
        setattr(self, key, value)
def __init__(self, **kwargs):
    """
    Build the estimator from keyword arguments.

    Each keyword must be one of the parameter names recognised by this estimator. ``model_id`` is
    stored directly in ``self._id`` and ``self._parms``; every other recognised name is assigned
    through ``setattr``. A ``Lambda`` keyword is renamed to ``lambda_`` before the names are checked.

    :param kwargs: parameter values keyed by parameter name.
    :raises H2OValueError: if a keyword is not a recognised parameter name.
    """
    super(H2OXGBoostEstimator, self).__init__()
    self._parms = {}
    accepted = {
        "model_id", "training_frame", "validation_frame", "nfolds",
        "keep_cross_validation_predictions", "keep_cross_validation_fold_assignment",
        "score_each_iteration", "fold_assignment", "fold_column", "response_column",
        "ignored_columns", "ignore_const_cols", "offset_column", "weights_column",
        "stopping_rounds", "stopping_metric", "stopping_tolerance", "max_runtime_secs",
        "seed", "distribution", "tweedie_power", "categorical_encoding", "quiet_mode",
        "ntrees", "max_depth", "min_rows", "min_child_weight", "learn_rate", "eta",
        "sample_rate", "subsample", "col_sample_rate", "colsample_bylevel",
        "col_sample_rate_per_tree", "colsample_bytree", "max_abs_leafnode_pred",
        "max_delta_step", "score_tree_interval", "min_split_improvement", "gamma",
        "nthread", "max_bins", "max_leaves", "min_sum_hessian_in_leaf",
        "min_data_in_leaf", "sample_type", "normalize_type", "rate_drop", "one_drop",
        "skip_drop", "tree_method", "grow_policy", "booster", "reg_lambda", "reg_alpha",
        "dmatrix_type", "backend", "gpu_id",
    }

    # The capitalised spelling is treated as an alias; the renamed key is validated like any other.
    if "Lambda" in kwargs:
        kwargs["lambda_"] = kwargs.pop("Lambda")

    for key, value in kwargs.items():
        if key == "model_id":
            self._id = value
            self._parms["model_id"] = value
            continue
        if key not in accepted:
            raise H2OValueError("Unknown parameter %s = %r" % (key, value))
        # Going through setattr(...) invokes the type-checking of the argument
        setattr(self, key, value)
def __init__(self, **kwargs):
    """
    Build the estimator from keyword arguments.

    Each keyword must be one of the parameter names recognised by this estimator. ``model_id`` is
    stored directly in ``self._id`` and ``self._parms``; every other recognised name is assigned
    through ``setattr``. A ``Lambda`` keyword is renamed to ``lambda_`` before the names are checked.

    :param kwargs: parameter values keyed by parameter name.
    :raises H2OValueError: if a keyword is not a recognised parameter name.
    """
    super(H2OKMeansEstimator, self).__init__()
    self._parms = {}
    accepted = {
        "model_id", "training_frame", "validation_frame", "nfolds",
        "keep_cross_validation_models", "keep_cross_validation_predictions",
        "keep_cross_validation_fold_assignment", "fold_assignment", "fold_column",
        "ignored_columns", "ignore_const_cols", "score_each_iteration", "k", "estimate_k",
        "user_points", "max_iterations", "standardize", "seed", "init",
        "max_runtime_secs", "categorical_encoding",
    }

    # The capitalised spelling is treated as an alias; the renamed key is validated like any other.
    if "Lambda" in kwargs:
        kwargs["lambda_"] = kwargs.pop("Lambda")

    for key, value in kwargs.items():
        if key == "model_id":
            self._id = value
            self._parms["model_id"] = value
            continue
        if key not in accepted:
            raise H2OValueError("Unknown parameter %s = %r" % (key, value))
        # Going through setattr(...) invokes the type-checking of the argument
        setattr(self, key, value)
def __init__(self, **kwargs):
    """
    Build the estimator from keyword arguments.

    Each keyword must be one of the parameter names recognised by this estimator. ``model_id`` is
    stored directly in ``self._id`` and ``self._parms``; every other recognised name is assigned
    through ``setattr``. A ``Lambda`` keyword is accepted as an alias of ``lambda_``.

    :param kwargs: parameter values keyed by parameter name.
    :raises H2OValueError: if a keyword is not a recognised parameter name.
    """
    super(H2OGeneralizedLinearEstimator, self).__init__()
    self._parms = {}
    accepted = {
        "model_id", "training_frame", "validation_frame", "nfolds", "seed",
        "keep_cross_validation_models", "keep_cross_validation_predictions",
        "keep_cross_validation_fold_assignment", "fold_assignment", "fold_column",
        "response_column", "ignored_columns", "ignore_const_cols", "score_each_iteration",
        "offset_column", "weights_column", "family", "tweedie_variance_power",
        "tweedie_link_power", "theta", "solver", "alpha", "lambda_", "lambda_search",
        "early_stopping", "nlambdas", "standardize", "missing_values_handling",
        "compute_p_values", "remove_collinear_columns", "intercept", "non_negative",
        "max_iterations", "objective_epsilon", "beta_epsilon", "gradient_epsilon", "link",
        "prior", "lambda_min_ratio", "beta_constraints", "max_active_predictors",
        "interactions", "interaction_pairs", "obj_reg", "export_checkpoints_dir",
        "balance_classes", "class_sampling_factors", "max_after_balance_size",
        "max_confusion_matrix_size", "max_hit_ratio_k", "max_runtime_secs",
        "custom_metric_func",
    }

    # Rename the capitalised spelling so it is validated and stored as ``lambda_``.
    if "Lambda" in kwargs:
        kwargs["lambda_"] = kwargs.pop("Lambda")

    for key, value in kwargs.items():
        if key == "model_id":
            self._id = value
            self._parms["model_id"] = value
            continue
        if key not in accepted:
            raise H2OValueError("Unknown parameter %s = %r" % (key, value))
        # Going through setattr(...) invokes the type-checking of the argument
        setattr(self, key, value)
def varimp_plot(self, num_of_features=None, server=False):
    """
    Plot the variable importance for a trained model.

    GLM models are redirected to ``std_coef_plot()``. GBM, DRF and Deep Learning models get a
    horizontal bar chart built from ``self.varimp(use_pandas=False)``. Any other algorithm is
    rejected before a figure is created.

    :param num_of_features: the number of features shown in the plot; ``None`` shows all of them.
    :param server: forwarded to ``_get_matplotlib_pyplot``; when True, ``plt.show()`` is not called.

    :returns: None.
    :raises H2OValueError: if the model's algorithm is not one of glm, gbm, drf or deeplearning.
    """
    assert_is_type(num_of_features, None, int)
    assert_is_type(server, bool)

    plt = _get_matplotlib_pyplot(server)
    if not plt:
        return

    algo = self._model_json["algo"]

    # GLM has no variable importances here; fall back to the standardized coefficient plot
    if algo == "glm":
        print("Variable importance does not apply to GLM. Will use std_coef_plot() instead.")
        self.std_coef_plot(num_of_features)
        return

    # Reject unsupported algorithms up front so that no figure is left behind by the error path.
    # (For deeplearning the original code notes that variable_importances must be enabled.)
    titles = {
        "gbm": "Variable Importance: H2O GBM",
        "drf": "Variable Importance: H2O DRF",
        "deeplearning": "Variable Importance: H2O Deep Learning",
    }
    if algo not in titles:
        raise H2OValueError("A variable importances plot is not implemented for this type of model")

    # get the variable importances as a list of tuples, do not use pandas dataframe
    importances = self.varimp(use_pandas=False)
    # feature labels are the first element (index 0) of each tuple
    feature_labels = [tup[0] for tup in importances]
    # bar lengths are the third element (index 2) of each tuple
    scaled_importances = [tup[2] for tup in importances]
    # bar centers on the y axis, reversed so that the first entry is drawn at the top
    pos = range(len(feature_labels))[::-1]

    if num_of_features is None:
        num_of_features = len(scaled_importances)

    _, ax = plt.subplots(1, 1, figsize=(14, 10))
    plt.barh(pos[0:num_of_features], scaled_importances[0:num_of_features], align="center",
             height=0.8, color="#1F77B4", edgecolor="none")
    # Hide the right and top spines, color others grey
    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)
    ax.spines["bottom"].set_color("#7B7B7B")
    ax.spines["left"].set_color("#7B7B7B")
    # Only show ticks on the left and bottom spines
    ax.yaxis.set_ticks_position("left")
    ax.xaxis.set_ticks_position("bottom")
    plt.yticks(pos[0:num_of_features], feature_labels[0:num_of_features])
    ax.margins(y=0.5)

    plt.title(titles[algo], fontsize=20)
    if not server:
        plt.show()
def partial_plot(self, data, cols, destination_key=None, nbins=20, plot=True, figsize=(7,10), server=False):
    """
    Create partial dependence plot which gives a graphical depiction of the marginal effect of a variable
    on the response. The effect of a variable is measured in change in the mean response.

    :param H2OFrame data: An H2OFrame object used for scoring and constructing the plot.
    :param cols: Feature(s) for which partial dependence will be calculated.
    :param destination_key: A key reference to the created partial dependence tables in H2O.
    :param nbins: Number of bins used.
    :param plot: A boolean specifying whether to plot partial dependence table.
    :param figsize: Dimension/size of the returning plots, adjust to fit your output cells.
    :param server: forwarded to ``_get_matplotlib_pyplot`` when plotting.

    :return: The list of partial dependence tables, one per requested feature. Returns None when
        ``plot`` is True but ``_get_matplotlib_pyplot`` yields nothing.
    :raises ValueError: if ``data`` is not an H2OFrame.
    :raises H2OValueError: if one of ``cols`` is not a column of ``data``.
    """
    if not isinstance(data, h2o.H2OFrame):
        raise ValueError("data must be an instance of H2OFrame")
    assert_is_type(cols, [str])
    assert_is_type(destination_key, None, str)
    assert_is_type(nbins, int)
    assert_is_type(plot, bool)
    assert_is_type(figsize, (int,int))

    # Check that the requested columns exist in the frame
    for xi in cols:
        if xi not in data.names:
            raise H2OValueError("Column %s does not exist in the training frame" % xi)

    request = {
        'cols': cols,
        'model_id': self.model_id,
        'frame_id': data.frame_id,
        'nbins': nbins,
        'destination_key': destination_key,
    }
    job = H2OJob(h2o.api("POST /3/PartialDependence/", data=request), job_type="PartialDependencePlot").poll()
    response = h2o.api("GET /3/PartialDependence/%s" % job.dest_key)

    # Extract partial dependence data from the response
    pps = response['partial_dependence_data']

    # Plot partial dependence plots using matplotlib
    if plot:
        plt = _get_matplotlib_pyplot(server)
        if not plt:
            return

        fig, axs = plt.subplots(len(cols), squeeze=False, figsize=figsize)
        for i, pp in enumerate(pps):
            # Categorical columns are drawn as points over integer positions labelled with the
            # levels; numeric columns are drawn as a line over their own x values.
            col = cols[i]
            cat = data[col].isfactor()[0]
            axis = axs[i, 0]
            if cat:
                labels = pp[0]
                x = range(len(labels))
                y = pp[1]
                axis.plot(x, y, 'o')
                axis.set_xticks(x)
                axis.set_xticklabels(labels)
                axis.margins(0.2)
            else:
                axis.plot(pp[0], pp[1])
                axis.set_xlim(min(pp[0]), max(pp[0]))

            axis.set_title('Partial Dependence Plot For {}'.format(col))
            axis.set_xlabel(pp.col_header[0])
            axis.set_ylabel(pp.col_header[1])
            axis.xaxis.grid()
            axis.yaxis.grid()
        # Tighten the layout only when several subplots are stacked. The test is on the number
        # of requested columns, not on the length of the last column's name.
        if len(cols) > 1:
            fig.tight_layout(pad = 0.4,w_pad=0.5, h_pad=1.0)

    return pps
def _plot(self, timestep, metric, server=False):
    """
    Draw the scoring history of this model with matplotlib.

    GLM models are always plotted against ``iteration``, with ``metric`` limited to
    ``log_likelihood`` (used for "AUTO") or ``objective``. For gbm/drf and for the deep learning
    family, the ``training_<metric>`` column is plotted, together with ``validation_<metric>``
    when that column is present in the scoring history.

    :param timestep: x-axis column; "AUTO" picks ``number_of_trees`` for gbm/drf and ``epochs``
        otherwise. Ignored for GLM.
    :param metric: name of the metric to plot.
    :param server: forwarded to ``_get_matplotlib_pyplot``; when True, ``plt.show()`` is not called.
    :raises H2OValueError: for an unsupported GLM metric or an unsupported algorithm.
    """
    plt = _get_matplotlib_pyplot(server)
    if not plt:
        return

    history = self.scoring_history()
    algo = self._model_json["algo"]

    if algo == "glm":
        # GLM output differs from the other algos: the only timestep option is `iteration`
        timestep = "iteration"
        if metric == "AUTO":
            metric = "log_likelihood"
        elif metric not in ("log_likelihood", "objective"):
            raise H2OValueError("for GLM, metric must be one of: log_likelihood, objective")
        plt.xlabel(timestep)
        plt.ylabel(metric)
        plt.title("Validation Scoring History")
        plt.plot(history[timestep], history[metric])

    elif algo in ("deeplearning", "deepwater", "drf", "gbm"):
        # Resolve the timestep
        if algo in ("gbm", "drf"):
            assert_is_type(timestep, "AUTO", "duration", "number_of_trees")
            if timestep == "AUTO":
                timestep = "number_of_trees"
        else:
            # Drop the first row of the DL scoring history since it contains NAs & NaNs
            if history["samples"][0] == 0:
                history = history[1:]
            assert_is_type(timestep, "AUTO", "epochs", "samples", "duration")
            if timestep == "AUTO":
                timestep = "epochs"

        train_col = "training_{}".format(metric)
        valid_col = "validation_{}".format(metric)

        if timestep == "duration":
            # Add a "duration_<unit>" column holding the first token of each duration string;
            # the unit is the second token of the entry at position/label 1.
            unit_col = "duration_{}".format(history["duration"][1].split()[1])
            history[unit_col] = [str(entry).split()[0] for entry in history["duration"]]
            timestep = unit_col

        # Work out whether a validation column exists and the y-axis range to use
        if can_use_pandas():
            has_validation = valid_col in list(history)
            if has_validation:
                ylim = (history[[train_col, valid_col]].min().min(),
                        history[[train_col, valid_col]].max().max())
            else:
                ylim = (history[train_col].min(), history[train_col].max())
        else:
            has_validation = valid_col in history.col_header
            if has_validation:
                ylim = (min(min(history[[train_col, valid_col]])),
                        max(max(history[[train_col, valid_col]])))
            else:
                ylim = (min(history[train_col]), max(history[train_col]))
        # A degenerate range falls back to the unit interval
        if ylim[0] == ylim[1]:
            ylim = (0, 1)

        plt.xlabel(timestep)
        if has_validation:
            # Training and validation scoring history
            plt.ylabel(metric)
            plt.title("Scoring History")
            plt.ylim(ylim)
            plt.plot(history[timestep], history[train_col], label="Training")
            plt.plot(history[timestep], history[valid_col], color="orange", label="Validation")
            plt.legend()
        else:
            # Training scoring history only
            plt.ylabel(train_col)
            plt.title("Training Scoring History")
            plt.ylim(ylim)
            plt.plot(history[timestep], history[train_col])

    else:
        # algo is not glm, deeplearning, deepwater, drf or gbm
        raise H2OValueError("Plotting not implemented for this type of model")

    if not server:
        plt.show()
def train(self, x=None, y=None, training_frame=None, fold_column=None,
          weights_column=None, validation_frame=None, leaderboard_frame=None, blending_frame=None):
    """
    Begins an AutoML task, a background task that automatically builds a number of models
    with various algorithms and tracks their performance in a leaderboard. At any point
    in the process you may use H2O's performance or prediction functions on the resulting
    models.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column. May be omitted when a
        response column has already been set on this object.
    :param fold_column: The name or index of the column in training_frame that holds per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds per-row weights.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold_column or weights_column).
    :param validation_frame: H2OFrame with validation data. This argument is ignored unless the user sets
        nfolds = 0. If cross-validation is turned off, then a validation frame can be specified and used
        for early stopping of individual models and early stopping of the grid searches. By default and
        when nfolds > 1, cross-validation metrics will be used for early stopping and thus validation_frame will be ignored.
    :param leaderboard_frame: H2OFrame with test data for scoring the leaderboard. This is optional and
        if this is set to None (the default), then cross-validation metrics will be used to generate the leaderboard
        rankings instead.
    :param blending_frame: H2OFrame used to train the metalearning algorithm in Stacked Ensembles (instead of relying on cross-validated predicted values).
        This is optional, but when provided, it is also recommended to disable cross validation
        by setting `nfolds=0` and to provide a leaderboard frame for scoring purposes.

    :returns: The value of ``self.leader`` once the job has finished and results have been fetched.
    :raises H2OValueError: if no response column is available, or a referenced column does not exist.
    :raises H2OResponseError: if the backend response contains no job.

    :examples:
    >>> # Set up an H2OAutoML object
    >>> aml = H2OAutoML(max_runtime_secs=30)
    >>> # Launch an AutoML run
    >>> aml.train(y=y, training_frame=train)
    """
    # Minimal required arguments are training_frame and y (response)
    self.training_frame = training_frame

    ncols = self.training_frame.ncols
    names = self.training_frame.names

    if y is None and self.response_column is None:
        raise H2OValueError(
            'The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.'
        )
    elif y is not None:
        assert_is_type(y, int, str)
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError(
                    "Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError(
                    "Column %s does not exist in the training frame" % y)
        self.response_column = y

    self.fold_column = fold_column
    self.weights_column = weights_column

    self.validation_frame = validation_frame
    self.leaderboard_frame = leaderboard_frame
    self.blending_frame = blending_frame

    if x is not None:
        assert_is_type(x, list)
        xset = set()
        for xi in x:
            if is_type(xi, int):
                if not (-ncols <= xi < ncols):
                    raise H2OValueError(
                        "Column %d does not exist in the training frame" % xi)
                xset.add(names[xi])
            else:
                if xi not in names:
                    raise H2OValueError(
                        "Column %s not in the training frame" % xi)
                xset.add(xi)
        # When y was not passed, the response was set on the object beforehand; it must be
        # kept out of the ignored columns just like an explicitly passed y.
        response = y if y is not None else self.response_column
        ignored_columns = set(names) - xset
        for col in (response, fold_column, weights_column):
            if col is not None:
                ignored_columns.discard(col)
        self.input_spec['ignored_columns'] = list(ignored_columns)

    def clean_params(params):
        # Recursively drop None-valued entries from dicts; leaf values go through _keyify.
        return ({
            k: clean_params(v)
            for k, v in params.items() if v is not None
        } if isinstance(params, dict) else H2OEstimator._keyify(params))

    automl_build_params = clean_params(
        dict(
            build_control=self.build_control,
            build_models=self.build_models,
            input_spec=self.input_spec,
        ))

    resp = self._build_resp = h2o.api('POST /99/AutoMLBuilder',
                                      json=automl_build_params)
    if 'job' not in resp:
        raise H2OResponseError(
            "Backend failed to build the AutoML job: {}".format(resp))

    # Adopt the project name reported by the backend when none was chosen locally
    if not self.project_name:
        self.project_name = resp['build_control']['project_name']
    self.__frozen = True

    self._job = H2OJob(resp['job'], "AutoML")
    poll_updates = ft.partial(self._poll_training_updates,
                              verbosity=self._verbosity,
                              state={})
    try:
        self._job.poll(poll_updates=poll_updates)
    finally:
        # Run the update callback one last time even if polling was interrupted
        poll_updates(self._job, 1)

    self._fetch()
    return self.leader
def __init__(self,
             algorithm,
             min_rule_len=1,
             max_rule_len=10,
             max_num_rules=None,
             nfolds=5,
             seed=-1,
             tree_params=None,
             glm_params=None):
    """
    Store the rule-fit settings and sanitise the per-model parameter dictionaries.

    Keys that this class controls itself are removed from ``tree_params`` / ``glm_params`` (with a
    warning where one was already emitted before). The dictionaries passed by the caller are
    copied first and are never modified.

    :param algorithm: one of "DRF", "XGBoost" or "GBM".
    :param min_rule_len: stored as ``self.min_rule_len``; replaced by ``tree_params["max_depth"]``
        when that key is present.
    :param max_rule_len: stored as ``self.max_rule_len``; replaced by ``tree_params["max_depth"]``
        when that key is present.
    :param max_num_rules: stored as ``self.max_num_rules``; replaced by
        ``glm_params["max_active_predictors"] - 1`` when that key is present.
    :param nfolds: stored as ``self.nfolds``.
    :param seed: stored as ``self.seed``.
    :param tree_params: optional dict of extra tree-model parameters. ``model_id``, ``max_depth``,
        ``nfolds`` and ``seed`` are stripped from the stored copy.
    :param glm_params: optional dict of extra GLM parameters. ``model_id``,
        ``max_active_predictors``, ``nfolds``, ``seed``, ``alpha`` and ``lambda_`` are stripped from
        the stored copy.
    :raises H2OValueError: if ``algorithm`` is not supported.
    """
    if algorithm not in ["DRF", "XGBoost", "GBM"]:
        raise H2OValueError(
            "{} is not a supported algorithm".format(algorithm))
    self.algorithm = algorithm
    self.min_rule_len = min_rule_len
    self.max_rule_len = max_rule_len
    self.max_num_rules = max_num_rules
    self.nfolds = nfolds
    self.seed = seed

    # Work on private copies: no default dict shared between instances and no mutation of
    # the caller's dictionaries.
    tree_params = dict(tree_params) if tree_params else {}
    glm_params = dict(glm_params) if glm_params else {}

    tree_params.pop("model_id", None)
    if 'max_depth' in tree_params:
        max_depth = tree_params.pop("max_depth")
        self.min_rule_len = max_depth
        self.max_rule_len = max_depth
        warnings.warn(
            'max_depth provided in tree_params - min_rule_len and max_rule_len will be ignored'
        )
    for ignored in ('nfolds', 'seed'):
        if ignored in tree_params:
            tree_params.pop(ignored)
            warnings.warn(
                '%s provided in tree_params but will be ignored' % ignored)

    glm_params.pop("model_id", None)
    if 'max_active_predictors' in glm_params:
        self.max_num_rules = glm_params.pop("max_active_predictors") - 1
        warnings.warn(
            'max_active_predictors provided in glm_params - max_num_rules will be ignored'
        )
    for ignored in ('nfolds', 'seed'):
        if ignored in glm_params:
            glm_params.pop(ignored)
            warnings.warn(
                '%s provided in glm_params but will be ignored' % ignored)
    if 'alpha' in glm_params:
        glm_params.pop('alpha')
        warnings.warn('alpha ignored - set to 1 by rulefit')
    if 'lambda_' in glm_params:
        glm_params.pop('lambda_')
        warnings.warn('lambda_ ignored by rulefit')

    self.tree_params = tree_params
    self.glm_params = glm_params
def _train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
           weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
           model_id=None, verbose=False, extend_parms_fn=None):
    """
    Validate the training arguments, assemble the model-builder parameters and start the build.

    The parameters start as a copy of ``self._parms`` and are then completed from the arguments.
    The build is started with a POST to ``/<rest_ver>/ModelBuilders/<algo>``. When ``self._future``
    is set, the job is stored on the object and the method returns immediately; otherwise the job
    is polled to completion and the resulting model JSON is handed to ``self._resolve_model``.

    :param x: predictor column(s), given as a name, an index, or a list/set of them.
    :param y: response column name or index; defaults to "response" for supervised models and is
        ignored (reset to None) for unsupervised ones.
    :param training_frame: frame to train on; when omitted, ``self.training_frame`` is used if set.
    :param offset_column: name or index of the offset column.
    :param fold_column: name or index of the fold column.
    :param weights_column: name or index of the weights column.
    :param validation_frame: optional validation frame.
    :param max_runtime_secs: when given, overrides ``max_runtime_secs`` in the parameters.
    :param ignored_columns: columns to exclude; cannot be combined with ``x``.
    :param model_id: when given, overrides ``model_id`` in the parameters.
    :param verbose: when True, scoring history is printed while polling; only allowed for drf,
        gbm, deeplearning and xgboost.
    :param extend_parms_fn: optional function called with the parameter dict so that it can be
        extended before submission.
    :raises H2OValueError: for an invalid ``verbose`` setting, unknown columns, ``x`` combined with
        ``ignored_columns``, or an ``r2`` stopping metric.
    :raises ValueError: if ``y`` is given for an autoencoder or the response is missing.
    """
    has_default_training_frame = hasattr(self, 'training_frame') and self.training_frame is not None
    training_frame = H2OFrame._validate(training_frame, 'training_frame',
                                        required=self._requires_training_frame() and not has_default_training_frame)
    validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    assert_is_type(ignored_columns, None, [str, int], {str, int})
    assert_is_type(offset_column, None, int, str)
    assert_is_type(fold_column, None, int, str)
    assert_is_type(weights_column, None, int, str)
    assert_is_type(max_runtime_secs, None, numeric)
    assert_is_type(model_id, None, str)
    assert_is_type(verbose, bool)
    assert_is_type(extend_parms_fn, None, FunctionType)

    # An explicitly passed frame overrides the one stored on the estimator; without it the
    # frame-related arguments are verified and the stored frame (if any) is used instead.
    override_default_training_frame = training_frame is not None
    if not override_default_training_frame:
        self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)
        training_frame = self.training_frame if has_default_training_frame else None

    algo = self.algo
    if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
        raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")
    parms = self._parms.copy()
    # PCA gets k = 1 when no k was configured
    if algo=="pca" and "k" not in parms.keys():
        parms["k"] = 1
    if "__class__" in parms:  # FIXME: hackt for PY3
        del parms["__class__"]
    is_auto_encoder = bool(parms.get("autoencoder"))
    is_supervised = not(is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec", "isolationforest", "generic"})

    # Frame metadata; empty placeholders when there is no training frame at all
    names = training_frame.names if training_frame is not None else []
    ncols = training_frame.ncols if training_frame is not None else 0
    types = training_frame.types if training_frame is not None else {}

    if is_supervised:
        if y is None:
            y = "response"
        # Resolve an integer index to a column name, validating either form
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)
        # An "enum" response type makes this a classifier, anything else a regressor
        self._estimator_type = "classifier" if types[y] == "enum" else "regressor"
    else:
        # If `y` is provided for an unsupervised model we'll simply ignore
        # it. This way an unsupervised model can be used as a step in
        # sklearn's pipeline.
        y = None

    if override_default_training_frame:
        assert_is_type(y, str, None)
        ignored_columns_set = set()
        # Fall back to ignored columns configured on the estimator
        if ignored_columns is None and "ignored_columns" in parms:
            ignored_columns = parms['ignored_columns']
        if ignored_columns is not None:
            if x is not None:
                raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
            for ic in ignored_columns:
                if is_type(ic, int):
                    if not (-ncols <= ic < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % ic)
                    ignored_columns_set.add(names[ic])
                else:
                    if ic not in names:
                        raise H2OValueError("Column %s not in the training frame" % ic)
                    ignored_columns_set.add(ic)
        if x is None:
            # Default predictors: every column except the response and the ignored ones
            xset = set(names) - {y} - ignored_columns_set
        else:
            xset = set()
            if is_type(x, int, str):
                x = [x]
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
        x = list(xset)
        self._check_and_save_parm(parms, "offset_column", offset_column)
        self._check_and_save_parm(parms, "weights_column", weights_column)
        self._check_and_save_parm(parms, "fold_column", fold_column)

    if max_runtime_secs is not None:
        parms["max_runtime_secs"] = max_runtime_secs

    # Overwrites the model_id parameter only if model_id is passed
    if model_id is not None:
        parms["model_id"] = model_id

    # Step 2
    is_auto_encoder = "autoencoder" in parms and parms["autoencoder"]
    is_unsupervised = is_auto_encoder or self.algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec", "isolationforest"}
    if is_auto_encoder and y is not None:
        raise ValueError("y should not be specified for autoencoder.")
    if not is_unsupervised and y is None and self.algo not in ["generic"]:
        raise ValueError("Missing response")

    # Step 3
    if override_default_training_frame:
        parms["training_frame"] = training_frame
        offset = parms["offset_column"]
        folds = parms["fold_column"]
        weights = parms["weights_column"]

    if validation_frame is not None:
        parms["validation_frame"] = validation_frame
    if is_type(y, int):
        y = names[y]
    if y is not None:
        parms["response_column"] = y
    # x is still a bare value (e.g. None) when no frame was passed; wrap it so it can be indexed
    if not isinstance(x, (list, tuple)):
        x = [x]
    if is_type(x[0], int):
        x = [names[i] for i in x]
    if override_default_training_frame:
        # Everything that is neither a predictor nor one of the special columns is ignored
        ignored_columns = list(set(names) - set(x + [y, offset, folds, weights] + self._additional_used_columns(parms)))
        parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
    parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None else
                             [quoted(col) for col in parms["interactions"]])
    parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None else
                                  [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])

    # internal hook allowing subclasses to extend train parms
    if extend_parms_fn is not None:
        extend_parms_fn(parms)

    parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
    if ("stopping_metric" in parms.keys()) and ("r2" in parms["stopping_metric"]):
        raise H2OValueError("r2 cannot be used as an early stopping_metric yet. Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
    # "_rest_version" is an internal entry: it selects the REST version (default 3) and is not sent
    rest_ver = parms.pop("_rest_version") if "_rest_version" in parms else 3

    model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
    model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

    # With self._future set, only remember the job and return without waiting for it
    if self._future:
        self._job = model
        self._rest_version = rest_ver
        return

    model.poll(poll_updates=self._print_model_scoring_history if verbose else None)
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
    self._resolve_model(model.dest_key, model_json)
def train(self, x=None, y=None, training_frame=None, fold_column=None,
          weights_column=None, validation_frame=None, leaderboard_frame=None, blending_frame=None):
    """
    Launch an AutoML run on the backend and wait for it to finish.

    An AutoML run builds a number of models with various algorithms and tracks their performance
    in a leaderboard. The input specification is assembled from the arguments and posted, together
    with ``self.build_control`` and ``self.build_models``, to the AutoML builder endpoint. The
    resulting job is polled until completion and the results are loaded with ``self._fetch()``.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column (required).
    :param fold_column: The name or index of the column in training_frame that holds per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds per-row weights.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold_column or weights_column).
    :param validation_frame: H2OFrame with validation data. This argument is ignored unless the user sets
        nfolds = 0. If cross-validation is turned off, then a validation frame can be specified and used
        for early stopping of individual models and early stopping of the grid searches. By default and
        when nfolds > 1, cross-validation metrics will be used for early stopping and thus validation_frame will be ignored.
    :param leaderboard_frame: H2OFrame with test data for scoring the leaderboard. This is optional and
        if this is set to None (the default), then cross-validation metrics will be used to generate the leaderboard
        rankings instead.
    :param blending_frame: H2OFrame used to train the metalearning algorithm in Stacked Ensembles (instead of relying on cross-validated predicted values).
        This is optional, but when provided, it is also recommended to disable cross validation
        by setting `nfolds=0` and to provide a leaderboard frame for scoring purposes.

    :returns: None. If the backend response carries no job, the response is printed and the method
        returns early.
    :raises H2OValueError: if ``y`` is missing or a referenced column does not exist.

    :examples:
    >>> # Set up an H2OAutoML object
    >>> aml = H2OAutoML(max_runtime_secs=30)
    >>> # Launch an AutoML run
    >>> aml.train(y=y, training_frame=train)
    """
    training_frame = H2OFrame._validate(training_frame, 'training_frame', required=True)
    ncols = training_frame.ncols
    names = training_frame.names

    # Minimal required arguments are training_frame and y (response)
    if y is None:
        raise H2OValueError('The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.')
    assert_is_type(y,int,str)
    if is_type(y, int):
        if not (-ncols <= y < ncols):
            raise H2OValueError("Column %d does not exist in the training frame" % y)
        y = names[y]
    elif y not in names:
        raise H2OValueError("Column %s does not exist in the training frame" % y)

    input_spec = {
        'response_column': y,
        'training_frame': training_frame.frame_id,
    }

    if fold_column is not None:
        assert_is_type(fold_column,int,str)
        input_spec['fold_column'] = fold_column

    if weights_column is not None:
        assert_is_type(weights_column,int,str)
        input_spec['weights_column'] = weights_column

    # Optional frames are validated and referenced by their frame id
    optional_frames = (
        ('validation_frame', validation_frame),
        ('leaderboard_frame', leaderboard_frame),
        ('blending_frame', blending_frame),
    )
    for spec_key, frame in optional_frames:
        if frame is not None:
            frame = H2OFrame._validate(frame, spec_key)
            input_spec[spec_key] = frame.frame_id

    if self.sort_metric is not None:
        assert_is_type(self.sort_metric, str)
        sort_metric = self.sort_metric.lower()
        # Changed the API to use "deviance" to be consistent with stopping_metric values
        # TO DO: let's change the backend to use "deviance" since we use the term "deviance"
        # After that we can take this `if` statement out
        if sort_metric == "deviance":
            sort_metric = "mean_residual_deviance"
        input_spec['sort_metric'] = sort_metric

    if x is not None:
        assert_is_type(x,list)
        predictors = set()
        for xi in x:
            if is_type(xi, int):
                if not (-ncols <= xi < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % xi)
                predictors.add(names[xi])
            else:
                if xi not in names:
                    raise H2OValueError("Column %s not in the training frame" % xi)
                predictors.add(xi)
        x = list(predictors)
        # Everything that is not the response, a predictor, the fold column or the weights
        # column is reported to the backend as ignored.
        ignored = set(names) - {y} - predictors
        if fold_column is not None:
            ignored.discard(fold_column)
        if weights_column is not None:
            ignored.discard(weights_column)
        input_spec['ignored_columns'] = list(ignored)

    # NOTE: if the user hasn't specified some block of parameters don't send them!
    # This lets the back end use the defaults.
    automl_build_params = {
        'input_spec': input_spec,
        'build_control': self.build_control,
        'build_models': self.build_models,
    }

    resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
    if 'job' not in resp:
        print("Exception from the back end: ")
        print(resp)
        return

    # Adopt the project name chosen by the backend when none was set locally
    if not self.project_name:
        backend_project = resp['build_control']['project_name']
        self.build_control['project_name'] = backend_project
        self.project_name = backend_project

    self._job = H2OJob(resp['job'], "AutoML")
    poll_updates = ft.partial(self._poll_training_updates, verbosity=self._verbosity, state={})
    try:
        self._job.poll(poll_updates=poll_updates)
    finally:
        # Run the update callback one last time even if polling was interrupted
        poll_updates(self._job, 1)

    self._fetch()
def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
          weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
          **ignored):
    """
    Validate the training arguments, resolve the column references and build the model.

    Parameters
    ----------
    x : int, str, list, set, None
        Name(s) or index(es) of the predictor columns. When None, every column of the
        training frame except the response and the ignored columns is used.
    y : int, str, None
        Index or name of the response column. For supervised algorithms it defaults to
        "response"; for unsupervised ones it must be None.
    training_frame : H2OFrame
        The H2OFrame holding the columns referenced by x and y (and by the offset, fold
        and weights arguments).
    offset_column : int, str, optional
        The name or index of the column in training_frame that holds the offsets.
    fold_column : int, str, optional
        The name or index of the column in training_frame that holds the per-row fold
        assignments.
    weights_column : int, str, optional
        The name or index of the column in training_frame that holds the per-row weights.
    validation_frame : H2OFrame, optional
        H2OFrame with validation data to be scored on while training.
    max_runtime_secs : float, optional
        Maximum allowed runtime in seconds for model training.
    ignored_columns : list, set, optional
        Names or indices of columns to leave out of the predictors. Cannot be combined
        with x.
    **ignored
        Any further keyword arguments are accepted and not used.

    Raises
    ------
    H2OValueError
        If a referenced column is not in the training frame, if y is given for an
        unsupervised model, or if both x and ignored_columns are given.
    """
    assert_is_type(training_frame, H2OFrame)
    assert_is_type(validation_frame, None, H2OFrame)
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    assert_is_type(ignored_columns, None, [str, int], {str, int})
    assert_is_type(offset_column, None, int, str)
    assert_is_type(fold_column, None, int, str)
    assert_is_type(weights_column, None, int, str)
    assert_is_type(max_runtime_secs, None, numeric)

    algo = self.algo
    parms = self._parms.copy()
    parms.pop("__class__", None)  # FIXME: hackt for PY3
    is_auto_encoder = bool(parms.get("autoencoder"))
    unsupervised = is_auto_encoder or algo in {"pca", "svd", "kmeans", "glrm"}
    ncols = training_frame.ncols
    names = training_frame.names

    def _to_name(ref, unknown_name_msg):
        # Turn a column index or name into a validated column name.
        if is_type(ref, int):
            if not (-ncols <= ref < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % ref)
            return names[ref]
        if ref not in names:
            raise H2OValueError(unknown_name_msg % ref)
        return ref

    if not unsupervised:
        if y is None:
            y = "response"
        y = _to_name(y, "Column %s does not exist in the training frame")
        response_is_enum = training_frame.types[y] == "enum"
        self._estimator_type = "classifier" if response_is_enum else "regressor"
    elif y is not None:
        raise H2OValueError("y should not be provided for an unsupervised model")
    assert_is_type(y, str, None)

    excluded = set()
    if ignored_columns is not None:
        if x is not None:
            raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
        for ignored_ref in ignored_columns:
            excluded.add(_to_name(ignored_ref, "Column %s not in the training frame"))

    if x is None:
        predictors = set(names) - {y} - excluded
    else:
        if is_type(x, int, str):
            x = [x]
        predictors = {_to_name(ref, "Column %s not in the training frame") for ref in x}

    parms["x"] = list(predictors)
    parms["y"] = y
    parms["training_frame"] = training_frame
    parms["validation_frame"] = validation_frame
    parms["offset_column"] = offset_column
    parms["fold_column"] = fold_column
    parms["weights_column"] = weights_column
    parms["max_runtime_secs"] = max_runtime_secs
    self._build_model(parms)
def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
          weights_column=None, validation_frame=None, max_runtime_secs=None, **params):
    """
    Train the H2O model.

    Parameters
    ----------
    x : int, str, list, set, None
        Name(s) or index(es) of the predictor columns. When None, all columns of the
        training frame except the response are used.
    y : str, int
        An index or a column name indicating the response column. When None and the
        model is supervised, the column "response" is used if the frame has one.
    training_frame : H2OFrame
        The H2OFrame having the columns indicated by x and y (as well as any additional
        columns specified by fold, offset, and weights).
    offset_column : str, optional
        The name or index of the column in training_frame that holds the offsets.
    fold_column : str, optional
        The name or index of the column in training_frame that holds the per-row fold
        assignments.
    weights_column : str, optional
        The name or index of the column in training_frame that holds the per-row weights.
    validation_frame : H2OFrame, optional
        H2OFrame with validation data to be scored on while training.
    max_runtime_secs : float
        Maximum allowed runtime in seconds for model training. Use 0 to disable.
    **params
        Accepted for call compatibility; not used by this method.

    Raises
    ------
    H2OValueError
        If y is provided for an autoencoder model.
    ValueError
        If y is a list or tuple with more than one element.
    """
    assert_is_type(training_frame, H2OFrame)
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    algo = self._compute_algo()
    parms = self._parms.copy()
    if "__class__" in parms:  # FIXME: hackt for PY3
        del parms["__class__"]
    is_auto_encoder = bool(parms.get("autoencoder"))
    is_supervised = not (is_auto_encoder or algo in {"pca", "svd", "kmeans", "glrm"})

    if y is None:
        # Fall back to the conventional response column only if the frame has one.
        if is_supervised and "response" in training_frame.names:
            y = "response"
    else:
        if is_auto_encoder:
            raise H2OValueError("y should not be provided for an autoencoder model")
        if isinstance(y, (list, tuple)):
            if len(y) == 1:
                parms["y"] = y[0]
            else:
                raise ValueError("y must be a single column reference")
        # The estimator type is only derived when the caller passed y explicitly.
        self._estimator_type = "classifier" if training_frame[y].isfactor() else "regressor"

    if x is None:
        # Default predictors: every column except the response.
        x = set(training_frame.names)
        if is_type(y, int):
            x -= {training_frame.names[y]}
        if is_type(y, str):
            x -= {y}
        x = list(x)

    parms["x"] = x
    parms["y"] = y
    parms["training_frame"] = training_frame
    parms["validation_frame"] = validation_frame
    parms["offset_column"] = offset_column
    parms["fold_column"] = fold_column
    parms["weights_column"] = weights_column
    parms["max_runtime_secs"] = max_runtime_secs
    self.build_model(parms)
def start(jar_path=None, nthreads=-1, enable_assertions=True, max_mem_size=None, min_mem_size=None,
          ice_root=None, port="54321+", extra_classpath=None, verbose=True):
    """
    Start new H2O server on the local machine.

    :param jar_path: Path to the h2o.jar executable. If not given, then we will search for h2o.jar in the
        locations returned by `._jar_paths()`.
    :param nthreads: Number of threads in the thread pool. This should be related to the number of CPUs used.
        -1 means use all CPUs on the host. A positive integer specifies the number of CPUs directly.
    :param enable_assertions: If True, pass `-ea` option to the JVM.
    :param max_mem_size: Maximum heap size (jvm option Xmx), in bytes.
    :param min_mem_size: Minimum heap size (jvm option Xms), in bytes.
    :param ice_root: A directory where H2O stores its temporary files. Default location is determined by
        tempfile.mkdtemp().
    :param port: Port where to start the new server. This could be either an integer, or a string of the form
        "DDDDD+", indicating that the server should start looking for an open port starting from DDDDD and up.
    :param extra_classpath: List of paths to libraries that should be included on the Java classpath.
    :param verbose: If True, then connection info will be printed to the stdout.

    :returns: a new H2OLocalServer instance
    :raises H2OValueError: if `min_mem_size` exceeds `max_mem_size`, or if `port` is a string that is
        neither all digits nor digits followed by "+".
    """
    assert_is_type(jar_path, None, str)
    assert_is_type(port, None, int, str)
    assert_is_type(nthreads, -1, BoundInt(1, 4096))
    assert_is_type(enable_assertions, bool)
    assert_is_type(min_mem_size, None, int)
    assert_is_type(max_mem_size, None, BoundInt(1 << 25))
    assert_is_type(ice_root, None, I(str, os.path.isdir))
    assert_is_type(extra_classpath, None, [str])
    if jar_path:
        assert_satisfies(jar_path, jar_path.endswith("h2o.jar"))

    if min_mem_size is not None and max_mem_size is not None and min_mem_size > max_mem_size:
        raise H2OValueError("`min_mem_size`=%d is larger than the `max_mem_size`=%d" % (min_mem_size, max_mem_size))
    if port is None:
        port = "54321+"
    baseport = None
    # TODO: get rid of this port gimmick and have 2 separate parameters.
    if is_type(port, str):
        if port.isdigit():
            port = int(port)
        else:
            # `endswith` (rather than `port[-1]`) so that an empty string is reported as a
            # malformed port instead of failing with an IndexError.
            if not (port.endswith("+") and port[:-1].isdigit()):
                raise H2OValueError("`port` should be of the form 'DDDD+', where D is a digit. Got: %s" % port)
            # "DDDD+": remember the starting port and pass 0 as the fixed port.
            baseport = int(port[:-1])
            port = 0

    hs = H2OLocalServer()
    hs._verbose = bool(verbose)
    hs._jar_path = hs._find_jar(jar_path)
    hs._extra_classpath = extra_classpath
    hs._ice_root = ice_root
    if not ice_root:
        # No directory supplied: create a temporary one and record it on the server object.
        hs._ice_root = tempfile.mkdtemp()
        hs._tempdir = hs._ice_root

    if verbose:
        print("Attempting to start a local H2O server...")
    hs._launch_server(port=port, baseport=baseport, nthreads=int(nthreads), ea=enable_assertions,
                      mmax=max_mem_size, mmin=min_mem_size)
    if verbose:
        print(" Server is running at %s://%s:%d" % (hs.scheme, hs.ip, hs.port))
    # Make sure the server is shut down when the interpreter exits.
    atexit.register(lambda: hs.shutdown())
    return hs
def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
          weights_column=None, validation_frame=None, **params):
    """
    Train the rulefit model.

    One random forest is trained per depth in ``[self.min_depth, self.max_depth]``; the leaf
    node assignments of all forests are bound into one frame, a lasso GLM (``alpha=1`` with
    lambda search) is fitted on it, and each GLM coefficient is converted into a rule.
    The results are stored in ``self.intercept`` and ``self.rule_importance``.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param training_frame: The H2OFrame having the columns indicated by x and y.
    :param offset_column: Accepted but not used by this implementation.
    :param fold_column: Accepted but not used by this implementation.
    :param weights_column: Accepted but not used by this implementation.
    :param validation_frame: Accepted but not used by this implementation.
    :param params: Accepted but not used by this implementation.

    :raises H2OValueError: if the response is categorical with more than two levels.

    :examples:

    >>> rulefit = H2ORuleFit()
    >>> training_data = h2o.import_file("smalldata/gbm_test/titanic.csv",
    ...                                 col_types = {'pclass': "enum", 'survived': "enum"})
    >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
    >>> rulefit.train(x=x,y="survived",training_frame=training_data)
    >>> rulefit
    """
    family = "gaussian"
    if training_frame.type(y) == "enum":
        if training_frame[y].unique().nrow > 2:
            raise H2OValueError("Multinomial not supported")
        family = "binomial"

    # Get paths from random forest models
    paths_frame = training_frame[y]
    depths = range(self.min_depth, self.max_depth + 1)
    rf_models = []
    for model_idx, depth in enumerate(depths):
        # Train random forest models
        # NOTE(review): every forest is created with the same model_id "rf.hex" -- confirm
        # that later models do not replace earlier ones on the backend.
        rf_model = H2ORandomForestEstimator(seed=self.seed, model_id="rf.hex", max_depth=depth)
        rf_model.train(y=y, x=x, training_frame=training_frame)
        rf_models.append(rf_model)

        paths = rf_model.predict_leaf_node_assignment(training_frame)
        # The loop variable must not be called `x`: on Python 2 a list-comprehension variable
        # leaks into the function scope and would replace the predictor list used above.
        prefix = "rf_" + str(model_idx) + "."
        paths.col_names = [prefix + col_name for col_name in paths.col_names]
        paths_frame = paths_frame.cbind(paths)

    # Extract important paths
    glm = H2OGeneralizedLinearEstimator(model_id="glm.hex", nfolds=self.nfolds, seed=self.seed,
                                        family=family, alpha=1, remove_collinear_columns=True,
                                        lambda_search=True)
    glm.train(y=y, training_frame=paths_frame)

    intercept, rule_importance = _get_glm_coeffs(glm)
    rule_importance = pd.DataFrame.from_dict(rule_importance, orient="index").reset_index()
    rule_importance.columns = ["variable", "coefficient"]

    # Convert paths to rules
    rules = []
    for variable in rule_importance.variable:
        # Strip the "rf_" prefix and the "T" markers (and, for binomial models, "C1.") so that
        # the name splits into exactly <model index>.<tree number>.<path>.
        stripped = variable.replace("rf_", "").replace("T", "")
        if family == "binomial":
            stripped = stripped.replace("C1.", "")
        model_num, tree_num, path = stripped.split(".")
        tree = H2OTree(rf_models[int(model_num)], int(tree_num) - 1)
        rules.append(_tree_traverser(tree.root_node, path))

    # Add rules and order by absolute coefficient
    rule_importance["rule"] = rules
    rule_importance["abs_coefficient"] = rule_importance["coefficient"].abs()
    # Keep, for every distinct rule, only the row with the largest absolute coefficient.
    rule_importance = rule_importance.loc[rule_importance.groupby(["rule"])["abs_coefficient"].idxmax()]
    rule_importance = rule_importance.sort_values(by="abs_coefficient", ascending=False)
    rule_importance = rule_importance.drop("abs_coefficient", axis=1)

    self.intercept = intercept
    self.rule_importance = rule_importance
def assert_true(cond, message):
    """Raise H2OValueError carrying ``message`` when ``cond`` is falsy; otherwise do nothing."""
    if cond:
        return
    raise H2OValueError(message)
def std_coef_plot(self, num_of_features=None, server=False):
    """
    Plot a GLM model's standardized coefficient magnitudes.

    The intercept is dropped, the remaining coefficients are sorted by absolute value and drawn
    as horizontal bars (largest at the top), blue for coefficients >= 0 and orange for negative
    ones.

    :param num_of_features: the number of features shown in the plot (default: all of them).
    :param server: passed to ``_get_matplotlib_pyplot``; when True the plot is built but
        ``plt.show()`` is not called.

    :returns: None.
    :raises H2OValueError: if the model is not a GLM.
    """
    assert_is_type(num_of_features, None, I(int, lambda x: x > 0))

    # check that model is a glm
    if self._model_json["algo"] != "glm":
        raise H2OValueError("This function is available for GLM models only")

    plt = _get_matplotlib_pyplot(server)
    if not plt:
        return

    # same colors as Flow
    positive_color = "#1F77B4"  # blue
    negative_color = "#FF7F0E"  # dark orange

    # drop intercept value then sort tuples by the coefficient's absolute value
    without_intercept = [tup for tup in self.coef_norm().items() if tup[0] != "Intercept"]
    norm_coef = sorted(without_intercept, key=lambda tup: abs(tup[1]), reverse=True)

    # positive (including zero) coefficients are blue, negative ones orange
    signage = [positive_color if coef >= 0 else negative_color for _, coef in norm_coef]
    feature_labels = [label for label, _ in norm_coef]
    # bar lengths
    val = [abs(coef) for _, coef in norm_coef]
    # bar centers on the y axis, flipped so that the largest bar appears at the top
    pos = range(len(feature_labels))[::-1]

    # check number of features, default is all the features
    if num_of_features is None:
        num_of_features = len(val)

    _, ax = plt.subplots(1, 1, figsize=(14, 10))
    if num_of_features == 1:
        # a single bar is drawn from scalars and gets a wider margin
        plt.barh(pos[0], val[0], align="center", height=0.8, color=signage[0], edgecolor="none")
        # the labels must be a list; a bare string would be treated as a sequence of
        # one-character labels
        plt.yticks(pos[0:1], feature_labels[0:1])
        ax.margins(y=0.5)
    else:
        plt.barh(pos[0:num_of_features], val[0:num_of_features], align="center", height=0.8,
                 color=signage[0:num_of_features], edgecolor="none")
        plt.yticks(pos[0:num_of_features], feature_labels[0:num_of_features])
        ax.margins(y=0.05)

    # Hide the right and top spines, color others grey
    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)
    ax.spines["bottom"].set_color("#7B7B7B")
    ax.spines["left"].set_color("#7B7B7B")
    # Only show ticks on the left and bottom spines
    ax.yaxis.set_ticks_position("left")
    ax.xaxis.set_ticks_position("bottom")

    # Generate custom fake lines that will be used as legend entries. Labels and markers are
    # built from the same ordered pairs so that a label can never be paired with the other
    # sign's color (iterating a set of colors gives no such guarantee).
    shown_colors = signage[0:num_of_features]
    legend_entries = [(label, color)
                      for label, color in (("Positive", positive_color), ("Negative", negative_color))
                      if color in shown_colors]
    if legend_entries:  # nothing to describe when no coefficient is plotted
        markers = [plt.Line2D([0, 0], [0, 0], color=color, marker="s", linestyle="")
                   for _, color in legend_entries]
        labels = [label for label, _ in legend_entries]
        lgnd = plt.legend(markers, labels, numpoints=1, loc="best", frameon=False, fontsize=13)
        for handle in lgnd.legendHandles:
            handle._legmarker.set_markersize(10)

    plt.yticks(pos[0:num_of_features], feature_labels[0:num_of_features])
    # Booleans, not the strings "off": a non-empty string is truthy wherever matplotlib does
    # not translate it, which would leave the minor ticks switched on.
    plt.tick_params(axis="x", which="minor", bottom=False, top=False, labelbottom=False)
    plt.title("Standardized Coef. Magnitudes: H2O GLM", fontsize=20)

    # show plot
    if not server:
        plt.show()
def get_best_model(self, algorithm=None, criterion=None):
    """
    Return the best model of an AutoML run, optionally restricted to one algorithm family
    and/or ranked by a chosen criterion.

    :param algorithm: One of "basemodel", "deeplearning", "drf", "gbm", "glm", "stackedensemble",
        "xgboost" (case-insensitive). "basemodel" selects every model that is not a stacked
        ensemble. If None, models of all algorithms are considered.
    :param criterion: Name of a leaderboard metric (case-insensitive) to rank by. If None, the
        leaderboard's own ordering is used. "deviance" is an alias for "mean_residual_deviance".
        Available criteria:

            - Regression metrics: deviance, rmse, mse, mae, rmsle
            - Binomial metrics: auc, logloss, aucpr, mean_per_class_error, rmse, mse
            - Multinomial metrics: mean_per_class_error, logloss, rmse, mse

        The following additional leaderboard columns can also be used as a criterion:

            - 'training_time_ms': training time of each model in milliseconds (doesn't include
              the training of cross validation models).
            - 'predict_time_per_row_ms': average prediction time of the model for a single row.

    :return: An H2OModel, or None if the leaderboard has no model of the requested family.
    :raises H2OValueError: if the algorithm is not supported or the criterion is not a
        leaderboard column.

    :examples:

    >>> # Set up an H2OAutoML object
    >>> aml = H2OAutoML(max_runtime_secs=30)
    >>> # Launch an AutoML run
    >>> aml.train(y=y, training_frame=train)
    >>> gbm = aml.get_best_model("gbm")
    """
    from h2o.exceptions import H2OValueError

    def _model_ids(board):
        # Model ids of a leaderboard, in row order.
        rows = board["model_id"].as_data_frame(use_pandas=False, header=False)
        return [row[0] for row in rows]

    supported_algorithms = ("basemodel", "deeplearning", "drf", "gbm", "glm", "stackedensemble", "xgboost")
    timing_columns = ("training_time_ms", "predict_time_per_row_ms")
    maximized_metrics = ["auc", "aucpr"]

    assert_is_type(algorithm, None, str)
    assert_is_type(criterion, None, str)

    if criterion is not None:
        criterion = criterion.lower()
    if criterion == "deviance":
        criterion = "mean_residual_deviance"

    if algorithm is not None:
        lowered = algorithm.lower()
        if lowered not in supported_algorithms:
            raise H2OValueError("Algorithm \"{}\" is not supported!".format(algorithm))
        algorithm = lowered

    # The timing columns are not part of the default leaderboard and must be requested.
    extra_cols = ["algo"] + ([criterion] if criterion in timing_columns else [])
    leaderboard = h2o.automl.get_leaderboard(self, extra_columns=extra_cols)

    if algorithm == "basemodel":
        leaderboard = leaderboard[leaderboard["algo"].tolower() != "stackedensemble", :]
    elif algorithm is not None:
        leaderboard = leaderboard[leaderboard["algo"].tolower() == algorithm, :]

    if leaderboard.nrow == 0:
        return None
    if criterion is None:
        return h2o.get_model(leaderboard[0, "model_id"])
    if criterion not in leaderboard.columns:
        raise H2OValueError("Criterion \"{}\" is not present in the leaderboard!".format(criterion))

    default_order = _model_ids(leaderboard)
    wants_ascending = criterion not in maximized_metrics
    ranked = leaderboard.sort(by=criterion, ascending=wants_ascending)
    # Every model tied for the best value is a candidate; ties are broken by the
    # leaderboard's default order.
    tied_for_best = _model_ids(ranked[ranked[criterion] == ranked[0, criterion]])
    candidates = [model_id for model_id in default_order if model_id in tied_for_best]
    return h2o.get_model(candidates[0])
def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
          weights_column=None, validation_frame=None, **params):
    """
    Train the model synchronously (i.e. do not return until the model finishes training).

    To train asynchronously call :meth:`start`.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column. A one-element list or
        tuple is accepted and unwrapped.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
    :param offset_column: The name or index of the column in training_frame that holds the offsets.
    :param fold_column: The name or index of the column in training_frame that holds the per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
    :param validation_frame: H2OFrame with validation data to be scored on while training.
    :param params: Additional parameters; they override every other entry of the same name.

    :raises ValueError: if training_frame is missing or y holds more than one column reference.
    :raises H2OValueError: if an entry of x does not refer to a column of the training frame.
    """
    # Must stay the first statement: it captures exactly the call arguments.
    algo_params = locals()
    parms = self._parms.copy()
    parms.update({k: v for k, v in algo_params.items() if k not in ["self", "params", "algo_params", "parms"]})
    # dictionaries have special handling in grid search, avoid the implicit conversion
    parms["search_criteria"] = None if self.search_criteria is None else str(self.search_criteria)
    parms["export_checkpoints_dir"] = self.export_checkpoints_dir
    parms["parallelism"] = self._parallelism
    parms["hyper_parameters"] = None if self.hyper_params is None else str(self.hyper_params)  # unique to grid search
    # the wrapped model's explicitly set parameters take precedence over the values above
    parms.update({k: v for k, v in list(self.model._parms.items()) if v is not None})  # unique to grid search
    parms.update(params)
    if '__class__' in parms:  # FIXME: hackt for PY3
        del parms['__class__']

    if training_frame is None:
        raise ValueError("Missing training_frame")

    if y is not None and isinstance(y, (list, tuple)):
        if len(y) != 1:
            raise ValueError('y must be a single column reference')
        # Unwrap the local as well: the predictor set below is computed with `{y}`, which
        # fails with "unhashable type" if y is still a list.
        y = y[0]
        parms["y"] = y

    if x is None:
        # Default predictors: everything but the response (indices when y is an index,
        # names otherwise).
        if isinstance(y, int):
            xset = set(range(training_frame.ncols)) - {y}
        else:
            xset = set(training_frame.names) - {y}
    else:
        xset = set()
        if is_type(x, int, str):
            x = [x]
        for xi in x:
            if is_type(xi, int):
                if not (-training_frame.ncols <= xi < training_frame.ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % xi)
                xset.add(training_frame.names[xi])
            else:
                if xi not in training_frame.names:
                    raise H2OValueError("Column %s not in the training frame" % xi)
                xset.add(xi)
    x = list(xset)
    parms["x"] = x
    self.build_model(parms)
def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
          weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
          model_id=None):
    """
    Train the H2O model.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
    :param offset_column: The name or index of the column in training_frame that holds the offsets.
    :param fold_column: The name or index of the column in training_frame that holds the per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
    :param validation_frame: H2OFrame with validation data to be scored on while training.
    :param float max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
    :param ignored_columns: Names or indices of columns to leave out of the predictors. Cannot be
        combined with x.
    :param str model_id: If given, overrides the model_id stored in the estimator's parameters.

    :raises H2OValueError: if a referenced column is not in the training frame, if y is given for an
        unsupervised model, or if both x and ignored_columns are given.
    """
    assert_is_type(training_frame, H2OFrame)
    assert_is_type(validation_frame, None, H2OFrame)
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    assert_is_type(ignored_columns, None, [str, int], {str, int})
    assert_is_type(offset_column, None, int, str)
    assert_is_type(fold_column, None, int, str)
    assert_is_type(weights_column, None, int, str)
    assert_is_type(max_runtime_secs, None, numeric)
    assert_is_type(model_id, None, str)

    algo = self.algo
    parms = self._parms.copy()
    if "__class__" in parms:  # FIXME: hackt for PY3
        del parms["__class__"]
    is_auto_encoder = bool(parms.get("autoencoder"))
    is_supervised = not (is_auto_encoder or algo in {"pca", "svd", "kmeans", "glrm", "word2vec"})
    ncols = training_frame.ncols
    names = training_frame.names

    # Step 1: resolve the response and the predictors to column names.
    if is_supervised:
        if y is None:
            y = "response"
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)
        self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
    elif y is not None:
        raise H2OValueError("y should not be provided for an unsupervised model")
    assert_is_type(y, str, None)

    ignored_columns_set = set()
    if ignored_columns is not None:
        if x is not None:
            raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
        for ic in ignored_columns:
            if is_type(ic, int):
                if not (-ncols <= ic < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % ic)
                ignored_columns_set.add(names[ic])
            else:
                if ic not in names:
                    raise H2OValueError("Column %s not in the training frame" % ic)
                ignored_columns_set.add(ic)

    if x is None:
        xset = set(names) - {y} - ignored_columns_set
    else:
        xset = set()
        if is_type(x, int, str):
            x = [x]
        for xi in x:
            if is_type(xi, int):
                if not (-ncols <= xi < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % xi)
                xset.add(names[xi])
            else:
                if xi not in names:
                    raise H2OValueError("Column %s not in the training frame" % xi)
                xset.add(xi)
    x = list(xset)

    parms["offset_column"] = offset_column
    parms["fold_column"] = fold_column
    parms["weights_column"] = weights_column
    parms["max_runtime_secs"] = max_runtime_secs
    # Overwrites the model_id parameter only if model_id is passed
    if model_id is not None:
        parms["model_id"] = model_id

    # Step 2: consistency checks between the algorithm kind and the response.
    is_auto_encoder = "autoencoder" in parms and parms["autoencoder"]
    is_unsupervised = is_auto_encoder or self.algo in {"pca", "svd", "kmeans", "glrm", "word2vec"}
    if is_auto_encoder and y is not None:
        raise ValueError("y should not be specified for autoencoder.")
    if not is_unsupervised and y is None:
        raise ValueError("Missing response")

    # Step 3: assemble the request parameters.
    parms["training_frame"] = training_frame
    if validation_frame is not None:
        parms["validation_frame"] = validation_frame
    if is_type(y, int):
        y = training_frame.names[y]
    if y is not None:
        parms["response_column"] = y
    if not isinstance(x, (list, tuple)):
        x = [x]
    # `x` may be empty (e.g. every non-response column was ignored); only look at the first
    # element when there is one.
    if x and is_type(x[0], int):
        x = [training_frame.names[i] for i in x]
    offset = parms["offset_column"]
    folds = parms["fold_column"]
    weights = parms["weights_column"]
    # The request carries the complement of the used columns: everything that is not a
    # predictor, the response, or one of the special columns.
    ignored_columns = list(set(training_frame.names) - set(x + [y, offset, folds, weights]))
    parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
    parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None
                             else [quoted(col) for col in parms["interactions"]])
    parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
    rest_ver = parms.pop("_rest_version") if "_rest_version" in parms else 3

    model = H2OJob(h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms),
                   job_type=(self.algo + " Model Build"))

    if self._future:
        # Asynchronous mode: keep the job and return without waiting for it.
        self._job = model
        self._rest_version = rest_ver
        return

    model.poll()
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
    self._resolve_model(model.dest_key, model_json)
def train_segments(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
                   weights_column=None, validation_frame=None, max_runtime_secs=None,
                   ignored_columns=None, segments=None, segment_models_id=None, parallelism=1,
                   verbose=False):
    """
    Build one H2O model per segment (subpopulation) of the training dataset.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
    :param offset_column: The name or index of the column in training_frame that holds the offsets.
    :param fold_column: The name or index of the column in training_frame that holds the per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
    :param validation_frame: H2OFrame with validation data to be scored on while training.
    :param float max_runtime_secs: Maximum allowed runtime in seconds for each model training. Use 0 to
        disable. Regardless of this setting a model is built for each input segment; the limit only
        applies to the individual model trainings.
    :param segments: Either a list of columns to segment by (the training and validation datasets are
        grouped by these columns and a separate model is trained for each group of rows), or an H2OFrame
        holding an explicit enumeration of the segments to build models for. Required.
    :param segment_models_id: Identifier for the returned collection of Segment Models. If not specified
        it will be automatically generated.
    :param parallelism: Level of parallelism of the bulk segment models building, it is the maximum number
        of models each H2O node will be building in parallel.
    :param bool verbose: Enable to print additional information during model building. Defaults to False.

    :returns: an H2OSegmentModels object for the finished job's destination key.
    :raises H2OValueError: if segments is not specified.

    :examples:

    >>> response = "survived"
    >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
    >>> titanic[response] = titanic[response].asfactor()
    >>> predictors = ["survived","name","sex","age","sibsp","parch","ticket","fare","cabin"]
    >>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> titanic_gbm = H2OGradientBoostingEstimator(seed=1234)
    >>> titanic_models = titanic_gbm.train_segments(segments=["pclass"],
    ...                                             x=predictors,
    ...                                             y=response,
    ...                                             training_frame=train,
    ...                                             validation_frame=valid)
    >>> titanic_models.as_frame()
    """
    assert_is_type(segments, None, H2OFrame, [str])
    assert_is_type(verbose, bool)
    assert_is_type(segment_models_id, None, str)
    assert_is_type(parallelism, int)

    if segments is None:
        raise H2OValueError("Parameter segments was not specified. Please provide either a list of columns to "
                            "segment-by or an explicit list of segments to build models for.")

    # The regular training parameters are assembled by the shared helper; model_id is left
    # unset for segment builds.
    builder_args = dict(x=x, y=y, training_frame=training_frame, offset_column=offset_column,
                        fold_column=fold_column, weights_column=weights_column,
                        validation_frame=validation_frame, max_runtime_secs=max_runtime_secs,
                        ignored_columns=ignored_columns, model_id=None, verbose=verbose)
    parms = self._make_parms(**builder_args)

    # An H2OFrame is an explicit enumeration of segments; anything else is a list of columns.
    if not isinstance(segments, H2OFrame):
        parms["segment_columns"] = segments
    else:
        parms["segments"] = H2OEstimator._keyify(segments)
    if segment_models_id:
        parms["segment_models_id"] = segment_models_id
    parms["parallelism"] = parallelism

    rest_ver = self._get_rest_version(parms)
    endpoint = "POST /%d/SegmentModelsBuilders/%s" % (rest_ver, self.algo)
    response = h2o.api(endpoint, data=parms)
    build_job = H2OJob(response, job_type=(self.algo + " Segment Models Build"))
    build_job.poll()
    return H2OSegmentModels(build_job.dest_key)
def _make_parms(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
                weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
                model_id=None, verbose=False, extend_parms_fn=None):
    """
    Validate the training arguments and assemble the parameter dictionary for a model build.

    The result starts as a copy of ``self._parms`` and is then overridden by the arguments given here.
    When a training frame is passed, ``y``, ``x`` and ``ignored_columns`` are resolved against its column
    names, and the final ``ignored_columns`` entry is derived as every frame column that is not a
    predictor, the response, or the offset/fold/weights column.

    :param x: Predictor column(s): a name, an index, or a list/set of names and indices.
    :param y: Response column name or index. Ignored (reset to None) when ``self.supervised_learning``
        is falsy.
    :param training_frame: Frame to train on; validated through ``H2OFrame._validate``. When omitted,
        ``self.training_frame`` is used if the estimator has one.
    :param offset_column: Name or index of the offset column.
    :param fold_column: Name or index of the fold-assignment column.
    :param weights_column: Name or index of the weights column.
    :param validation_frame: Validation frame; validated through ``H2OFrame._validate``.
    :param max_runtime_secs: When not None, overrides ``max_runtime_secs`` in the parameters.
    :param ignored_columns: Columns to exclude; cannot be combined with ``x``.
    :param model_id: When not None, overrides ``model_id`` in the parameters.
    :param verbose: Only accepted when ``self._options_`` has a truthy ``'verbose'`` entry.
    :param extend_parms_fn: Optional function called with the parameter dictionary just before its
        values are passed through ``H2OEstimator._keyify``.
    :returns: dict of parameters with every value passed through ``H2OEstimator._keyify``.
    :raises H2OValueError: for an unknown column, for ``x`` combined with ``ignored_columns``, for an
        unsupported ``verbose`` request, or when ``stopping_metric`` contains "r2".
    """
    has_default_training_frame = hasattr(self, 'training_frame') and self.training_frame is not None
    # The frame is only mandatory when the algorithm requires one and the estimator has no default.
    training_frame = H2OFrame._validate(training_frame, 'training_frame',
                                        required=self._options_.get('requires_training_frame', True) and not has_default_training_frame)
    validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    assert_is_type(ignored_columns, None, [str, int], {str, int})
    assert_is_type(offset_column, None, int, str)
    assert_is_type(fold_column, None, int, str)
    assert_is_type(weights_column, None, int, str)
    assert_is_type(max_runtime_secs, None, numeric)
    assert_is_type(model_id, None, str)
    assert_is_type(verbose, bool)
    assert_is_type(extend_parms_fn, None, FunctionType)

    # True when the caller passed a frame explicitly (as opposed to relying on self.training_frame).
    override_default_training_frame = training_frame is not None
    if not override_default_training_frame:
        # Column-level arguments are rejected when no frame was passed alongside them.
        self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)
        training_frame = self.training_frame if has_default_training_frame else None

    if verbose and not self._options_.get('verbose', False):
        raise H2OValueError("Verbose mode is not available for %s" % self.__class__.__name__)

    parms = self._parms.copy()
    # Empty fallbacks keep the column checks below well-defined when there is no frame at all.
    names = training_frame.names if training_frame is not None else []
    ncols = training_frame.ncols if training_frame is not None else 0
    types = training_frame.types if training_frame is not None else {}

    if self.supervised_learning:
        if y is None: y = "response"
        if is_type(y, int):
            # Negative indices are accepted, mirroring Python list indexing.
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)
        self._estimator_type = "classifier" if types[y] == "enum" else "regressor"
    else:
        # If `y` is provided for an unsupervised model we'll simply ignore
        # it. This way an unsupervised model can be used as a step in
        # sklearn's pipeline.
        y = None
        self._estimator_type = "unsupervised"

    if override_default_training_frame:
        assert_is_type(y, str, None)
        ignored_columns_set = set()
        # Fall back to ignored_columns stored on the estimator when none was passed to this call.
        if ignored_columns is None and "ignored_columns" in parms:
            ignored_columns = parms['ignored_columns']
        if ignored_columns is not None:
            if x is not None:
                raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
            for ic in ignored_columns:
                if is_type(ic, int):
                    if not (-ncols <= ic < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % ic)
                    ignored_columns_set.add(names[ic])
                else:
                    if ic not in names:
                        raise H2OValueError("Column %s not in the training frame" % ic)
                    ignored_columns_set.add(ic)
        if x is None:
            # Default predictors: every column except the response and the ignored ones.
            xset = set(names) - {y} - ignored_columns_set
        else:
            xset = set()
            if is_type(x, int, str): x = [x]
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
        x = list(xset)
        self._check_and_save_parm(parms, "offset_column", offset_column)
        self._check_and_save_parm(parms, "weights_column", weights_column)
        self._check_and_save_parm(parms, "fold_column", fold_column)

    if max_runtime_secs is not None: parms["max_runtime_secs"] = max_runtime_secs

    # Overwrites the model_id parameter only if model_id is passed
    if model_id is not None:
        parms["model_id"] = model_id

    if override_default_training_frame:
        parms["training_frame"] = training_frame
        # Read back the special columns so they can be excluded from ignored_columns below.
        offset = parms["offset_column"]
        folds = parms["fold_column"]
        weights = parms["weights_column"]

    if validation_frame is not None: parms["validation_frame"] = validation_frame
    if is_type(y, int): y = names[y]
    if y is not None: parms["response_column"] = y
    if not isinstance(x, (list, tuple)): x = [x]
    # The length check avoids indexing into an empty predictor list.
    if len(x) > 0 and is_type(x[0], int):
        x = [names[i] for i in x]
    if override_default_training_frame:
        # Anything in the frame that is not used in some role is reported as ignored.
        ignored_columns = list(set(names) - set(x + [y, offset, folds, weights]))
        parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
    parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None
                             else [quoted(col) for col in parms["interactions"]])
    parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None
                                  else [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])

    # internal hook allowing subclasses to extend train parms
    if extend_parms_fn is not None:
        extend_parms_fn(parms)

    parms = {k: H2OEstimator._keyify(v) for k, v in parms.items()}
    # `or []` keeps the membership test valid when stopping_metric is missing or None.
    if "r2" in (parms.get('stopping_metric') or []):
        raise H2OValueError("r2 cannot be used as an early stopping_metric yet.  Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
    return parms
def start(jar_path=None, nthreads=-1, enable_assertions=True, max_mem_size=None, min_mem_size=None,
          ice_root=None, log_dir=None, log_level=None, max_log_file_size=None, port="54321+", name=None,
          extra_classpath=None, verbose=True, jvm_custom_args=None, bind_to_localhost=True):
    """
    Start new H2O server on the local machine.

    :param jar_path: Path to the h2o.jar executable. If not given, then we will search for h2o.jar in the
        locations returned by `._jar_paths()`.
    :param nthreads: Number of threads in the thread pool. This should be related to the number of CPUs used.
        -1 means use all CPUs on the host. A positive integer specifies the number of CPUs directly.
    :param enable_assertions: If True, pass `-ea` option to the JVM.
    :param max_mem_size: Maximum heap size (jvm option Xmx), in bytes.
    :param min_mem_size: Minimum heap size (jvm option Xms), in bytes.
    :param ice_root: A directory where H2O stores its temporary files. Default location is determined by
        tempfile.mkdtemp().
    :param log_dir: Directory for H2O logs to be stored if a new instance is started. Default directory is
        determined by H2O internally.
    :param log_level: The logger level for H2O if a new instance is started.
    :param max_log_file_size: Maximum size of INFO and DEBUG log files. The file is rolled over after a
        specified size has been reached. (The default is 3MB. Minimum is 1MB and maximum is 99999MB)
    :param port: Port where to start the new server. This could be either an integer, or a string of the form
        "DDDDD+", indicating that the server should start looking for an open port starting from DDDDD and up.
    :param name: name of the h2o cluster to be started
    :param extra_classpath: List of paths to libraries that should be included on the Java classpath.
    :param verbose: If True, then connection info will be printed to the stdout.
    :param jvm_custom_args: Custom, user-defined arguments for the JVM H2O is instantiated in
    :param bind_to_localhost: A flag indicating whether access to the H2O instance should be restricted to the
        local machine (default) or if it can be reached from other computers on the network.
        Only applicable when H2O is started from the Python client.

    :returns: a new H2OLocalServer instance
    :raises H2OValueError: if `min_mem_size` exceeds `max_mem_size`, or if `port` is a string that is
        neither all digits nor of the form "DDDD+" (this includes the empty string).
    """
    assert_is_type(jar_path, None, str)
    assert_is_type(port, None, int, str)
    assert_is_type(name, None, str)
    assert_is_type(nthreads, -1, BoundInt(1, 4096))
    assert_is_type(enable_assertions, bool)
    assert_is_type(min_mem_size, None, int)
    assert_is_type(max_mem_size, None, BoundInt(1 << 25))
    assert_is_type(log_dir, str, None)
    assert_is_type(log_level, str, None)
    assert_satisfies(log_level, log_level in [None, "TRACE", "DEBUG", "INFO", "WARN", "ERRR", "FATA"])
    assert_is_type(max_log_file_size, str, None)
    assert_is_type(ice_root, None, I(str, os.path.isdir))
    assert_is_type(extra_classpath, None, [str])
    assert_is_type(jvm_custom_args, list, None)
    assert_is_type(bind_to_localhost, bool)
    if jar_path:
        assert_satisfies(jar_path, jar_path.endswith("h2o.jar"))

    if min_mem_size is not None and max_mem_size is not None and min_mem_size > max_mem_size:
        raise H2OValueError("`min_mem_size`=%d is larger than the `max_mem_size`=%d" % (min_mem_size, max_mem_size))
    if port is None:
        port = "54321+"
    baseport = None
    # TODO: get rid of this port gimmick and have 2 separate parameters.
    if is_type(port, str):
        if port.isdigit():
            port = int(port)
        else:
            # `endswith` (rather than `port[-1]`) so that an empty string is reported as a malformed
            # port instead of escaping as a bare IndexError.
            if not (port.endswith("+") and port[:-1].isdigit()):
                raise H2OValueError("`port` should be of the form 'DDDD+', where D is a digit. Got: %s" % port)
            # port=0 together with a baseport asks the launcher to search upwards from baseport.
            baseport = int(port[:-1])
            port = 0

    hs = H2OLocalServer()
    hs._verbose = bool(verbose)
    hs._jar_path = hs._find_jar(jar_path)
    hs._extra_classpath = extra_classpath
    hs._ice_root = ice_root
    hs._name = name
    if not ice_root:
        # No directory supplied: create one and remember it as the server's own temp dir.
        hs._ice_root = tempfile.mkdtemp()
        hs._tempdir = hs._ice_root

    if verbose:
        print("Attempting to start a local H2O server...")
    hs._launch_server(port=port, baseport=baseport, nthreads=int(nthreads), ea=enable_assertions,
                      mmax=max_mem_size, mmin=min_mem_size, jvm_custom_args=jvm_custom_args,
                      bind_to_localhost=bind_to_localhost, log_dir=log_dir, log_level=log_level,
                      max_log_file_size=max_log_file_size)
    if verbose:
        print("  Server is running at %s://%s:%d" % (hs.scheme, hs.ip, hs.port))
    # Make sure the server is shut down when the Python interpreter exits.
    atexit.register(lambda: hs.shutdown())
    return hs
def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
          weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
          model_id=None, verbose=False):
    """
    Train the H2O model.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
    :param offset_column: The name or index of the column in training_frame that holds the offsets.
    :param fold_column: The name or index of the column in training_frame that holds the per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
    :param validation_frame: H2OFrame with validation data to be scored on while training.
    :param float max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
    :param ignored_columns: Columns to exclude from the predictors; cannot be combined with ``x``.
    :param model_id: When given, overrides the ``model_id`` stored in the estimator's parameters.
    :param bool verbose: Print scoring history to stdout. Defaults to False.

    :raises H2OValueError: for a missing required training frame, an unknown column, ``x`` combined with
        ``ignored_columns``, an unsupported ``verbose`` request, or an "r2" stopping metric.
    """
    assert_is_type(training_frame, None, H2OFrame)
    assert_is_type(validation_frame, None, H2OFrame)
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    assert_is_type(ignored_columns, None, [str, int], {str, int})
    assert_is_type(offset_column, None, int, str)
    assert_is_type(fold_column, None, int, str)
    assert_is_type(weights_column, None, int, str)
    assert_is_type(max_runtime_secs, None, numeric)
    assert_is_type(model_id, None, str)
    assert_is_type(verbose, bool)

    if self._requires_training_frame() and training_frame is None:
        # The algo name must be %-interpolated; passing it as a second argument leaves "%s" in the text.
        raise H2OValueError("Training frame required for %s algorithm, but none was given." % self.algo)

    has_training_frame = training_frame is not None
    if not has_training_frame:
        # Column-level arguments are rejected when there is no frame to resolve them against.
        self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)

    algo = self.algo
    if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
        raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")
    parms = self._parms.copy()
    if "__class__" in parms:  # FIXME: hack for PY3
        del parms["__class__"]
    is_auto_encoder = bool(parms.get("autoencoder"))
    is_supervised = not (is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec"})
    if has_training_frame:
        names = training_frame.names
        ncols = training_frame.ncols

    if is_supervised:
        if y is None:
            y = "response"
        if is_type(y, int):
            # Negative indices are accepted, mirroring Python list indexing.
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)
        self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
    else:
        # If `y` is provided for an unsupervised model we'll simply ignore
        # it. This way an unsupervised model can be used as a step in
        # sklearn's pipeline.
        y = None

    if has_training_frame:
        assert_is_type(y, str, None)
        ignored_columns_set = set()
        if ignored_columns is not None:
            if x is not None:
                raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
            for ic in ignored_columns:
                if is_type(ic, int):
                    if not (-ncols <= ic < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % ic)
                    ignored_columns_set.add(names[ic])
                else:
                    if ic not in names:
                        raise H2OValueError("Column %s not in the training frame" % ic)
                    ignored_columns_set.add(ic)
        if x is None:
            # Default predictors: every column except the response and the ignored ones.
            xset = set(names) - {y} - ignored_columns_set
        else:
            xset = set()
            if is_type(x, int, str):
                x = [x]
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
        x = list(xset)

    parms["offset_column"] = offset_column
    parms["fold_column"] = fold_column
    parms["weights_column"] = weights_column

    if max_runtime_secs is not None:
        parms["max_runtime_secs"] = max_runtime_secs

    # Overwrites the model_id parameter only if model_id is passed
    if model_id is not None:
        parms["model_id"] = model_id

    # Step 2: sanity checks on the response
    if is_auto_encoder and y is not None:
        raise ValueError("y should not be specified for autoencoder.")
    if is_supervised and y is None:
        raise ValueError("Missing response")

    # Step 3: assemble the request parameters
    if has_training_frame:
        parms["training_frame"] = training_frame
        offset = parms["offset_column"]
        folds = parms["fold_column"]
        weights = parms["weights_column"]

    if validation_frame is not None:
        parms["validation_frame"] = validation_frame
    if is_type(y, int):
        y = training_frame.names[y]
    if y is not None:
        parms["response_column"] = y
    if not isinstance(x, (list, tuple)):
        x = [x]
    # Guard the x[0] probe: the predictor list is empty when the frame holds nothing but the response.
    if x and is_type(x[0], int):
        x = [training_frame.names[i] for i in x]
    if has_training_frame:
        # Anything in the frame that is not used in some role is reported as ignored.
        ignored_columns = list(set(training_frame.names) - set(x + [y, offset, folds, weights]))
        parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
    parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None
                             else [quoted(col) for col in parms["interactions"]])
    parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None
                                  else [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])

    parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
    # `or []` keeps the membership test valid when stopping_metric is present but None.
    if "r2" in (parms.get("stopping_metric") or []):
        raise H2OValueError("r2 cannot be used as an early stopping_metric yet.  Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
    rest_ver = parms.pop("_rest_version") if "_rest_version" in parms else 3

    model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
    model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

    if self._future:
        # Asynchronous mode: remember the job and return without waiting for it.
        self._job = model
        self._rest_version = rest_ver
        return

    model.poll(verbose_model_scoring_history=verbose)
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
    self._resolve_model(model.dest_key, model_json)
def _process_response(response, save_to): """ Given a response object, prepare it to be handed over to the external caller. Preparation steps include: * detect if the response has error status, and convert it to an appropriate exception; * detect Content-Type, and based on that either parse the response as JSON or return as plain text. """ status_code = response.status_code if status_code == 200 and save_to: if save_to.startswith("~"): save_to = os.path.expanduser(save_to) if os.path.isdir(save_to) or save_to.endswith(os.path.sep): dirname = os.path.join(os.path.abspath(save_to), '') filename = H2OConnection._find_file_name(response) else: dirname, filename = os.path.split(os.path.abspath(save_to)) fullname = os.path.join(dirname, filename) try: if not os.path.exists(dirname): os.makedirs(dirname) with open(fullname, "wb") as f: for chunk in response.iter_content(chunk_size=65536): if chunk: # Empty chunks may occasionally happen f.write(chunk) except OSError as e: raise H2OValueError("Cannot write to file %s: %s" % (fullname, e)) return fullname content_type = response.headers.get("Content-Type", "") if ";" in content_type: # Remove a ";charset=..." part content_type = content_type[:content_type.index(";")] # this is needed so that response.text() works correctly response.encoding = response.headers.get("Character-Encoding", response.encoding) # Auto-detect response type by its content-type. Decode JSON, all other responses pass as-is. 
if content_type == "application/json": try: data = response.json(object_pairs_hook=H2OResponse) except (JSONDecodeError, requests.exceptions.ContentDecodingError) as e: raise H2OServerError("Malformed JSON from server (%s):\n%s" % (str(e), response.text)) else: data = response.text # Success (200 = "Ok", 201 = "Created", 202 = "Accepted", 204 = "No Content") if status_code in {200, 201, 202, 204}: return data # Client errors (400 = "Bad Request", 404 = "Not Found", 412 = "Precondition Failed") if status_code in {400, 404, 412} and isinstance(data, H2OErrorV3): data.show_stacktrace = False raise H2OResponseError(data) # Server errors (notably 500 = "Server Error") # Note that it is possible to receive valid H2OErrorV3 object in this case, however it merely means the server # did not provide the correct status code. raise H2OServerError("HTTP %d %s:\n%s" % (status_code, response.reason, data))
def train(self, x=None, y=None, training_frame=None, fold_column=None,
          weights_column=None, validation_frame=None, leaderboard_frame=None):
    """
    Begins an AutoML task, a background task that automatically builds a number of models
    with various algorithms and tracks their performance in a leaderboard. At any point
    in the process you may use H2O's performance or prediction functions on the resulting
    models.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold_column or weights_column).
    :param fold_column: The name or index of the column in training_frame that holds per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds per-row weights.
    :param validation_frame: H2OFrame with validation data to be scored on while training. Optional.
        This frame is used early stopping of individual models and early stopping of the grid searches
        (unless max_models or max_runtime_secs overrides metric-based early stopping).
    :param leaderboard_frame: H2OFrame with test data for scoring the leaderboard.  This is optional and
        if this is set to None (the default), then cross-validation metrics will be used to generate the leaderboard
        rankings instead.

    :returns: None. The finished run is loaded into this object via ``self._fetch()``; if the back end
        does not return a job, the response is printed and the method returns early.
    :raises ValueError: if ``training_frame`` or ``y`` is None.
    :raises H2OValueError: if ``y`` or an entry of ``x`` is not a column of the training frame.

    :examples:
    >>> # Set up an H2OAutoML object
    >>> aml = H2OAutoML(max_runtime_secs=30)
    >>> # Launch an AutoML run
    >>> aml.train(y=y, training_frame=train)
    """
    # The frame must be validated before its attributes are read; otherwise a missing frame
    # surfaces as an AttributeError on None instead of the intended message.
    if training_frame is None:
        raise ValueError('The training frame is not set!')
    assert_is_type(training_frame, H2OFrame)
    ncols = training_frame.ncols
    names = training_frame.names

    # Set project name if None
    if self.project_name is None:
        self.project_name = "automl_" + training_frame.frame_id
        self.build_control["project_name"] = self.project_name

    # Minimal required arguments are training_frame and y (response)
    if y is None:
        raise ValueError('The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.')
    assert_is_type(y, int, str)
    if is_type(y, int):
        # Negative indices are accepted, mirroring Python list indexing.
        if not (-ncols <= y < ncols):
            raise H2OValueError("Column %d does not exist in the training frame" % y)
        y = names[y]
    else:
        if y not in names:
            raise H2OValueError("Column %s does not exist in the training frame" % y)

    input_spec = {
        'response_column': y,
        'training_frame': training_frame.frame_id,
    }

    if fold_column is not None:
        assert_is_type(fold_column, int, str)
        input_spec['fold_column'] = fold_column

    if weights_column is not None:
        assert_is_type(weights_column, int, str)
        input_spec['weights_column'] = weights_column

    if validation_frame is not None:
        # Check the frame actually being sent (the original re-checked training_frame here).
        assert_is_type(validation_frame, H2OFrame)
        input_spec['validation_frame'] = validation_frame.frame_id

    if leaderboard_frame is not None:
        assert_is_type(leaderboard_frame, H2OFrame)
        input_spec['leaderboard_frame'] = leaderboard_frame.frame_id

    if x is not None:
        assert_is_type(x, list)
        xset = set()
        for xi in x:
            if is_type(xi, int):
                if not (-ncols <= xi < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % xi)
                xset.add(names[xi])
            else:
                if xi not in names:
                    raise H2OValueError("Column %s not in the training frame" % xi)
                xset.add(xi)
        x = list(xset)
        ignored_columns = set(names) - {y} - set(x)
        # `discard`, not `remove`: the fold/weights column may legitimately be absent from the
        # ignored set already (e.g. when it is also listed in x), which must not raise KeyError.
        if fold_column is not None:
            ignored_columns.discard(fold_column)
        if weights_column is not None:
            ignored_columns.discard(weights_column)
        input_spec['ignored_columns'] = list(ignored_columns)

    automl_build_params = dict(input_spec=input_spec)

    # NOTE: if the user hasn't specified some block of parameters don't send them!
    # This lets the back end use the defaults.
    automl_build_params['build_control'] = self.build_control
    automl_build_params['build_models'] = self.build_models

    resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
    if 'job' not in resp:
        print("Exception from the back end: ")
        print(resp)
        return

    self._job = H2OJob(resp['job'], "AutoML")
    self._job.poll()
    self._fetch()
def _verify_training_frame_params(self, *args): for param in args: if param is not None: raise H2OValueError("No training frame defined, yet the parameter %d is has been specified.", param)
def train(self, x=None, y=None, training_frame=None):
    """
    Train the rulefit model.

    One tree model is trained per rule length in ``[min_rule_len, max_rule_len]``; the leaf-node
    assignments of those models are bound to the response column and a GLM is fitted on the result.
    The fitted pieces are stored on ``self`` as ``intercept``, ``rule_importance``, ``glm`` and
    ``tree_models``.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).

    :raises H2OValueError: if the response is categorical with more than two levels.

    :examples:
    >>> rulefit = H2ORuleFit()
    >>> training_data = h2o.import_file("smalldata/gbm_test/titanic.csv",
    ...                                  col_types = {'pclass': "enum", 'survived': "enum"})
    >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
    >>> rulefit.train(x=x,y="survived",training_frame=training_data)
    >>> rulefit
    """
    # Work on a copy: popping "family" from self.glm_params would make a second train() call
    # silently fall back to the default family.
    glm_params = dict(self.glm_params)
    # "family" is always passed explicitly below, so it must never also arrive via **glm_params.
    requested_family = glm_params.pop("family", None)

    if training_frame.type(y) == "enum":
        if training_frame[y].unique().nrow > 2:
            raise H2OValueError("multinomial use cases not yet supported")
        # A two-level categorical response is always modelled as binomial.
        family = "binomial"
    else:
        family = requested_family if requested_family is not None else "gaussian"

    # Get paths from random forest models
    paths_frame = training_frame[y]
    tree_models = dict()
    for model_idx, depth in enumerate(range(self.min_rule_len, self.max_rule_len + 1)):
        # Train tree models
        tree_model = _tree_model(self.algorithm, depth, self.seed, model_idx, self.tree_params)
        tree_model.train(y=y, x=x, training_frame=training_frame)
        tree_models[model_idx] = tree_model

        paths = tree_model.predict_leaf_node_assignment(training_frame)
        # The loop variable is deliberately not called `x`: under Python 2 a list comprehension
        # leaks its variable and would overwrite the predictor list used by the next iteration.
        paths.col_names = ["tree_{0}.{1}".format(str(model_idx), col) for col in paths.col_names]
        paths_frame = paths_frame.cbind(paths)

    if self.max_num_rules:
        # Train GLM with chosen lambda
        glm = H2OGeneralizedLinearEstimator(model_id="glm.hex",
                                            seed=self.seed,
                                            family=family,
                                            alpha=1,
                                            max_active_predictors=self.max_num_rules + 1,
                                            **glm_params)
        glm.train(y=y, training_frame=paths_frame)
    else:
        # Get optimal lambda
        glm = H2OGeneralizedLinearEstimator(model_id="glm.hex",
                                            nfolds=self.nfolds,
                                            seed=self.seed,
                                            family=family,
                                            alpha=1,
                                            lambda_search=True,
                                            **glm_params)
        glm.train(y=y, training_frame=paths_frame)
        lambda_ = _get_glm_lambda(glm)

        # Train GLM with chosen lambda
        glm = H2OGeneralizedLinearEstimator(model_id="glm.hex",
                                            seed=self.seed,
                                            family=family,
                                            alpha=1,
                                            lambda_=lambda_,
                                            solver="COORDINATE_DESCENT",
                                            **glm_params)
        glm.train(y=y, training_frame=paths_frame)

    # Get Intercept
    intercept = _get_intercept(glm)

    # Get Rules
    rule_importance = _get_rules(glm, tree_models, self.algorithm)

    self.intercept = intercept
    self.rule_importance = rule_importance
    self.glm = glm
    self.tree_models = tree_models