def test(x, y, output_test, strip_part, algo_name, generic_algo_name, family):
    """Round-trip a GLM through MOJO export and a generic model import.

    A GLM is trained on the airlines data, exported as a MOJO, re-imported as
    an ``H2OGenericEstimator`` and compared against the original (parameters,
    printed output, predictions, model summary and re-exported MOJO size).

    :param x: predictor columns passed to ``train``.
    :param y: response column passed to ``train``.
    :param output_test: callable invoked with the two captured ``show()``
        outputs plus ``strip_part``, ``algo_name`` and ``generic_algo_name``.
    :param strip_part: forwarded unchanged to ``output_test``.
    :param algo_name: forwarded unchanged to ``output_test``.
    :param generic_algo_name: forwarded unchanged to ``output_test``.
    :param family: GLM family used for the trained model.
    """
    # GLM
    data_path = pyunit_utils.locate("smalldata/testng/airlines_train.csv")
    airlines = h2o.import_file(path=data_path)

    glm = H2OGeneralizedLinearEstimator(nfolds=3, family=family, alpha=1, lambda_=1)
    glm.train(x=x, y=y, training_frame=airlines, validation_frame=airlines)
    print(glm)
    with Capturing() as original_output:
        glm.show()

    # Export the trained model; download_mojo returns the path of the written file.
    export_dir = tempfile.mkdtemp()
    original_model_filename = glm.download_mojo(export_dir)

    generic = H2OGenericEstimator.from_file(original_model_filename)
    assert generic is not None
    print(generic)
    compare_params(glm, generic)
    with Capturing() as generic_output:
        generic.show()
    output_test(str(original_output), str(generic_output), strip_part, algo_name, generic_algo_name)

    predictions = generic.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421

    assert generic._model_json["output"]["model_summary"] is not None
    assert len(generic._model_json["output"]["model_summary"]._cell_values) > 0

    # Re-export from the generic model; both artifacts must have the same size.
    reexport_dir = tempfile.mkdtemp("zip", "genericMojo")
    generic_mojo_filename = generic.download_mojo(path=reexport_dir)
    assert os.path.getsize(generic_mojo_filename) == os.path.getsize(original_model_filename)
def mojo_model_glm_test():
    """Check that a GLM MOJO can be imported as a generic model and re-exported.

    The imported model must predict on the training frame, expose a non-empty
    model summary, and download to a MOJO of the same size as the original.
    """
    # GLM
    data_path = pyunit_utils.locate("smalldata/testng/airlines_train.csv")
    airlines = h2o.import_file(path=data_path)

    glm = H2OGeneralizedLinearEstimator(nfolds=3)
    glm.train(x=["Origin", "Dest"], y="Distance",
              training_frame=airlines, validation_frame=airlines)

    # download_mojo returns the path of the file it wrote into the directory.
    export_dir = tempfile.mkdtemp()
    original_model_filename = glm.download_mojo(export_dir)

    model = H2OGenericEstimator.from_file(original_model_filename)
    assert model is not None
    print(model)

    predictions = model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421

    assert model._model_json["output"]["model_summary"] is not None
    assert len(model._model_json["output"]["model_summary"]._cell_values) > 0

    reexport_dir = tempfile.mkdtemp("zip", "genericMojo")
    generic_mojo_filename = model.download_mojo(path=reexport_dir)
    original_size = os.path.getsize(original_model_filename)
    assert os.path.getsize(generic_mojo_filename) == original_size
def test_big_data_cars():
    """
    Test big data dataset, with metric logloss.

    Permutation importance is computed twice, once with ``n_samples=-1`` and
    once with ``n_samples=100``; each predictor must report a float
    "Relative Importance".
    """
    h2o_df = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/lending-club/loan.csv"))

    predictors = h2o_df.col_names
    response_col = h2o_df.col_names[12]  # loan amount
    predictors.remove(response_col)

    model = H2OGeneralizedLinearEstimator(family="binomial")
    model.train(y=response_col, x=predictors, training_frame=h2o_df)

    metric = "logloss"
    for sample_count in (-1, 100):
        pm_h2o_df = model.permutation_importance(h2o_df, use_pandas=True,
                                                 n_samples=sample_count, metric=metric)
        for pred in predictors:
            if pred != "Variable":
                # Relative PFI
                assert isinstance(pm_h2o_df.loc[pred, "Relative Importance"], float)
def bake(self) -> H2OGeneralizedLinearEstimator:
    """Train a default GLM on the stars frame with ``distance`` as response.

    The ``name1`` and ``name2`` columns are excluded from training.
    """
    stars = stars_frame()
    # The response column is required to be typed as int in this frame.
    assert stars.type("distance") == "int"

    glm = H2OGeneralizedLinearEstimator()
    glm.train(training_frame=stars, y="distance", ignored_columns=["name1", "name2"])
    return glm
def bake(self) -> H2OGeneralizedLinearEstimator:
    """Train a default GLM predicting ``sex`` on the first 5000 rows of the names frame."""
    names = names_frame()[:5000, :]

    # trim nlevels(): round-trip through a character column before re-factoring
    name_column = names["name"].ascharacter().asfactor()
    names["name"] = name_column
    # The categorical must have between 257 and 499 levels after trimming.
    assert 256 < names["name"].nlevels()[0] < 500

    glm = H2OGeneralizedLinearEstimator()
    glm.train(y="sex", training_frame=names)
    return glm
def test_glm_params():
    """Exercise the constructor and property setters of the GLM estimator.

    Valid values are assigned to a series of parameters (including several
    unusual spellings of ``family``); two invalid assignments are then
    expected to raise ``H2OTypeError``.
    """
    # Construction with and without keyword arguments.
    H2OGeneralizedLinearEstimator()
    H2OGeneralizedLinearEstimator(nfolds=5, seed=1000, alpha=0.5)

    columns = {
        "response": [1, 2, 3, 4, 5],
        "a": [0, 1, 0, 1, 0],
        "b": [-1, 3, 7, 11, 20],
        "n": [0] * 5,
        "w": [1] * 5
    }
    df = h2o.H2OFrame.from_python(columns)

    model = H2OGeneralizedLinearEstimator()

    # Frames and cross-validation settings.
    model.training_frame = df
    model.validation_frame = df
    model.nfolds = 3
    model.keep_cross_validation_predictions = True
    model.keep_cross_validation_fold_assignment = True
    model.fold_assignment = "random"
    model.fold_column = "b"

    # Column roles.
    model.response_column = "response"
    model.ignored_columns = ["x", "y"]
    model.ignore_const_cols = True
    model.score_each_iteration = True
    model.offset_column = "n"
    model.weights_column = "w"

    # Family is assigned with several different casings and decorations.
    model.family = "MultiNomial"
    model.family = "GAUSSIAN"
    model.family = "Twee-die"
    model.family = "'poIssoN'"
    model.tweedie_variance_power = 1
    model.tweedie_link_power = 2
    model.solver = "CoordinateDescentNaive"

    # An unknown fold assignment must be rejected.
    try:
        model.fold_assignment = "pseudo-random"
    except H2OTypeError:
        pass
    else:
        assert False

    # ignored_columns must be rejected when given a plain string.
    try:
        model.ignored_columns = "c"
    except H2OTypeError:
        pass
    else:
        assert False
def model(train, test):
    """Fit a Gamma GLM for ``Wait_Time`` and write its tables to CSV.

    All predictor columns are converted to factors, a cross-validated GLM is
    trained on ``train`` (validated on ``test``), and two timestamped CSV
    files are written: the coefficient table and the per-row predictions
    joined to the ``Unit`` / ``Week`` columns of ``test``.

    :param train: training data accepted by ``h2o.H2OFrame``; must contain a
        ``Wait_Time`` column.
    :param test: test data accepted by ``h2o.H2OFrame``. It is also indexed
        with ``test[['Unit', 'Week']]`` and merged on its index, so it is
        presumably a pandas DataFrame -- confirm against the caller.
    :returns: ``None``; the results are the two CSV files.
    """
    from h2o.estimators import H2OGeneralizedLinearEstimator

    today = datetime.datetime.today().strftime('%Y-%m-%d:%H:%M')
    response_column = 'Wait_Time'

    h2o_train = h2o.H2OFrame(train)
    h2o_test = h2o.H2OFrame(test)

    # Every column except the response and 'Unit' is a predictor. The
    # comparison must be an inequality: `c not in 'Unit'` is a substring test
    # and would also drop columns such as 'U' or 'nit'.
    predictor_columns = [
        c for c in h2o_train.drop(response_column).col_names if c != 'Unit'
    ]

    h2o_train[predictor_columns] = h2o_train[predictor_columns].asfactor()
    h2o_test[predictor_columns] = h2o_test[predictor_columns].asfactor()
    # train, valid = h2o_train.split_frame([.99],seed=615)

    glm_model = H2OGeneralizedLinearEstimator(
        family='Gamma',  # Gaussian , Gamma
        lambda_=0,
        alpha=0,
        compute_p_values=True,
        remove_collinear_columns=True,
        seed=615,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        nfolds=7)
    glm_model.train(predictor_columns,
                    response_column,
                    training_frame=h2o_train,
                    validation_frame=h2o_test)

    # Results are not used here; the calls are kept as in the original.
    glm_model.model_performance(h2o_train)
    glm_model.model_performance(h2o_test)

    prediction = glm_model.predict(h2o_test).as_data_frame()
    # NOTE(review): scaling factors (/60, *10) kept as-is; their units are not
    # documented in this function.
    prediction['pred_min'] = (prediction.predict / 60) * 10
    prediction['StdErr_min'] = (prediction.StdErr / 60)

    pred_table = test[['Unit', 'Week']].merge(prediction,
                                              how='outer',
                                              left_index=True,
                                              right_index=True)
    coef_table = glm_model._model_json['output'][
        'coefficients_table'].as_data_frame()

    coef_table.to_csv('/home/mark/Desktop/IB_docs/coef_table' + today + '.csv',
                      index=False)
    pred_table.to_csv('/home/mark/Desktop/IB_docs/pred_table' + today + '.csv',
                      index=False)
    return
def _get_glm_lambda(glm):
    """
    Get the best GLM lambda by choosing one diminishing returns on explained deviance

    The regularization path of ``glm`` is scanned for the first position
    (after index 0) where the sign of the second difference of the training
    explained deviance changes. When the path has no such position, the last
    lambda of the path is returned instead of raising ``IndexError``.

    :param glm: a GLM trained with lambda search.
    :returns: the selected entry of the path's ``lambdas``.
    """
    r = H2OGeneralizedLinearEstimator.getGLMRegularizationPath(glm)
    deviance = r.get('explained_deviance_train')

    # Non-zero entries mark a change in curvature of the deviance curve.
    curvature_flips = np.diff(np.sign(np.diff(deviance, 2)))
    # NOTE(review): the `i * 3` index mapping is kept from the original
    # heuristic; it can point past the end of short paths -- confirm intent.
    lambda_index = next(
        (i * 3 for i, flip in enumerate(curvature_flips) if flip != 0 and i > 0),
        len(deviance) - 1,  # no inflection found: use the end of the path
    )
    return r.get('lambdas')[lambda_index]
def _get_glm_coeffs(glm):
    """
    Get the GLM coefficients by choosing the lambda with diminishing returns on explained deviance

    The step of the regularization path is chosen as the first position
    (after index 0) where the sign of the second difference of the training
    explained deviance changes. When there is no such position, the last step
    of the path is used instead of raising ``IndexError``.

    :param glm: a GLM trained with lambda search.
    :returns: ``(intercept, coeffs)`` -- two dicts taken from the chosen step:
        the ``"Intercept"`` entry, and every other coefficient that is non-zero.
    """
    r = H2OGeneralizedLinearEstimator.getGLMRegularizationPath(glm)
    deviance = r.get('explained_deviance_train')

    curvature_flips = np.diff(np.sign(np.diff(deviance, 2)))
    # NOTE(review): the `i * 3` index mapping is kept from the original
    # heuristic; it can point past the end of short paths -- confirm intent.
    inflection_pt = next(
        (i * 3 for i, flip in enumerate(curvature_flips) if flip != 0 and i > 0),
        len(deviance) - 1,  # no inflection found: use the end of the path
    )

    # Look the chosen step up once and split it into intercept / rules.
    chosen = r.get('coefficients')[inflection_pt]
    intercept = {k: v for k, v in chosen.items() if k == "Intercept"}
    coeffs = {k: v for k, v in chosen.items() if abs(v) > 0 and k != "Intercept"}
    return intercept, coeffs
def demo_body(go):
    """
    Demo of H2O's Generalized Linear Estimator.

    This demo uploads a dataset to h2o, parses it, and shows a description.
    Then it divides the dataset into training and test sets, builds a GLM
    from the training set, and makes predictions for the test set.
    Finally, default performance metrics are displayed.

    :param go: zero-argument callable invoked before every step below
        (presumably a pacing hook supplied by the demo runner -- confirm).
    """
    go()
    # Connect to H2O
    h2o.init()

    go()
    # Upload the prostate dataset that comes included in the h2o python package
    prostate = h2o.upload_file(data_file("h2o_data/prostate.csv"))

    go()
    # Print a description of the prostate data
    prostate.summary()

    go()
    # Randomly split the dataset into ~70/30, training/test sets
    r = prostate[0].runif()
    train = prostate[r < 0.70]
    test = prostate[r >= 0.70]

    go()
    # Convert the response columns to factors (for binary classification problems)
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()

    go()
    # Build a (classification) GLM
    from h2o.estimators import H2OGeneralizedLinearEstimator
    prostate_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=[0.5])
    prostate_glm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"],
                       y="CAPSULE", training_frame=train)

    go()
    # Show the model
    prostate_glm.show()

    go()
    # Predict on the test set and show the first ten predictions
    predictions = prostate_glm.predict(test)
    predictions.show()

    go()
    # Show default performance metrics
    performance = prostate_glm.model_performance(test)
    performance.show()
def test_GLM_throws_ArrayOutOfBoundException():
    """Train a cross-validated GLM with lambda search over several alphas.

    The model is trained on a 5% split of the ``christine.arff`` data with
    column 0 as response; the test passes when training completes and one
    cross-validation model exists per fold.
    """
    nFold = 5
    fr = h2o.import_file(
        pyunit_utils.locate("bigdata/laptop/jira/christine.arff"))
    splitFrame = fr.split_frame(ratios=[0.05])

    glm = H2OGeneralizedLinearEstimator(family='binomial',
                                        nfolds=nFold,
                                        lambda_search=True,
                                        alpha=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
    glm.train(y=0, training_frame=splitFrame[0])

    cv_models = glm._model_json["output"]['cross_validation_models']
    # Message arguments are ordered (expected, actual) to match the text.
    assert len(cv_models) == nFold, \
        "expected number of cross_validation_model: {0}.  Actual number of cross_validation: " \
        "{1}".format(nFold, len(cv_models))
def _get_glm_lambda(glm):
    """
    Get the best GLM lambda by choosing one diminishing returns on explained deviance

    For regularization paths with fewer than five steps the last lambda is
    returned. Otherwise the path is scanned for the first position (after
    index 0) where the sign of the second difference of the training explained
    deviance changes; when no such position exists the last lambda is used as
    well, instead of raising ``IndexError``.

    :param glm: a GLM trained with lambda search.
    :returns: the selected entry of the path's ``lambdas``.
    """
    r = H2OGeneralizedLinearEstimator.getGLMRegularizationPath(glm)
    deviance = r.get('explained_deviance_train')

    # Default for short paths and for paths without an inflection point.
    lambda_index = len(deviance) - 1
    if len(deviance) >= 5:
        curvature_flips = np.diff(np.sign(np.diff(deviance, 2)))
        # NOTE(review): the `i * 3` index mapping is kept from the original
        # heuristic; it can exceed the path length -- confirm intent.
        lambda_index = next(
            (i * 3 for i, flip in enumerate(curvature_flips)
             if flip != 0 and i > 0),
            lambda_index,
        )
    return r.get('lambdas')[lambda_index]
def pubdev_5265():
    """Regression test: predicting with a categorical level unseen in training.

    A multinomial GLM is trained on a single categorical predictor that
    contains a missing value. The test frame contains level ``4``, which was
    not present during training; prediction must succeed, emit a warning
    about the unseen level, and yield the expected class counts.
    """
    training_data = {
        'response': [
            'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B', 'C', 'C',
            'C', 'C', 'C', 'C', 'C'
        ],
        'explanatory':
        ['nan', 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3]
    }
    test_data = {
        'response': [
            'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B', 'C', 'C',
            'C', 'C', 'C', 'C', 'C'
        ],
        'explanatory':
        ['nan', 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4]
    }

    training_data = h2o.H2OFrame(training_data)
    training_data['explanatory'] = training_data['explanatory'].asfactor()
    test_data = h2o.H2OFrame(test_data)
    test_data['explanatory'] = test_data['explanatory'].asfactor()

    glm_estimator = H2OGeneralizedLinearEstimator(
        family="multinomial",
        missing_values_handling="MeanImputation",
        seed=1234,
        Lambda=0)
    glm_estimator.train(x=["explanatory"],
                        y="response",
                        training_frame=training_data)

    # Training on the given dataset should not fail if there is a missing categorical variable (present in training dataset)
    with warnings.catch_warnings(record=True) as w:
        # Record every warning, regardless of filters installed elsewhere.
        warnings.simplefilter("always")
        grouped_occurances = glm_estimator.predict(test_data=test_data).group_by((0)).count().get_frame() \
            .as_data_frame()
        # Search all recorded warnings: later calls in the chain above may
        # append unrelated warnings after the one we are looking for.
        expected_warning = "Test/Validation dataset column 'explanatory' has levels not trained on: [4]"
        assert any(expected_warning in str(item.message) for item in w)

    # The very first value corresponding to 'A' in the explanatory variable column should be replaced by the mode value, which is 3.
    # As a result, 8 occurances of type C should be predicted
    # (`.values` replaces `as_matrix()`, which was removed from pandas; the
    # comparison is now asserted instead of being evaluated and discarded.)
    assert grouped_occurances.values.tolist() == [['A', 4], ['B', 6], ['C', 8]]
def main():
    """Train a binomial GLM on the prostate data, save it, and dump test rows.

    Steps: start H2O, load and describe the dataset, split ~70/30, train the
    model, show predictions and metrics on the test split, save the model to
    ``./h2o_model``, show predictions and metrics again, and write the test
    split to ``data.json``.
    """
    from h2o.estimators import H2OGeneralizedLinearEstimator

    def report(glm):
        # Show predictions and default metrics of `glm` on the test split.
        glm.predict(test).show()
        glm.model_performance(test).show()

    h2o.init()

    prostate = h2o.load_dataset("prostate")
    prostate.describe()

    train, test = prostate.split_frame(ratios=[0.70])
    for frame in (train, test):
        frame["CAPSULE"] = frame["CAPSULE"].asfactor()

    # Train model
    prostate_glm = H2OGeneralizedLinearEstimator(family="binomial",
                                                 alpha=[0.5])
    prostate_glm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"],
                       y="CAPSULE",
                       training_frame=train)
    prostate_glm.show()
    report(prostate_glm)

    # Export model
    model_path = h2o.save_model(prostate_glm, path="./h2o_model", force=True)
    print(model_path)

    # The in-memory model (not a reloaded one) is evaluated a second time.
    model = prostate_glm
    report(model)

    # Export test data
    df = test.as_data_frame()
    # NOTE(review): to_json() already returns a JSON string, so json.dump
    # writes it as a single quoted JSON string -- confirm the reader expects that.
    payload = df.to_json(orient='index')
    with open("data.json", "w") as f:
        json.dump(payload, f)
def test_GLM_throws_ArrayOutOfBoundException():
    """Train a cross-validated GLM with lambda search on the prostate data.

    The test passes when training completes and one cross-validation model
    exists per fold.
    """
    # everything in this test is important to cause the exception:
    #  - GLEASON as a categorical
    #  - lambda search enabled
    #  - alphas
    #  - CV enabled
    df = h2o.import_file(
        path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    target = "CAPSULE"
    nFold = 5
    for col in [target, 'GLEASON']:
        df[col] = df[col].asfactor()

    glm = H2OGeneralizedLinearEstimator(
        lambda_search=True,
        alpha=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
        nfolds=nFold,
        seed=12345)
    glm.train(y=target, training_frame=df)

    cv_models = glm._model_json["output"]['cross_validation_models']
    # Message arguments are ordered (expected, actual) to match the text.
    assert len(cv_models) == nFold, \
        "expected number of cross_validation_model: {0}.  Actual number of cross_validation: " \
        "{1}".format(nFold, len(cv_models))
def _get_glm_lambda(glm, num_rules):
    """
    Get the best GLM lambda by choosing one diminishing returns on explained deviance

    With ``num_rules`` set, the first step of the regularization path whose
    number of non-zero, non-intercept coefficients exceeds ``num_rules`` is
    chosen. Without it, the first position (after index 0) where the sign of
    the second difference of the training explained deviance changes is used.
    In both modes the last lambda of the path is returned when no step
    qualifies, instead of raising ``IndexError``.

    :param glm: a GLM trained with lambda search.
    :param num_rules: The number of rules to use in rulefit model.
    :returns: the selected entry of the path's ``lambdas``.
    """
    r = H2OGeneralizedLinearEstimator.getGLMRegularizationPath(glm)
    deviance = r.get('explained_deviance_train')
    last_index = len(deviance) - 1

    if num_rules is None:
        curvature_flips = np.diff(np.sign(np.diff(deviance, 2)))
        # NOTE(review): the `i * 3` index mapping is kept from the original
        # heuristic; it can exceed the path length -- confirm intent.
        lambda_index = next(
            (i * 3 for i, flip in enumerate(curvature_flips)
             if flip != 0 and i > 0),
            last_index,
        )
    else:
        # Active rules per step; only needed in this branch.
        rule_count = [
            len([k for k, v in step.items() if abs(v) > 0 and k != "Intercept"])
            for step in r.get('coefficients')
        ]
        lambda_index = next(
            (idx for idx, val in enumerate(rule_count) if val > num_rules),
            last_index,
        )
    return r.get('lambdas')[lambda_index]
def construct_model(self):
    """Instantiate the estimator selected by ``self.model_type`` and ``self.index``.

    ``self.model_type == 'C'`` selects the classifier table; any other value
    selects the regressor table. ``self.index`` (1-8) picks the entry and
    ``self.parameters`` is expanded into its constructor.

    :returns: the newly constructed, untrained estimator.
    :raises ValueError: if ``self.index`` is not in 1..8 (previously this
        surfaced as an ``UnboundLocalError`` on the return statement).
    """
    index = self.index
    if self.model_type == 'C':
        if index == 1:
            p_model = H2OGeneralizedLinearEstimator(**self.parameters)
        elif index == 2:
            p_model = DecisionTreeClassifier(**self.parameters)
        elif index == 3:
            p_model = GaussianNB(**self.parameters)
        elif index == 4:
            p_model = SVC(**self.parameters)
        elif index == 5:
            p_model = RandomForestClassifier(**self.parameters)
        elif index == 6:
            # Was `self.paraemters`, which raised AttributeError.
            p_model = GradientBoostingClassifier(**self.parameters)
        elif index == 7:
            p_model = ExtraTreesClassifier(**self.parameters)
        elif index == 8:
            p_model = SGDClassifier(**self.parameters)
        else:
            raise ValueError("Unsupported model index: {!r}".format(index))
    else:
        if index == 1:
            p_model = LinearRegression(**self.parameters)
        elif index == 2:
            # NOTE(review): a classifier in the regression table looks like a
            # copy-paste slip (DecisionTreeRegressor?) -- left unchanged
            # because that name may not be imported in this file.
            p_model = DecisionTreeClassifier(**self.parameters)
        elif index == 3:
            p_model = BayesianRidge(**self.parameters)
        elif index == 4:
            p_model = SVR(**self.parameters)
        elif index == 5:
            p_model = RandomForestRegressor(**self.parameters)
        elif index == 6:
            p_model = GradientBoostingRegressor(**self.parameters)
        elif index == 7:
            p_model = ExtraTreesRegressor(**self.parameters)
        elif index == 8:
            p_model = SGDRegressor(**self.parameters)
        else:
            raise ValueError("Unsupported model index: {!r}".format(index))
    return p_model
def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None, weights_column=None,
          validation_frame=None, **params):
    """
    Train the rulefit model.

    One random forest is trained per depth in ``[min_depth, max_depth]``; the
    leaf assignments of every forest become the columns of a "paths" frame.
    A lasso GLM with lambda search is fitted on that frame, a lambda is picked
    from its regularization path, and a final GLM is refitted with it. The
    results are stored on ``self`` (``intercept``, ``rule_importance``,
    ``glm``, ``rf_models``).

    ``offset_column``, ``fold_column``, ``weights_column``,
    ``validation_frame`` and ``**params`` are accepted but not used here.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).

    :examples:
    >>> rulefit = H2ORuleFit()
    >>> training_data = h2o.import_file("smalldata/gbm_test/titanic.csv",
    ...                                  col_types = {'pclass': "enum", 'survived': "enum"})
    >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
    >>> rulefit.train(x=x,y="survived",training_frame=training_data)
    >>> rulefit
    """
    # The GLM family follows the response type and cardinality.
    if training_frame.type(y) != "enum":
        family = "gaussian"
    elif training_frame[y].unique().nrow > 2:
        family = "multinomial"
    else:
        family = "binomial"

    # Get paths from random forest models
    paths_frame = training_frame[y]
    rf_models = dict()
    for model_idx, depth in enumerate(range(self.min_depth, self.max_depth + 1)):
        # Train random forest models
        forest = H2ORandomForestEstimator(seed=self.seed,
                                          model_id="rf_{}.hex".format(str(model_idx)),
                                          max_depth=depth)
        forest.train(y=y, x=x, training_frame=training_frame)
        rf_models[model_idx] = forest

        # Prefix the leaf-assignment columns with the forest they came from.
        paths = forest.predict_leaf_node_assignment(training_frame)
        paths.col_names = ["rf_{0}.{1}".format(str(model_idx), col) for col in paths.col_names]
        paths_frame = paths_frame.cbind(paths)

    # Extract important paths
    search_glm = H2OGeneralizedLinearEstimator(model_id="glm.hex",
                                               nfolds=self.nfolds,
                                               seed=self.seed,
                                               family=family,
                                               alpha=1,
                                               remove_collinear_columns=True,
                                               lambda_search=True)
    search_glm.train(y=y, training_frame=paths_frame)
    lambda_ = _get_glm_lambda(search_glm, self.num_rules)

    # Train GLM with chosen lambda
    glm = H2OGeneralizedLinearEstimator(model_id="glm.hex",
                                        seed=self.seed,
                                        family=family,
                                        alpha=1,
                                        remove_collinear_columns=True,
                                        lambda_=lambda_,
                                        solver="COORDINATE_DESCENT")
    glm.train(y=y, training_frame=paths_frame)

    # Get Intercept, then Rules, and publish everything on the instance.
    self.intercept = _get_intercept(glm)
    self.rule_importance = _get_rules(glm, rf_models)
    self.glm = glm
    self.rf_models = rf_models
testing_frame = ProcessData.testData(moving_average=True, standard_deviation=True, probability_from_file=True) # create h2o frames train = h2o.H2OFrame(training_frame) test = h2o.H2OFrame(testing_frame) train.set_names(list(training_frame.columns)) test.set_names(list(testing_frame.columns)) # Feature selection training_columns = list(training_frame.columns) training_columns.remove(response_column) training_columns.remove("UnitNumber") training_columns.remove("Time") # Build model model4 = H2OGeneralizedLinearEstimator() # Train model model4.train(x=training_columns, y=response_column, training_frame=train) # End : Generalized Linear Modeling # ---------------------------------------------------------------------------------------------------------------------- # Prediction # ---------------------------------------------------------------------------------------------------------------------- print "Begin Prdiction" print "---------------" # ground truth tY = np.array(testing_frame['RUL'])
def bake(self) -> H2OGeneralizedLinearEstimator:
    """Run a default GLM ``train`` call on the missing frame.

    No response column is passed to ``train``.
    """
    frame = missing_frame()
    glm = H2OGeneralizedLinearEstimator()
    glm.train(training_frame=frame)
    return glm
def bake(self) -> H2OGeneralizedLinearEstimator:
    """Train a default GLM on the eyestate frame with ``eyeDetection`` as response."""
    eyestate = eyestate_frame()
    glm = H2OGeneralizedLinearEstimator()
    glm.train(training_frame=eyestate, y="eyeDetection")
    return glm
def train(self, x=None, y=None, training_frame=None):
    """
    Train the rulefit model.

    One tree model is trained per rule length in
    ``[min_rule_len, max_rule_len]``; the leaf assignments of every model
    become the columns of a "paths" frame on which a lasso GLM is fitted.
    With ``max_num_rules`` set, the GLM is limited through
    ``max_active_predictors``; otherwise a lambda search is run, a lambda is
    picked from the regularization path, and the GLM is refitted with it.
    Results are stored on ``self`` (``intercept``, ``rule_importance``,
    ``glm``, ``tree_models``).

    ``self.glm_params`` is not modified by this call.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
    :raises H2OValueError: if the response is categorical with more than two levels.

    :examples:
    >>> rulefit = H2ORuleFit()
    >>> training_data = h2o.import_file("smalldata/gbm_test/titanic.csv",
    ...                                  col_types = {'pclass': "enum", 'survived': "enum"})
    >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
    >>> rulefit.train(x=x,y="survived",training_frame=training_data)
    >>> rulefit
    """
    # Work on a private copy: popping "family" from self.glm_params would make
    # a second train() call silently fall back to the gaussian family.
    glm_params = dict(self.glm_params)

    if training_frame.type(y) == "enum":
        if training_frame[y].unique().nrow > 2:
            raise H2OValueError("multinomial use cases not yet supported")
        family = "binomial"
    elif glm_params.get("family") is not None:
        # A user-supplied family applies to non-categorical responses only; it
        # is removed from the kwargs so it is not passed twice below.
        family = glm_params.pop("family")
    else:
        family = "gaussian"

    # Get paths from random forest models
    paths_frame = training_frame[y]
    tree_models = dict()
    for model_idx, depth in enumerate(range(self.min_rule_len, self.max_rule_len + 1)):
        # Train tree models
        tree_model = _tree_model(self.algorithm, depth, self.seed, model_idx, self.tree_params)
        tree_model.train(y=y, x=x, training_frame=training_frame)
        tree_models[model_idx] = tree_model

        # Prefix the leaf-assignment columns with the model they came from.
        paths = tree_model.predict_leaf_node_assignment(training_frame)
        paths.col_names = ["tree_{0}.{1}".format(str(model_idx), col) for col in paths.col_names]
        paths_frame = paths_frame.cbind(paths)

    if self.max_num_rules:
        # Train GLM with chosen lambda
        glm = H2OGeneralizedLinearEstimator(model_id="glm.hex",
                                            seed=self.seed,
                                            family=family,
                                            alpha=1,
                                            max_active_predictors=self.max_num_rules + 1,
                                            **glm_params)
        glm.train(y=y, training_frame=paths_frame)
    else:
        # Get optimal lambda
        glm = H2OGeneralizedLinearEstimator(model_id="glm.hex",
                                            nfolds=self.nfolds,
                                            seed=self.seed,
                                            family=family,
                                            alpha=1,
                                            lambda_search=True,
                                            **glm_params)
        glm.train(y=y, training_frame=paths_frame)
        lambda_ = _get_glm_lambda(glm)

        # Train GLM with chosen lambda
        glm = H2OGeneralizedLinearEstimator(model_id="glm.hex",
                                            seed=self.seed,
                                            family=family,
                                            alpha=1,
                                            lambda_=lambda_,
                                            solver="COORDINATE_DESCENT",
                                            **glm_params)
        glm.train(y=y, training_frame=paths_frame)

    # Get Intercept
    intercept = _get_intercept(glm)

    # Get Rules
    rule_importance = _get_rules(glm, tree_models, self.algorithm)

    self.intercept = intercept
    self.rule_importance = rule_importance
    self.glm = glm
    self.tree_models = tree_models
def generate_and_import_combined_pojo():
    """Build a combined GLM + GBM POJO, import it, and verify its predictions.

    The imported model is checked in three modes driven by the ``Bias``
    column: ``1`` must reproduce the GLM predictions, ``0`` the GBM
    predictions, and ``NaN`` the per-segment behavior keyed by
    ``ChangeWindDirect``.
    """
    if sys.version_info[0] < 3:  # Python 2
        print("This example needs Python 3.x+")
        return

    weather_orig = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/weather.csv"))
    weather = weather_orig  # working copy

    features = sorted(set(weather.names) - {"Date", "RainTomorrow", "Sunshine"})
    response = "RISK_MM"

    glm_model = H2OGeneralizedLinearEstimator()
    glm_model.train(x=features, y=response, training_frame=weather)
    glm_preds = glm_model.predict(weather)

    gbm_model = H2OGradientBoostingEstimator(ntrees=5)
    gbm_model.train(x=features, y=response, training_frame=weather)
    gbm_preds = gbm_model.predict(weather)

    # Drop columns that we will calculate in POJO manually (we will recreate them in POJO to be the exact same)
    for derived_column in ("ChangeTemp", "ChangeTempDir"):
        weather = weather.drop(derived_column)

    combined_pojo_path = generate_combined_pojo(glm_model, gbm_model)
    print("Combined POJO was stored in: " + combined_pojo_path)

    # FIXME: https://h2oai.atlassian.net/browse/PUBDEV-8561 We need to make this work for upload_mojo as well
    pojo_model = h2o.import_mojo(combined_pojo_path)

    # Testing begins

    # Sanity test - test parameterization that delegates to GLM
    weather["Bias"] = 1  # behave like GLM
    pojo_glm_preds = pojo_model.predict(weather)
    assert_frame_equal(pojo_glm_preds.as_data_frame(), glm_preds.as_data_frame())

    # Sanity test - test parameterization that delegates to GBM
    weather["Bias"] = 0  # behave like GBM
    pojo_gbm_preds = pojo_model.predict(weather)
    assert_frame_equal(pojo_gbm_preds.as_data_frame(), gbm_preds.as_data_frame())

    # Test per-segment specific behavior, segments are defined by ChangeWindDirect
    weather["Bias"] = float("NaN")
    for change_wind_dir in weather["ChangeWindDirect"].levels()[0]:
        segment_mask = weather["ChangeWindDirect"] == change_wind_dir
        weather_cwd = weather[segment_mask]
        weather_orig_cwd = weather_orig[weather_orig["ChangeWindDirect"] == change_wind_dir]
        pojo_weather_cwd_preds = pojo_model.predict(weather_cwd)

        if change_wind_dir in ("c", "l"):
            expected = glm_model.predict(weather_orig_cwd) * 2
        elif change_wind_dir == "n":
            expected = (glm_model.predict(weather_orig_cwd) + gbm_model.predict(weather_orig_cwd)) / 2
        elif change_wind_dir == "s":
            expected = gbm_model.predict(weather_orig_cwd)
        else:
            # Any other segment is scored but not checked.
            continue
        assert_frame_equal(pojo_weather_cwd_preds.as_data_frame(), expected.as_data_frame())
def bake(self) -> H2OGeneralizedLinearEstimator:
    """Train a default GLM on the titanic frame with ``parch`` as a categorical response.

    The ``name``, ``ticket``, ``boat`` and ``home.dest`` columns are excluded
    from training.
    """
    titanic = titanic_frame()
    # Convert the response to a factor before training.
    titanic["parch"] = titanic["parch"].asfactor()

    excluded = ["name", "ticket", "boat", "home.dest"]
    glm = H2OGeneralizedLinearEstimator()
    glm.train(y="parch", training_frame=titanic, ignored_columns=excluded)
    return glm
def bake(self) -> H2OGeneralizedLinearEstimator:
    """Train a default GLM on the iris frame with ``Species`` as response."""
    iris = iris_frame()
    glm = H2OGeneralizedLinearEstimator()
    glm.train(training_frame=iris, y="Species")
    return glm
def bake(self) -> H2OGeneralizedLinearEstimator:
    """Train a default GLM on the cars frame with ``mpg`` as response.

    The ``name`` column is excluded from training.
    """
    cars = cars_frame()
    glm = H2OGeneralizedLinearEstimator()
    glm.train(training_frame=cars, y="mpg", ignored_columns=["name"])
    return glm
response_column = Dataset.RESPONSE_COLUMN
# 'city' is not used as a predictor.
input_columns.remove('city')

# Start h2o server
h2o.init()

# Create h2o frame
training_frame = h2o.H2OFrame(pd_train)
training_frame.set_names(list(pd_train.columns))

# Measurements
mae = []  # Mean Absolute Errors for model
rmse = []  # Root Mean Squared Errors for model

# Train the same 10-fold cross-validated GLM n_iterations times and collect
# the error metrics of every run.
for i in range(n_iterations):
    model = H2OGeneralizedLinearEstimator(nfolds=10)
    model.train(x=input_columns, y=response_column, training_frame=training_frame)
    mae.append(model.mae())
    rmse.append(model.rmse())

print("Model : Single")
print("--------------")
for label, value in (
    ("Average MAE : ", numpy.average(mae)),
    ("Average RMSE : ", numpy.average(rmse)),
    ("MAE Standard Dev : ", numpy.std(mae)),
    ("RMSE Standard Dev : ", numpy.std(rmse)),
):
    print(label + str(value))
def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None, weights_column=None,
          validation_frame=None, **params):
    """
    Train the rulefit model.

    One random forest is trained per depth in ``[min_depth, max_depth]``; the
    leaf assignments of every forest become the columns of a "paths" frame.
    A lasso GLM with lambda search is fitted on that frame and the selected
    coefficients are translated back into human-readable rules by walking the
    corresponding trees. Results are stored in ``self.intercept`` and
    ``self.rule_importance`` (a pandas DataFrame with ``variable``,
    ``coefficient`` and ``rule`` columns, sorted by absolute coefficient).

    Every forest gets its own ``model_id`` (``rf_<idx>.hex``); sharing one id
    made the models indistinguishable when their trees were fetched later.

    ``offset_column``, ``fold_column``, ``weights_column``,
    ``validation_frame`` and ``**params`` are accepted but not used here.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
    :raises H2OValueError: if the response is categorical with more than two levels.

    :examples:
    >>> rulefit = H2ORuleFit()
    >>> training_data = h2o.import_file("smalldata/gbm_test/titanic.csv",
    ...                                  col_types = {'pclass': "enum", 'survived': "enum"})
    >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
    >>> rulefit.train(x=x,y="survived",training_frame=training_data)
    >>> rulefit
    """
    family = "gaussian"
    if training_frame.type(y) == "enum":
        if training_frame[y].unique().nrow > 2:
            raise H2OValueError("Multinomial not supported")
        family = "binomial"

    # Get paths from random forest models
    paths_frame = training_frame[y]
    rf_models = []
    for model_idx, depth in enumerate(range(self.min_depth, self.max_depth + 1)):
        # Train random forest models
        rf_model = H2ORandomForestEstimator(seed=self.seed,
                                            model_id="rf_{}.hex".format(model_idx),
                                            max_depth=depth)
        rf_model.train(y=y, x=x, training_frame=training_frame)
        rf_models.append(rf_model)

        # Prefix the leaf-assignment columns with the forest they came from.
        paths = rf_model.predict_leaf_node_assignment(training_frame)
        paths.col_names = ["rf_" + str(model_idx) + "." + col for col in paths.col_names]
        paths_frame = paths_frame.cbind(paths)

    # Extract important paths
    glm = H2OGeneralizedLinearEstimator(model_id="glm.hex",
                                        nfolds=self.nfolds,
                                        seed=self.seed,
                                        family=family,
                                        alpha=1,
                                        remove_collinear_columns=True,
                                        lambda_search=True)
    glm.train(y=y, training_frame=paths_frame)

    intercept, rule_importance = _get_glm_coeffs(glm)
    rule_importance = pd.DataFrame.from_dict(rule_importance, orient="index").reset_index()
    rule_importance.columns = ["variable", "coefficient"]

    # Convert paths to rules
    rules = []
    for variable in rule_importance.variable:
        # Coefficient names encode "<forest>.<tree>[.<class>].<path>"; strip
        # the prefixes so that three dot-separated fields remain.
        stripped = variable.replace("rf_", "").replace("T", "")
        if family == "binomial":
            stripped = stripped.replace("C1.", "")
        model_num, tree_num, path = stripped.split(".")
        tree = H2OTree(rf_models[int(model_num)], int(tree_num) - 1)
        rules.append(_tree_traverser(tree.root_node, path))

    # Add rules and order by absolute coefficient
    rule_importance["rule"] = rules
    rule_importance["abs_coefficient"] = rule_importance["coefficient"].abs()
    # Keep, for each distinct rule, the row with the largest absolute coefficient.
    rule_importance = rule_importance.loc[rule_importance.groupby(["rule"])["abs_coefficient"].idxmax()]
    rule_importance = rule_importance.sort_values(by="abs_coefficient", ascending=False)
    rule_importance = rule_importance.drop("abs_coefficient", axis=1)

    self.intercept = intercept
    self.rule_importance = rule_importance