Python H2OGeneralizedLinearEstimator示例，h2o.estimators.H2OGeneralizedLinearEstimator Python示例

示例#1

0

显示文件

def test(x, y, output_test, strip_part, algo_name, generic_algo_name, family):
    """
    Round-trip a GLM through a MOJO file and compare it with the reloaded generic model.

    :param x: predictor columns passed to ``glm.train``.
    :param y: response column passed to ``glm.train``.
    :param output_test: callable that compares the two captured ``show()`` outputs; it is called
        with both outputs as strings followed by ``strip_part``, ``algo_name`` and
        ``generic_algo_name``.
    :param strip_part: forwarded unchanged to ``output_test``.
    :param algo_name: forwarded unchanged to ``output_test``.
    :param generic_algo_name: forwarded unchanged to ``output_test``.
    :param family: GLM family used to build the original model.
    """
    data_path = pyunit_utils.locate("smalldata/testng/airlines_train.csv")
    airlines = h2o.import_file(path=data_path)

    # Train the reference GLM and capture its textual summary
    glm = H2OGeneralizedLinearEstimator(nfolds=3, family=family, alpha=1, lambda_=1)
    glm.train(x=x, y=y, training_frame=airlines, validation_frame=airlines)
    print(glm)
    with Capturing() as original_output:
        glm.show()

    # Export the model as a MOJO into a fresh temporary directory
    original_mojo_path = glm.download_mojo(tempfile.mkdtemp())

    # Re-import the MOJO as a generic model and capture its summary as well
    generic_model = H2OGenericEstimator.from_file(original_mojo_path)
    assert generic_model is not None
    print(generic_model)
    compare_params(glm, generic_model)
    with Capturing() as generic_output:
        generic_model.show()

    output_test(str(original_output), str(generic_output), strip_part, algo_name, generic_algo_name)

    # The reloaded model must score the whole frame (24421 prediction rows expected)
    predictions = generic_model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421
    summary = generic_model._model_json["output"]["model_summary"]
    assert summary is not None
    assert len(summary._cell_values) > 0

    # Downloading the MOJO again from the generic model must yield a file of the same size
    generic_mojo_path = generic_model.download_mojo(path=tempfile.mkdtemp("zip", "genericMojo"))
    assert os.path.getsize(generic_mojo_path) == os.path.getsize(original_mojo_path)

示例#2

0

显示文件

def mojo_model_glm_test():
    """
    Train a GLM, export it as a MOJO, reload it as a generic model and check the reloaded model.

    The reloaded model must predict on the training frame, expose a non-empty model summary,
    and re-export to a MOJO file of the same size as the original one.
    """
    data_path = pyunit_utils.locate("smalldata/testng/airlines_train.csv")
    airlines = h2o.import_file(path=data_path)

    glm = H2OGeneralizedLinearEstimator(nfolds=3)
    glm.train(x=["Origin", "Dest"], y="Distance", training_frame=airlines, validation_frame=airlines)

    # Export into a fresh temporary directory; download_mojo returns the written file's path
    original_mojo_path = glm.download_mojo(tempfile.mkdtemp())

    model = H2OGenericEstimator.from_file(original_mojo_path)
    assert model is not None
    print(model)

    # 24421 prediction rows are expected when scoring the training frame
    predictions = model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421
    summary = model._model_json["output"]["model_summary"]
    assert summary is not None
    assert len(summary._cell_values) > 0

    # A MOJO downloaded from the generic model must match the original in size
    generic_mojo_path = model.download_mojo(path=tempfile.mkdtemp("zip", "genericMojo"))
    assert os.path.getsize(generic_mojo_path) == os.path.getsize(original_mojo_path)

示例#3

0

显示文件

文件： pyunit_pubdev_7139_permutation_var_imp.py 项目： wwjiang007/h2o-3

def test_big_data_cars():
    """
    Check permutation variable importance of a binomial GLM on the lending-club loan data.

    The importance table is requested twice with the logloss metric, once with
    ``n_samples=-1`` and once with ``n_samples=100``, and every predictor must have a float
    "Relative Importance" entry in both tables.
    """
    h2o_df = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/lending-club/loan.csv"))
    predictors = h2o_df.col_names
    response_col = h2o_df.col_names[12]  # original note: loan amount -- TODO confirm
    predictors.remove(response_col)

    model = H2OGeneralizedLinearEstimator(family="binomial")
    model.train(y=response_col, x=predictors, training_frame=h2o_df)

    metric = "logloss"

    # Same check for both sampling settings (presumably -1 means "use all rows" -- confirm)
    for sample_count in (-1, 100):
        importance = model.permutation_importance(h2o_df, use_pandas=True, n_samples=sample_count, metric=metric)
        for column in predictors:
            # a predictor literally named "Variable" is not checked
            if column != "Variable":
                assert isinstance(importance.loc[column, "Relative Importance"], float)  # Relative PFI

示例#4

0

显示文件

文件： stars_glm_recipe.py 项目： h2oai/mojoland

 def bake(self) -> H2OGeneralizedLinearEstimator:
     """Train a default GLM that predicts ``distance`` on the stars frame, ignoring the name columns."""
     stars = stars_frame()
     # the response is expected to be an integer column
     assert stars.type("distance") == "int"
     glm = H2OGeneralizedLinearEstimator()
     glm.train(training_frame=stars, y="distance", ignored_columns=["name1", "name2"])
     return glm

示例#5

0

显示文件

文件： names_glm_recipe.py 项目： h2oai/mojoland

 def bake(self) -> H2OGeneralizedLinearEstimator:
     """Train a default GLM that predicts ``sex`` on the first 5000 rows of the names frame."""
     subset = names_frame()[:5000, :]
     # Round-trip through character and back to factor (original note: "trim nlevels()")
     subset["name"] = subset["name"].ascharacter().asfactor()
     level_count = subset["name"].nlevels()[0]
     assert 256 < level_count < 500
     glm = H2OGeneralizedLinearEstimator()
     glm.train(training_frame=subset, y="sex")
     return glm

示例#6

0

显示文件

文件： pyunit_glm_parameters.py 项目： Kendralabs/h2o-4

def test_glm_params():
    """
    Exercise the parameter setters of H2OGeneralizedLinearEstimator.

    Valid values (including oddly-cased or punctuated family names) must be accepted, while an
    unknown fold assignment and a non-list ``ignored_columns`` must raise ``H2OTypeError``.
    """
    # Construction with and without keyword arguments must not fail
    H2OGeneralizedLinearEstimator()
    H2OGeneralizedLinearEstimator(nfolds=5, seed=1000, alpha=0.5)

    columns = {
        "response": [1, 2, 3, 4, 5],
        "a": [0, 1, 0, 1, 0],
        "b": [-1, 3, 7, 11, 20],
        "n": [0, 0, 0, 0, 0],
        "w": [1, 1, 1, 1, 1],
    }
    df = h2o.H2OFrame.from_python(columns)

    model = H2OGeneralizedLinearEstimator()

    # Assignments are applied in this exact order; "family" is deliberately set several times
    accepted_settings = [
        ("training_frame", df),
        ("validation_frame", df),
        ("nfolds", 3),
        ("keep_cross_validation_predictions", True),
        ("keep_cross_validation_fold_assignment", True),
        ("fold_assignment", "random"),
        ("fold_column", "b"),
        ("response_column", "response"),
        ("ignored_columns", ["x", "y"]),
        ("ignore_const_cols", True),
        ("score_each_iteration", True),
        ("offset_column", "n"),
        ("weights_column", "w"),
        ("family", "MultiNomial"),
        ("family", "GAUSSIAN"),
        ("family", "Twee-die"),
        ("family", "'poIssoN'"),
        ("tweedie_variance_power", 1),
        ("tweedie_link_power", 2),
        ("solver", "CoordinateDescentNaive"),
    ]
    for attribute, value in accepted_settings:
        setattr(model, attribute, value)

    # Each of these assignments must be rejected with H2OTypeError
    rejected_settings = [
        ("fold_assignment", "pseudo-random"),
        ("ignored_columns", "c"),
    ]
    for attribute, bad_value in rejected_settings:
        try:
            setattr(model, attribute, bad_value)
            assert False
        except H2OTypeError:
            pass

示例#7

0

显示文件

def model(train, test):
    """
    Fit a Gamma GLM for ``Wait_Time`` with H2O and export coefficients and predictions to CSV.

    Every column except ``Wait_Time`` and ``Unit`` is used as a categorical predictor. The
    model is trained on ``train`` with ``test`` as the validation frame, using 7-fold
    cross-validation with modulo fold assignment. Two CSV files (coefficient table and
    prediction table) are written under ``/home/mark/Desktop/IB_docs`` with a timestamp suffix.

    :param train: training data accepted by ``h2o.H2OFrame``; must contain ``Wait_Time``.
    :param test: test data accepted by ``h2o.H2OFrame``; it is also indexed with
        ``test[['Unit', 'Week']]`` and merged, so presumably a pandas DataFrame -- confirm.
    :returns: None.
    """
    from h2o.estimators import H2OGeneralizedLinearEstimator

    # Timestamp suffix that distinguishes the CSV files of different runs
    today = datetime.datetime.today().strftime('%Y-%m-%d:%H:%M')

    h2o_train = h2o.H2OFrame(train)
    h2o_test = h2o.H2OFrame(test)

    response_column = 'Wait_Time'
    # Exclude exactly the 'Unit' column. The previous ``c not in 'Unit'`` was a substring test
    # against the string 'Unit' and would also drop columns named e.g. 'U' or 'nit'.
    predictor_columns = [
        c for c in h2o_train.drop(response_column).col_names if c != 'Unit'
    ]

    # All predictors are treated as categorical
    h2o_train[predictor_columns] = h2o_train[predictor_columns].asfactor()
    h2o_test[predictor_columns] = h2o_test[predictor_columns].asfactor()

    glm_model = H2OGeneralizedLinearEstimator(
        family='Gamma',  # Gaussian , Gamma
        lambda_=0,
        alpha=0,
        compute_p_values=True,
        remove_collinear_columns=True,
        seed=615,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        nfolds=7)

    glm_model.train(predictor_columns,
                    response_column,
                    training_frame=h2o_train,
                    validation_frame=h2o_test)

    # Results are not used here; the calls are kept as in the original
    glm_model.model_performance(h2o_train)
    glm_model.model_performance(h2o_test)

    prediction = glm_model.predict(h2o_test).as_data_frame()
    # NOTE(review): scaling factors kept from the original; units are not visible here
    prediction['pred_min'] = (prediction.predict / 60) * 10
    prediction['StdErr_min'] = (prediction.StdErr / 60)
    # Attach the identifying columns of the test rows to the predictions by row index
    pred_table = test[['Unit', 'Week']].merge(prediction,
                                              how='outer',
                                              left_index=True,
                                              right_index=True)

    coef_table = glm_model._model_json['output'][
        'coefficients_table'].as_data_frame()

    coef_table.to_csv('/home/mark/Desktop/IB_docs/coef_table' + today + '.csv',
                      index=False)
    pred_table.to_csv('/home/mark/Desktop/IB_docs/pred_table' + today + '.csv',
                      index=False)

示例#8

0

显示文件

def _get_glm_lambda(glm):
    """
    Get the best GLM lambda by choosing the one with diminishing returns on explained deviance.

    :param glm: a GLM whose regularization path is read with ``getGLMRegularizationPath``.
    :returns: the entry of the path's ``lambdas`` at the selected index.
    :raises IndexError: if the path has at least 5 entries but no candidate index is found,
        or if the selected index lies outside ``lambdas``.
    """
    r = H2OGeneralizedLinearEstimator.getGLMRegularizationPath(glm)
    deviance = r.get('explained_deviance_train')

    if len(deviance) < 5:
        # With fewer than 5 points the curvature test below can never yield a candidate
        # (only position 0 exists and it is excluded), so fall back to the last lambda.
        lambda_index = len(deviance) - 1
    else:
        # Positions where the sign of the second difference of the deviance changes
        sign_changes = np.diff(np.sign(np.diff(deviance, 2)))
        # NOTE(review): the factor 3 is kept from the original; its rationale is not visible here
        lambda_index = [i * 3 for i, x in enumerate(sign_changes) if x != 0 and i > 0][0]

    return r.get('lambdas')[lambda_index]

示例#9

0

显示文件

文件： rulefit.py 项目： Fahad021/h2o-tutorials

def _get_glm_coeffs(glm):
    """
    Get the GLM coefficients at the lambda with diminishing returns on explained deviance.

    :param glm: a GLM whose regularization path is read with ``getGLMRegularizationPath``.
    :returns: a pair ``(intercept, coeffs)`` of dicts: the "Intercept" entry, and every other
        coefficient whose absolute value is greater than zero.
    """
    path = H2OGeneralizedLinearEstimator.getGLMRegularizationPath(glm)
    deviance = path.get('explained_deviance_train')

    # Positions where the sign of the second difference of the deviance changes
    sign_changes = np.diff(np.sign(np.diff(deviance, 2)))
    candidates = [idx * 3 for idx, change in enumerate(sign_changes) if change != 0 and idx > 0]
    inflection_pt = candidates[0]

    chosen = path.get('coefficients')[inflection_pt]
    intercept = {}
    coeffs = {}
    for name, value in chosen.items():
        if name == "Intercept":
            intercept[name] = value
        elif abs(value) > 0:
            coeffs[name] = value
    return intercept, coeffs

示例#10

0

显示文件

    def demo_body(go):
        """
        Demo of H2O's Generalized Linear Estimator.

        This demo uploads a dataset to h2o, parses it, and shows a description.
        Then it divides the dataset into training and test sets, builds a GLM
        from the training set, and makes predictions for the test set.
        Finally, default performance metrics are displayed.

        :param go: zero-argument callable invoked before every step of the demo
            (presumably it paces or echoes the step -- confirm against the caller).
        """
        go()
        # Connect to H2O
        h2o.init()

        go()
        # Upload the prostate dataset that comes included in the h2o python package
        prostate = h2o.upload_file(data_file("h2o_data/prostate.csv"))

        go()
        # Print a description of the prostate data
        prostate.summary()

        go()
        # Randomly split the dataset into ~70/30, training/test sets
        r = prostate[0].runif()
        train = prostate[r < 0.70]
        test = prostate[r >= 0.70]

        go()
        # Convert the response columns to factors (for binary classification problems)
        train["CAPSULE"] = train["CAPSULE"].asfactor()
        test["CAPSULE"] = test["CAPSULE"].asfactor()

        go()
        # Build a (classification) GLM
        from h2o.estimators import H2OGeneralizedLinearEstimator
        prostate_glm = H2OGeneralizedLinearEstimator(family="binomial",
                                                     alpha=[0.5])
        prostate_glm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"],
                           y="CAPSULE",
                           training_frame=train)

        go()
        # Show the model
        prostate_glm.show()

        go()
        # Predict on the test set and show the first ten predictions
        predictions = prostate_glm.predict(test)
        predictions.show()

        go()
        # Show default performance metrics
        performance = prostate_glm.model_performance(test)
        performance.show()

示例#11

0

显示文件

def test_GLM_throws_ArrayOutOfBoundException():
    """
    Train a cross-validated binomial GLM with lambda search over several alphas.

    The model is trained on a 5% split of the christine dataset with column 0 as the
    response, and must end up with exactly ``nFold`` cross-validation models.
    """
    nFold = 5
    fr = h2o.import_file(
        pyunit_utils.locate("bigdata/laptop/jira/christine.arff"))
    splitFrame = fr.split_frame(ratios=[0.05])
    glm = H2OGeneralizedLinearEstimator(family='binomial',
                                        nfolds=nFold,
                                        lambda_search=True,
                                        alpha=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
    glm.train(y=0, training_frame=splitFrame[0])

    cv_model_count = len(glm._model_json["output"]['cross_validation_models'])
    # The message reports the expected count first and the actual count second
    assert cv_model_count == nFold, \
        "expected number of cross_validation_model: {0}.  Actual number of cross_validation: " \
        "{1}".format(nFold, cv_model_count)

示例#12

0

显示文件

def _get_glm_lambda(glm):
    """
    Get the best GLM lambda by choosing the one with diminishing returns on explained deviance.

    Regularization paths with fewer than 5 deviance values use their last lambda.

    :param glm: a GLM whose regularization path is read with ``getGLMRegularizationPath``.
    :returns: the entry of the path's ``lambdas`` at the selected index.
    """
    path = H2OGeneralizedLinearEstimator.getGLMRegularizationPath(glm)
    deviance = path.get('explained_deviance_train')

    if len(deviance) >= 5:
        # Positions where the sign of the second difference of the deviance changes
        sign_changes = np.diff(np.sign(np.diff(deviance, 2)))
        candidates = [pos * 3 for pos, change in enumerate(sign_changes) if change != 0 and pos > 0]
        lambda_index = candidates[0]
    else:
        lambda_index = len(deviance) - 1

    return path.get('lambdas')[lambda_index]

示例#13

0

显示文件

文件： pyunit_pubdev_5265.py 项目： Kendralabs/h2o-4

def pubdev_5265():
    """
    Check multinomial GLM prediction on a test frame with an unseen categorical level.

    The test frame contains the level ``4`` in ``explanatory``, which does not occur in the
    training frame. Prediction must succeed, warn about the unseen level, and yield the
    expected number of predictions per class.
    """
    training_data = {
        'response': [
            'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B', 'C', 'C',
            'C', 'C', 'C', 'C', 'C'
        ],
        'explanatory':
        ['nan', 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3]
    }

    # Same as the training data except that the last explanatory value is the unseen level 4
    test_data = {
        'response': [
            'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B', 'C', 'C',
            'C', 'C', 'C', 'C', 'C'
        ],
        'explanatory':
        ['nan', 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4]
    }

    training_data = h2o.H2OFrame(training_data)
    training_data['explanatory'] = training_data['explanatory'].asfactor()

    test_data = h2o.H2OFrame(test_data)
    test_data['explanatory'] = test_data['explanatory'].asfactor()

    # NOTE(review): ``Lambda`` (capitalised) is kept as in the original; other snippets use ``lambda_``
    glm_estimator = H2OGeneralizedLinearEstimator(
        family="multinomial",
        missing_values_handling="MeanImputation",
        seed=1234,
        Lambda=0)

    glm_estimator.train(x=["explanatory"],
                        y="response",
                        training_frame=training_data)

    # Prediction should not fail when the test frame has a categorical level that was not seen in training
    expected_warning = "Test/Validation dataset column 'explanatory' has levels not trained on: [4]"
    with warnings.catch_warnings(record=True) as w:
        grouped_occurances = glm_estimator.predict(test_data=test_data).group_by((0)).count().get_frame() \
            .as_data_frame()
        # Look through every recorded warning: other warnings raised inside this block
        # (e.g. by the pandas conversion) may come after the one we are looking for.
        assert any(expected_warning in str(caught.message) for caught in w), \
            "missing warning about levels not trained on"

    # The very first value corresponding to 'A' in the explanatory variable column should be replaced by the mode value, which is 3.
    # As a result, 8 occurances of type C should be predicted
    # (`.values` replaces `DataFrame.as_matrix()`, which was removed from pandas)
    assert grouped_occurances.values.tolist() == [['A', 4], ['B', 6], ['C', 8]]

示例#14

0

显示文件

文件： train_prostate_model.py 项目： cmftall/h2o_examples

def main():
    """
    Train a binomial GLM on the prostate dataset, save it and export the test split as JSON.

    The model and its test-set predictions and metrics are shown before and after the model
    is saved under ``./h2o_model``; the test split is then written to ``data.json``.
    """
    h2o.init()

    prostate = h2o.load_dataset("prostate")
    prostate.describe()

    # 70/30 split; the response must be categorical for a binomial model
    train, test = prostate.split_frame(ratios=[0.70])
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()

    def show_test_results(fitted):
        # Show the predictions first, then the performance metrics, both on the test split
        fitted.predict(test).show()
        fitted.model_performance(test).show()

    # Train model
    from h2o.estimators import H2OGeneralizedLinearEstimator
    prostate_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=[0.5])
    feature_names = ["AGE", "RACE", "PSA", "VOL", "GLEASON"]
    prostate_glm.train(x=feature_names, y="CAPSULE", training_frame=train)
    prostate_glm.show()
    show_test_results(prostate_glm)

    # Export model
    saved_to = h2o.save_model(prostate_glm, path="./h2o_model", force=True)
    print(saved_to)

    # Score once more with the same in-memory model
    show_test_results(prostate_glm)

    # Export test data; the JSON text produced by pandas is itself passed through json.dump
    test_rows = test.as_data_frame()
    with open("data.json", "w") as out_file:
        json.dump(test_rows.to_json(orient='index'), out_file)

示例#15

0

显示文件

def test_GLM_throws_ArrayOutOfBoundException():
    """
    Train cross-validated GLMs with lambda search on the prostate data.

    After each training run the model must have exactly ``nFold`` cross-validation models.
    """
    # everything in this test is important to cause the exception:
    # - GLEASON as a categorical
    # - lambda search enabled
    # - alphas
    # - CV enabled
    df = h2o.import_file(
        path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    target = "CAPSULE"
    nFold = 5
    # NOTE(review): a model is trained after each column conversion, so the first run still
    # has GLEASON as a non-categorical column -- confirm that training inside the loop is intended.
    for col in [target, 'GLEASON']:
        df[col] = df[col].asfactor()
        glm = H2OGeneralizedLinearEstimator(
            lambda_search=True,
            alpha=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
            nfolds=nFold,
            seed=12345)
        glm.train(y=target, training_frame=df)

        cv_model_count = len(glm._model_json["output"]['cross_validation_models'])
        # The message reports the expected count first and the actual count second
        assert cv_model_count == nFold, \
            "expected number of cross_validation_model: {0}.  Actual number of cross_validation: " \
            "{1}".format(nFold, cv_model_count)

示例#16

0

显示文件

文件： rulefit.py 项目： Jay4869/Data-Science

def _get_glm_lambda(glm, num_rules):
    """
    Get the best GLM lambda, either by rule count or by diminishing returns on explained deviance.

    :param glm: a GLM whose regularization path is read with ``getGLMRegularizationPath``.
    :param num_rules: The number of rules to use in rulefit model. When given, the first lambda
        whose number of non-zero, non-intercept coefficients exceeds it is chosen; when None,
        the choice is based on the explained deviance.
    :returns: the entry of the path's ``lambdas`` at the selected index.
    """
    path = H2OGeneralizedLinearEstimator.getGLMRegularizationPath(glm)
    deviance = path.get('explained_deviance_train')

    # Number of active (non-zero, non-intercept) coefficients for every lambda on the path
    rule_count = []
    for coefficients in path.get('coefficients'):
        active = [name for name, value in coefficients.items() if abs(value) > 0 and name != "Intercept"]
        rule_count.append(len(active))

    if num_rules is not None:
        candidates = [pos for pos, count in enumerate(rule_count) if count > num_rules]
    else:
        # Positions where the sign of the second difference of the deviance changes
        sign_changes = np.diff(np.sign(np.diff(deviance, 2)))
        candidates = [pos * 3 for pos, change in enumerate(sign_changes) if change != 0 and pos > 0]

    lambda_index = candidates[0]
    return path.get('lambdas')[lambda_index]

示例#17

0

显示文件

文件： ML ALgorithms Chooser.py 项目： coolsubbu/Data-Science-Algorithms

 def construct_model(self):
     """
     Build the estimator selected by ``self.model_type`` and ``self.index``.

     ``self.model_type == 'C'`` selects the classifier table; any other value selects the
     regressor table. ``self.index`` (1-8) picks the estimator within the table and
     ``self.parameters`` is passed to its constructor as keyword arguments.

     :returns: the newly constructed estimator.
     :raises ValueError: if ``self.index`` is not one of the supported values.
     """
     if self.model_type == 'C':
         if self.index == 1:
             return H2OGeneralizedLinearEstimator(**self.parameters)
         elif self.index == 2:
             return DecisionTreeClassifier(**self.parameters)
         elif self.index == 3:
             return GaussianNB(**self.parameters)
         elif self.index == 4:
             return SVC(**self.parameters)
         elif self.index == 5:
             return RandomForestClassifier(**self.parameters)
         elif self.index == 6:
             # was ``self.paraemters``, which raised AttributeError
             return GradientBoostingClassifier(**self.parameters)
         elif self.index == 7:
             return ExtraTreesClassifier(**self.parameters)
         elif self.index == 8:
             return SGDClassifier(**self.parameters)
     else:
         if self.index == 1:
             return LinearRegression(**self.parameters)
         elif self.index == 2:
             # NOTE(review): a classifier in the regressor table looks unintended
             # (DecisionTreeRegressor?) -- kept as-is because the file's imports are not visible here.
             return DecisionTreeClassifier(**self.parameters)
         elif self.index == 3:
             return BayesianRidge(**self.parameters)
         elif self.index == 4:
             return SVR(**self.parameters)
         elif self.index == 5:
             return RandomForestRegressor(**self.parameters)
         elif self.index == 6:
             return GradientBoostingRegressor(**self.parameters)
         elif self.index == 7:
             return ExtraTreesRegressor(**self.parameters)
         elif self.index == 8:
             return SGDRegressor(**self.parameters)
     # Previously an unknown index fell through to an UnboundLocalError on ``return p_model``
     raise ValueError("unsupported model index {!r} for model_type {!r}".format(
         self.index, self.model_type))

示例#18

0

显示文件

文件： rulefit.py 项目： Jay4869/Data-Science

    def train(self,
              x=None,
              y=None,
              training_frame=None,
              offset_column=None,
              fold_column=None,
              weights_column=None,
              validation_frame=None,
              **params):
        """
        Train the rulefit model.

        One random forest is trained per depth in ``min_depth..max_depth``; the leaf-node
        assignments of all forests become the features of an L1-regularized GLM. A first GLM
        with lambda search picks the lambda, a second GLM is trained with that lambda, and its
        intercept and rules are stored on the instance.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :param offset_column: accepted but not used by this implementation.
        :param fold_column: accepted but not used by this implementation.
        :param weights_column: accepted but not used by this implementation.
        :param validation_frame: accepted but not used by this implementation.
        :param params: accepted but not used by this implementation.
        :examples:
        >>> rulefit = H2ORuleFit()
        >>> training_data = h2o.import_file("smalldata/gbm_test/titanic.csv", 
        ...                                  col_types = {'pclass': "enum", 'survived': "enum"})
        >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
        >>> rulefit.train(x=x,y="survived",training_frame=training_data)
        >>> rulefit
        """
        # The GLM family follows the response type: categorical -> (bi|multi)nomial, else gaussian
        family = "gaussian"
        if training_frame.type(y) == "enum":
            if training_frame[y].unique().nrow > 2:
                family = "multinomial"
            else:
                family = "binomial"

        # Get paths from random forest models
        paths_frame = training_frame[y]
        depths = range(self.min_depth, self.max_depth + 1)
        rf_models = {}
        for model_idx, depth in enumerate(depths):

            # Train random forest models
            rf_model = H2ORandomForestEstimator(seed=self.seed,
                                                model_id="rf_{}.hex".format(
                                                    str(model_idx)),
                                                max_depth=depth)
            rf_model.train(y=y, x=x, training_frame=training_frame)
            rf_models[model_idx] = rf_model

            paths = rf_model.predict_leaf_node_assignment(training_frame)
            # The loop variable must not be called ``x``: on Python 2 a list-comprehension
            # variable leaks and would overwrite the ``x`` parameter used for the next forest.
            paths.col_names = [
                "rf_{0}.{1}".format(str(model_idx), col_name)
                for col_name in paths.col_names
            ]
            paths_frame = paths_frame.cbind(paths)

        # Extract important paths
        glm = H2OGeneralizedLinearEstimator(model_id="glm.hex",
                                            nfolds=self.nfolds,
                                            seed=self.seed,
                                            family=family,
                                            alpha=1,
                                            remove_collinear_columns=True,
                                            lambda_search=True)
        glm.train(y=y, training_frame=paths_frame)

        lambda_ = _get_glm_lambda(glm, self.num_rules)

        # Train GLM with chosen lambda (same model_id as the lambda-search GLM above)
        glm = H2OGeneralizedLinearEstimator(model_id="glm.hex",
                                            seed=self.seed,
                                            family=family,
                                            alpha=1,
                                            remove_collinear_columns=True,
                                            lambda_=lambda_,
                                            solver="COORDINATE_DESCENT")
        glm.train(y=y, training_frame=paths_frame)

        # Get Intercept
        intercept = _get_intercept(glm)

        # Get Rules
        rule_importance = _get_rules(glm, rf_models)

        self.intercept = intercept
        self.rule_importance = rule_importance
        self.glm = glm
        self.rf_models = rf_models

示例#19

0

显示文件

文件： MultipleModelPrediction1.py 项目： simonway/PythonML

testing_frame = ProcessData.testData(moving_average=True, standard_deviation=True, probability_from_file=True)

# Create H2O frames and give them the column names of the source frames
train = h2o.H2OFrame(training_frame)
test = h2o.H2OFrame(testing_frame)
train.set_names(list(training_frame.columns))
test.set_names(list(testing_frame.columns))

# Feature selection: every column except the response, "UnitNumber" and "Time"
training_columns = list(training_frame.columns)
training_columns.remove(response_column)
training_columns.remove("UnitNumber")
training_columns.remove("Time")

# Build model
model4 = H2OGeneralizedLinearEstimator()

# Train model
model4.train(x=training_columns, y=response_column, training_frame=train)

# End : Generalized Linear Modeling
# ----------------------------------------------------------------------------------------------------------------------

# Prediction
# ----------------------------------------------------------------------------------------------------------------------
# Parenthesised single-argument print is valid on both Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print("Begin Prdiction")
print("---------------")

# ground truth
tY = np.array(testing_frame['RUL'])

示例#20

0

显示文件

 def bake(self) -> H2OGeneralizedLinearEstimator:
     """Train a default GLM on the frame returned by ``missing_frame()`` and return it."""
     glm = H2OGeneralizedLinearEstimator()
     # No response or predictors are named; only the training frame is passed
     glm.train(training_frame=missing_frame())
     return glm

示例#21

0

显示文件

文件： eyestate_glm_recipe.py 项目： h2oai/mojoland

 def bake(self) -> H2OGeneralizedLinearEstimator:
     """Train a default GLM that predicts ``eyeDetection`` on the eyestate frame and return it."""
     eyestate = eyestate_frame()
     glm = H2OGeneralizedLinearEstimator()
     glm.train(training_frame=eyestate, y="eyeDetection")
     return glm

示例#22

0

显示文件

    def train(self, x=None, y=None, training_frame=None):
        """
        Train the rulefit model.

        One tree model is trained per rule length in ``min_rule_len..max_rule_len``; the
        leaf-node assignments of all tree models become the features of an L1-regularized GLM.
        With ``max_num_rules`` set, a single GLM capped at that many active predictors is
        trained; otherwise a lambda-search GLM picks the lambda for a final GLM. The GLM's
        intercept and rules are stored on the instance.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :raises H2OValueError: if the response is categorical with more than two levels.
        :examples:
        >>> rulefit = H2ORuleFit()
        >>> training_data = h2o.import_file("smalldata/gbm_test/titanic.csv", 
        ...                                  col_types = {'pclass': "enum", 'survived': "enum"})
        >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
        >>> rulefit.train(x=x,y="survived",training_frame=training_data)
        >>> rulefit
        """
        # Work on a copy so that repeated train() calls see the same user settings, and take
        # "family" out of it so it is never passed twice to the GLM constructor.
        glm_params = dict(self.glm_params)
        requested_family = glm_params.pop("family", None)

        if training_frame.type(y) == "enum":
            if training_frame[y].unique().nrow > 2:
                raise H2OValueError("multinomial use cases not yet supported")
            # A categorical response with at most two levels is always modelled as binomial
            family = "binomial"
        elif requested_family is not None:
            family = requested_family
        else:
            family = "gaussian"

        # Get paths from tree models
        paths_frame = training_frame[y]
        depths = range(self.min_rule_len, self.max_rule_len + 1)
        tree_models = {}
        for model_idx, depth in enumerate(depths):

            # Train tree models
            tree_model = _tree_model(self.algorithm, depth, self.seed,
                                     model_idx, self.tree_params)
            tree_model.train(y=y, x=x, training_frame=training_frame)
            tree_models[model_idx] = tree_model

            paths = tree_model.predict_leaf_node_assignment(training_frame)
            # The loop variable must not be called ``x``: on Python 2 a list-comprehension
            # variable leaks and would overwrite the ``x`` parameter used for the next model.
            paths.col_names = [
                "tree_{0}.{1}".format(str(model_idx), col_name)
                for col_name in paths.col_names
            ]
            paths_frame = paths_frame.cbind(paths)

        if self.max_num_rules:
            # Train a single GLM whose number of active predictors is capped
            glm = H2OGeneralizedLinearEstimator(
                model_id="glm.hex",
                seed=self.seed,
                family=family,
                alpha=1,
                max_active_predictors=self.max_num_rules + 1,
                **glm_params)
            glm.train(y=y, training_frame=paths_frame)

        else:
            # Get optimal lambda
            glm = H2OGeneralizedLinearEstimator(model_id="glm.hex",
                                                nfolds=self.nfolds,
                                                seed=self.seed,
                                                family=family,
                                                alpha=1,
                                                lambda_search=True,
                                                **glm_params)
            glm.train(y=y, training_frame=paths_frame)

            lambda_ = _get_glm_lambda(glm)

            # Train GLM with chosen lambda
            glm = H2OGeneralizedLinearEstimator(model_id="glm.hex",
                                                seed=self.seed,
                                                family=family,
                                                alpha=1,
                                                lambda_=lambda_,
                                                solver="COORDINATE_DESCENT",
                                                **glm_params)
            glm.train(y=y, training_frame=paths_frame)

        # Get Intercept
        intercept = _get_intercept(glm)

        # Get Rules
        rule_importance = _get_rules(glm, tree_models, self.algorithm)

        self.intercept = intercept
        self.rule_importance = rule_importance
        self.glm = glm
        self.tree_models = tree_models

示例#23

0

显示文件

文件： pyunit_pojo_import.py 项目： h2oai/h2o-3

def generate_and_import_combined_pojo():
    """Build a combined GLM/GBM POJO, import it back and verify its predictions.

    A default GLM and a 5-tree GBM are trained on the weather data. Both are
    passed to ``generate_combined_pojo`` and the resulting artifact is loaded
    with ``h2o.import_mojo``. The imported model is then scored with three
    settings of the ``Bias`` column and compared against the source models:

    * ``Bias == 1``  -> predictions must equal the GLM predictions,
    * ``Bias == 0``  -> predictions must equal the GBM predictions,
    * ``Bias is NaN`` -> the expectation depends on the ``ChangeWindDirect``
      level of each row segment (see the loop at the end).

    Under Python 2 the function only prints a notice and returns.
    """
    if sys.version_info.major < 3:  # Python 2
        print("This example needs Python 3.x+")
        return

    source_frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/junit/weather.csv"))
    # Same frame object for now; the name is rebound to the result of drop() below.
    scoring_frame = source_frame

    excluded = {"Date", "RainTomorrow", "Sunshine"}
    features = sorted(set(scoring_frame.names) - excluded)
    response = "RISK_MM"

    glm_model = H2OGeneralizedLinearEstimator()
    glm_model.train(x=features, y=response, training_frame=scoring_frame)
    glm_preds = glm_model.predict(scoring_frame)

    gbm_model = H2OGradientBoostingEstimator(ntrees=5)
    gbm_model.train(x=features, y=response, training_frame=scoring_frame)
    gbm_preds = gbm_model.predict(scoring_frame)

    # These columns are removed here because the POJO recreates them itself
    # (they are expected to come out exactly the same).
    for manual_column in ("ChangeTemp", "ChangeTempDir"):
        scoring_frame = scoring_frame.drop(manual_column)

    combined_pojo_path = generate_combined_pojo(glm_model, gbm_model)
    print("Combined POJO was stored in: " + combined_pojo_path)

    # FIXME: https://h2oai.atlassian.net/browse/PUBDEV-8561 We need to make this work for upload_mojo as well
    pojo_model = h2o.import_mojo(combined_pojo_path)

    # Sanity checks: Bias=1 must behave like the GLM, Bias=0 like the GBM.
    for bias, reference_preds in ((1, glm_preds), (0, gbm_preds)):
        scoring_frame["Bias"] = bias
        pojo_preds = pojo_model.predict(scoring_frame)
        assert_frame_equal(pojo_preds.as_data_frame(),
                           reference_preds.as_data_frame())

    # Per-segment behavior: with Bias missing, the expected output is chosen
    # by the ChangeWindDirect level of the segment.
    scoring_frame["Bias"] = float("NaN")
    for level in scoring_frame["ChangeWindDirect"].levels()[0]:
        segment = scoring_frame[scoring_frame["ChangeWindDirect"] == level]
        segment_orig = source_frame[source_frame["ChangeWindDirect"] == level]
        segment_preds = pojo_model.predict(segment)

        if level in ("c", "l"):
            # Twice the GLM prediction.
            expected = glm_model.predict(segment_orig) * 2
        elif level == "n":
            # Average of the GLM and GBM predictions.
            expected = (glm_model.predict(segment_orig) +
                        gbm_model.predict(segment_orig)) / 2
        elif level == "s":
            # Plain GBM prediction.
            expected = gbm_model.predict(segment_orig)
        else:
            # Any other level is scored but not compared against anything.
            continue

        assert_frame_equal(segment_preds.as_data_frame(),
                           expected.as_data_frame())

示例#24

0

显示文件

文件： titanic_glm_recipe.py 项目： h2oai/mojoland

 def bake(self) -> H2OGeneralizedLinearEstimator:
     """Train and return a default GLM on the Titanic frame with ``parch`` as response.

     The ``parch`` column is converted to a factor before training, and the
     ``name``, ``ticket``, ``boat`` and ``home.dest`` columns are ignored.
     """
     frame = titanic_frame()
     # Turn the response into a categorical column before fitting.
     frame["parch"] = frame["parch"].asfactor()
     excluded_columns = ["name", "ticket", "boat", "home.dest"]
     glm = H2OGeneralizedLinearEstimator()
     glm.train(training_frame=frame, y="parch", ignored_columns=excluded_columns)
     return glm

示例#25

0

显示文件

文件： iris_glm_recipe.py 项目： h2oai/mojoland

 def bake(self) -> H2OGeneralizedLinearEstimator:
     """Train and return a default GLM on the iris frame with ``Species`` as response."""
     iris = iris_frame()
     glm = H2OGeneralizedLinearEstimator()
     glm.train(training_frame=iris, y="Species")
     return glm

示例#26

0

显示文件

文件： cars_glm_recipe.py 项目： h2oai/mojoland

 def bake(self) -> H2OGeneralizedLinearEstimator:
     """Train and return a default GLM on the cars frame with ``mpg`` as response.

     The ``name`` column is excluded from the predictors.
     """
     cars = cars_frame()
     excluded_columns = ["name"]
     glm = H2OGeneralizedLinearEstimator()
     glm.train(training_frame=cars, y="mpg", ignored_columns=excluded_columns)
     return glm

示例#27

0

显示文件

# Response column comes from the dataset definition; 'city' is not used as a predictor.
response_column = Dataset.RESPONSE_COLUMN
input_columns.remove('city')

# Start the H2O server
h2o.init()

# Build the H2O frame from pd_train and give it the same column names
# (pd_train is presumably a pandas DataFrame -- confirm against its definition)
training_frame = h2o.H2OFrame(pd_train)
training_frame.set_names(list(pd_train.columns))

# Per-iteration error measurements
mae = []   # mean absolute error reported by each trained model
rmse = []  # root mean squared error reported by each trained model

# Train a fresh 10-fold GLM on every iteration and record its errors
for i in range(n_iterations):
    model = H2OGeneralizedLinearEstimator(nfolds=10)
    model.train(training_frame=training_frame,
                x=input_columns,
                y=response_column)

    mae.append(model.mae())
    rmse.append(model.rmse())

# Report the mean and standard deviation of both error series
print("Model : Single")
print("--------------")
for label, statistic, errors in (
        ("Average MAE       : ", numpy.average, mae),
        ("Average RMSE      : ", numpy.average, rmse),
        ("MAE Standard Dev  : ", numpy.std, mae),
        ("RMSE Standard Dev : ", numpy.std, rmse),
):
    print(label + str(statistic(errors)))

示例#28

0

显示文件

文件： rulefit.py 项目： Fahad021/h2o-tutorials

    def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None, weights_column=None,
              validation_frame=None, **params):
        """
        Train the rulfit model.
        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :examples:
        >>> rulefit = H2ORuleFit()
        >>> training_data = h2o.import_file("smalldata/gbm_test/titanic.csv", 
        ...                                  col_types = {'pclass': "enum", 'survived': "enum"})
        >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
        >>> rulefit.train(x=x,y="survived",training_frame=training_data)
        >>> rulefit
        """
        family = "gaussian"
        if (training_frame.type(y) == "enum"):
            if training_frame[y].unique().nrow > 2:
                raise H2OValueError("Multinomial not supported")
            else:
                family = "binomial"


        # Get paths from random forest models
        paths_frame = training_frame[y]
        depths = range(self.min_depth, self.max_depth + 1)
        rf_models = []
        for model_idx in range(len(depths)):

            # Train random forest models
            rf_model = H2ORandomForestEstimator(seed = self.seed, 
                                                model_id = "rf.hex", 
                                                max_depth = depths[model_idx])
            rf_model.train(y = y, x = x, training_frame = training_frame)
            rf_models = rf_models + [rf_model]

            paths = rf_model.predict_leaf_node_assignment(training_frame)
            paths.col_names = ["rf_" + str(model_idx) +"."+ x for x in paths.col_names]
            paths_frame = paths_frame.cbind(paths)

        # Extract important paths
        glm = H2OGeneralizedLinearEstimator(model_id = "glm.hex", 
                                            nfolds = self.nfolds, 
                                            seed = self.seed,
                                            family = family,
                                            alpha = 1, 
                                            remove_collinear_columns=True,
                                            lambda_search = True)
        glm.train(y = y, training_frame=paths_frame)

        intercept, rule_importance = _get_glm_coeffs(glm)
        rule_importance = pd.DataFrame.from_dict(rule_importance, orient = "index").reset_index()
        rule_importance.columns = ["variable", "coefficient"]

        # Convert paths to rules
        rules = []
        for i in rule_importance.variable:
            if family == "binomial":
                model_num, tree_num, path = i.replace("rf_", "").replace("T", "").replace("C1.", "").split(".")
            else:
                model_num, tree_num, path = i.replace("rf_", "").replace("T", "").split(".")
            tree = H2OTree(rf_models[int(model_num)], int(tree_num)-1)
            rules = rules + [_tree_traverser(tree.root_node, path)]

        # Add rules and order by absolute coefficient
        rule_importance["rule"] = rules
        rule_importance["abs_coefficient"] = rule_importance["coefficient"].abs()
        rule_importance = rule_importance.loc[rule_importance.groupby(["rule"])["abs_coefficient"].idxmax()]  
        rule_importance = rule_importance.sort_values(by = "abs_coefficient", ascending = False)
        rule_importance = rule_importance.drop("abs_coefficient", axis = 1)
        
        self.intercept = intercept
        self.rule_importance = rule_importance