示例#1
0
def create_fold_column_if_not_exist(h2o_base_table: h2o.H2OFrame,
                                    fold_column: str,
                                    nfolds: int = None) -> h2o.H2OFrame:
    """Return the frame with a k-fold assignment column, adding one if absent.

    Args:
        h2o_base_table: Frame to inspect and, if needed, extend.
        fold_column: Name of the fold column; a falsy value disables the step.
        nfolds: Number of folds, forwarded to ``kfold_column`` as ``n_folds``.

    Returns:
        The input frame itself when ``fold_column`` is falsy or already one of
        its columns, otherwise the result of binding the new fold column on.
    """
    if not fold_column or fold_column in h2o_base_table.col_names:
        return h2o_base_table

    fold_assignments = h2o_base_table.kfold_column(n_folds=nfolds)
    fold_assignments.set_names([fold_column])
    return h2o_base_table.cbind(fold_assignments)
def _train_test_split_as_frames(x, y, is_str=False, is_classifier=False):
    """Split ``x``/``y`` 70/30 and wrap the training part in a single H2OFrame.

    Args:
        x: Feature array.
        y: Target array; cast to ``str`` when ``is_str`` is set, else to int64.
        is_str: Whether the target should be treated as strings.
        is_classifier: When true, the last column of the training frame
            (the target) is converted to a factor.

    Returns:
        Tuple ``(f_train, x_test)``: the training features and target bound
        column-wise into one H2OFrame, and the held-out features as float32.
    """
    # ``np.str`` was only an alias of the builtin ``str`` and was removed in
    # NumPy 1.24, so use the builtin directly.
    y = y.astype(str) if is_str else y.astype(np.int64)
    x_train, x_test, y_train, _ = train_test_split(x, y, test_size=0.3, random_state=42)
    f_train = H2OFrame(x_train).cbind(H2OFrame(y_train))
    if is_classifier:
        response_idx = f_train.ncol - 1
        f_train[response_idx] = f_train[response_idx].asfactor()
    return f_train, x_test.astype(np.float32)
def bernoulli_synthetic_data_gbm_medium():
    """Compare the test AUC of an H2O GBM with scikit-learn on synthetic data.

    Draws standard-normal features, labels each row +1 when its sum of squares
    exceeds the chi-squared(10) median (else -1), fits a scikit-learn
    ``GradientBoostingClassifier`` and an H2O GBM with matching settings, and
    asserts the two test AUCs differ by less than 1e-2.
    """
    # Median of the chi-squared distribution with 10 degrees of freedom. It is
    # a constant, so compute it once instead of once per generated row.
    chisq_median = scipy.stats.chi2.ppf(0.5, 10)

    # Generate training dataset (adaptation of http://www.stat.missouri.edu/~speckman/stat461/boost.R)
    train_rows = 10000
    train_cols = 10

    #  Generate variables V1, ... V10
    X_train = np.random.randn(train_rows, train_cols)

    #  y = +1 if sum_i x_{ij}^2 > chisq median on 10 df
    y_train = np.where(np.square(X_train).sum(axis=1) > chisq_median, 1, -1)

    # Train scikit gbm
    # TODO: grid-search
    distribution = "bernoulli"
    ntrees = 150
    min_rows = 1
    max_depth = 2
    learn_rate = .01
    nbins = 20

    gbm_sci = ensemble.GradientBoostingClassifier(learning_rate=learn_rate, n_estimators=ntrees, max_depth=max_depth,
                                                  min_samples_leaf=min_rows, max_features=None)
    gbm_sci.fit(X_train, y_train)

    # Generate testing dataset
    test_rows = 2000
    test_cols = 10

    #  Generate variables V1, ... V10
    X_test = np.random.randn(test_rows, test_cols)

    #  y = +1 if sum_i x_{ij}^2 > chisq median on 10 df
    y_test = np.where(np.square(X_test).sum(axis=1) > chisq_median, 1, -1)

    # Score (AUC) the scikit gbm model on the test data
    auc_sci = roc_auc_score(y_test, gbm_sci.predict_proba(X_test)[:, 1])

    # Compare this result to H2O; the label goes first so it becomes column "C1"
    train_h2o = H2OFrame.fromPython(zip(*np.column_stack((y_train, X_train)).tolist()))
    test_h2o = H2OFrame.fromPython(zip(*np.column_stack((y_test, X_test)).tolist()))

    gbm_h2o = h2o.gbm(x=train_h2o[1:], y=train_h2o["C1"].asfactor(), distribution=distribution, ntrees=ntrees,
                      min_rows=min_rows, max_depth=max_depth, learn_rate=learn_rate, nbins=nbins)
    gbm_perf = gbm_h2o.model_performance(test_h2o)
    auc_h2o = gbm_perf.auc()

    assert abs(auc_h2o - auc_sci) < 1e-2, "h2o (auc) performance degradation, with respect to scikit. h2o auc: {0} " \
                                          "scickit auc: {1}".format(auc_h2o, auc_sci)
def fit_h2o(x_train, y_train, estimator):
    """Train a fresh H2O estimator with the class and parameters of ``estimator``.

    Args:
        x_train: Training features; must expose ``.columns`` (used to derive
            the H2O column types via ``get_h2o_column_types``).
        y_train: Iterable of training labels; uploaded as an ``enum`` column.
        estimator: H2O estimator used as a template; it is not trained itself.

    Returns:
        The newly created and trained estimator.
    """
    current_estimator = estimator.__class__()
    # Copy the parameter dict so later changes made through one estimator do
    # not silently show up in the other.
    current_estimator._parms = dict(estimator._parms)

    predictors = H2OFrame(x_train, column_types=get_h2o_column_types(x_train.columns))
    response = H2OFrame(list(y_train), column_types=['enum'])
    # ``response`` is always a frame here, so the former ``is not None``
    # fallback could never be taken and has been removed.
    training_frame = predictors.cbind(response)
    current_estimator.train(predictors.names, response.names[0], training_frame)
    return current_estimator
示例#5
0
def test1():
    """Check that ``asfactor`` never modifies the frame it is called on.

    Covers a frame holding a real-valued column (conversion must raise
    ``H2OValueError``), a convertible frame, and an already-factored frame.
    """
    def mixed_columns():
        return {"one": [4, 6, 1], "two": ["a", "b", "cde"], "three": [0, 5.2, 14]}

    def convertible_columns():
        return {"one": [4, 6, 1], "two": ["a", "b", "cde"]}

    # Frame with a real-valued column: conversion must fail and change nothing.
    badFrame = H2OFrame(mixed_columns())
    badClone = H2OFrame(mixed_columns())
    compare_frames(badFrame, badClone)

    try:
        badFrame.asfactor()
        assert False, "The frame contaied a real number, an error should be thrown"
    except H2OValueError:  # as designed
        pass

    compare_frames(badFrame, badClone)
    compare_frames(badFrame, H2OFrame.get_frame(badFrame.frame_id))

    # Convertible frame: the source stays as it was, the result is factored.
    goodFrame = H2OFrame(convertible_columns())
    goodClone = H2OFrame(convertible_columns())
    compare_frames(goodFrame, goodClone)

    factoredFrame = goodFrame.asfactor()
    compare_frames(goodFrame, H2OFrame.get_frame(goodFrame.frame_id))

    expectedFactoredFrame = H2OFrame(convertible_columns(),
                                     column_types={"one": "categorical", "two": "enum"})
    compare_frames(expectedFactoredFrame, factoredFrame)

    # Factoring an already-factored frame must leave it equal to itself.
    refactoredFrame = expectedFactoredFrame.asfactor()
    compare_frames(expectedFactoredFrame, H2OFrame.get_frame(refactoredFrame.frame_id))
def predict_dataframe(data, node):
    """Route the rows of ``data`` down a binary classifier tree.

    At an inner node the node's classifier predicts 0 or 1 for each row; rows
    predicted 0 go to ``node.left_node`` and rows predicted 1 to
    ``node.right_node``. At a leaf (falsy ``node.classifier``) every row is
    paired with the first element of ``node.classes``.

    Args:
        data: Rows to classify; accessed through ``.index``, ``.iloc`` and
            ``.columns``.
        node: Tree node exposing ``classifier``, ``classes``, ``left_node``
            and ``right_node``.

    Returns:
        Sorted list of ``(index_label, class)`` tuples.
    """
    if not node.classifier:
        leaf_class = list(node.classes)[0]
        return [(i, leaf_class) for i in list(data.index)]

    if isinstance(node.classifier, h2o.estimators.H2OEstimator):
        # Previously ``data_h2o`` was only bound when a conversion happened,
        # so an input that already was an H2OFrame raised NameError.
        if isinstance(data, H2OFrame):
            data_h2o = data
        else:
            data_h2o = H2OFrame(data, column_types=get_h2o_column_types(data.columns))
        # Download the prediction column once instead of twice.
        predicted = node.classifier.predict(data_h2o)['predict'].as_data_frame().values
        # np.concatenate rejects an empty sequence, hence the explicit guard.
        prediction = np.concatenate(predicted) if len(predicted) else np.array([])
    else:
        prediction = node.classifier.predict(data)

    right_rows = [i for i, label in enumerate(prediction) if label == 1]
    left_rows = [i for i, label in enumerate(prediction) if label == 0]
    prediction_left = predict_dataframe(data.iloc[left_rows], node.left_node)
    prediction_right = predict_dataframe(data.iloc[right_rows], node.right_node)
    return sorted(prediction_left + prediction_right)
示例#7
0
 def retrieve_h2o_base_table_predictors(self, h2o_base_table: h2o.H2OFrame):
     """Return the column names left after dropping the non-predictor columns.

     Dropped columns are ``'row_id'``, the configured label and origin
     columns, and every configured categorical variable.
     """
     data_config = self.auto_ml_config[AutoMLConfig.DATA]
     non_predictors = [
         'row_id',
         data_config[AutoMLConfig.LABEL_COL],
         data_config[AutoMLConfig.ORIGIN_COL],
     ]
     non_predictors = non_predictors + data_config[AutoMLConfig.CATEGORICAL_VARIABLES]
     remaining = h2o_base_table.drop(non_predictors)
     return remaining.col_names
示例#8
0
def build_auto_h2o(regressor, name):
    """Fit an H2O regressor in a PMML pipeline on the auto data and store artifacts.

    Stores the fitted estimator as ``name + ".zip"`` (MOJO), the pipeline as
    ``name + ".pkl"`` and the predictions, renamed to ``mpg``, as
    ``name + ".csv"``.

    Args:
        regressor: Final estimator of the pipeline.
        name: Base name used for the stored files.
    """
    categorical_columns = ["cylinders", "model_year", "origin"]
    continuous_columns = ["displacement", "horsepower", "weight", "acceleration"]

    domain_steps = [(column, CategoricalDomain(), [column]) for column in categorical_columns]
    domain_steps += [(column, ContinuousDomain(), [column]) for column in continuous_columns]
    transformer = ColumnTransformer(domain_steps)

    # Column names and types are listed in the order the transformer emits them.
    uploader = H2OFrameCreator(
        column_names=categorical_columns + continuous_columns,
        column_types=["enum"] * len(categorical_columns) + ["numeric"] * len(continuous_columns))

    pipeline = PMMLPipeline([
        ("transformer", transformer),
        ("uploader", uploader),
        ("regressor", regressor),
    ])
    pipeline.fit(auto_X, H2OFrame(auto_y.to_frame()))
    pipeline.verify(auto_X.sample(frac=0.05, random_state=13))

    regressor = pipeline._final_estimator
    store_mojo(regressor, name + ".zip")
    store_pkl(pipeline, name + ".pkl")

    mpg = pipeline.predict(auto_X)
    mpg.set_names(["mpg"])
    store_csv(mpg.as_data_frame(), name + ".csv")
示例#9
0
def get_expected_output_frame(out_doc_ids, out_tokens, out_TFs, out_IDFs,
                              out_TFIDFs):
    """Build the expected TF-IDF result frame from per-column value lists.

    Columns, in order: ``DocID``, ``Token``, ``TF``, ``IDF``, ``TF_IDF``;
    ``Token`` is a string column and the others are numeric.
    """
    columns = OrderedDict()
    columns['DocID'] = out_doc_ids
    columns['Token'] = out_tokens
    columns['TF'] = out_TFs
    columns['IDF'] = out_IDFs
    columns['TF_IDF'] = out_TFIDFs
    expected_types = ['numeric', 'string', 'numeric', 'numeric', 'numeric']
    return H2OFrame(columns, column_types=expected_types)
    def optimum_threshold(self, hf: h2o.H2OFrame, model: H2OGenericEstimator) -> float:
        """Select the decision threshold with the lowest cost for ``model``.

        Thresholds 0.01, 0.02, ..., 0.99 are evaluated; for each one the
        predictions are reconciled against the ``fraud`` column and costed
        with ``self.inverse_costs``.

        Args:
            hf: Evaluation data; must contain the ground-truth column ``fraud``.
            model: Model whose ``p1`` prediction column is thresholded.

        Returns:
            The first threshold that produced the minimum cost. It is also
            printed.
        """
        # Extract the probability of the positive class from the predictions
        df = hf.as_data_frame()
        df['model_score'] = model.predict(test_data=hf).as_data_frame()['p1']

        model_key = str(model.model_id)
        matrix = {model_key: {'x': [], 'y': []}}
        curve = matrix[model_key]

        # Evaluate the cost function at every 1/100 step strictly between 0 and 1
        for step in range(1, 100):
            threshold = step / 100
            df['prediction'] = predict(df, threshold, 1, 'model_score')
            df = reconcile(df, 'prediction', 'fraud', f"CM_{threshold}")
            t_cost, df = outcome(df, self.inverse_costs, f"CM_{threshold}", f"costs_{threshold}")
            curve['x'].append(threshold)
            curve['y'].append(t_cost)

        # Return threshold that produced the minimum cost
        costs = curve['y']
        optimum_threshold = curve['x'][costs.index(min(costs))]
        print(f"optimum_threshold: {optimum_threshold}")
        return optimum_threshold
示例#11
0
def upload_file():
    """Demonstrate creating H2OFrames from a file and from Python containers.

    Uploads the prostate CSV found through ``pyunit_utils.locate``, then
    builds frames with ``H2OFrame.fromPython`` from lists, tuples, dicts and
    ordered dicts, calling ``describe()`` on each.
    """
    a = h2o.upload_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    # A parenthesised single-argument print behaves like the Python 2 print
    # statement and also parses under Python 3.
    print(a.describe())

    from h2o import H2OFrame

    # using lists []
    py_list_to_h2o = H2OFrame.fromPython(zip(*[[0, 1, 2, 3, 4]]))

    print(py_list_to_h2o.describe())

    py_list_to_h2o_2 = H2OFrame.fromPython(zip(*[[0, 1, 2, 3], [5, 6, "hi", "dog"]]))

    print(py_list_to_h2o_2.describe())

    # using tuples ()
    py_tuple_to_h2o = H2OFrame.fromPython(zip(*[(0, 1, 2, 3, 4)]))

    print(py_tuple_to_h2o.describe())

    py_tuple_to_h2o_2 = H2OFrame.fromPython(zip(*((0, 1, 2, 3), (5, 6, "hi", "dog"))))

    print(py_tuple_to_h2o_2.describe())

    # using dicts {}
    py_dict_to_h2o = H2OFrame.fromPython({"column1": [5, 4, 3, 2, 1],
                                          "column2": (1, 2, 3, 4, 5)})

    py_dict_to_h2o.describe()

    py_dict_to_h2o_2 = H2OFrame.fromPython({"colA": ["bilbo", "baggins"], "colB": ["meow"]})

    print(py_dict_to_h2o_2.describe())

    # using collections.OrderedDict

    import collections
    d = {"colA": ["bilbo", "baggins"], "colB": ["meow"]}  # still unordered!
    py_ordered_dict_to_h2o = H2OFrame.fromPython(collections.OrderedDict(d))

    py_ordered_dict_to_h2o.describe()

    # make an ordered dictionary!
    d2 = collections.OrderedDict()
    d2["colA"] = ["bilbo", "baggins"]
    d2["colB"] = ["meow"]

    py_ordered_dict_to_h2o_2 = H2OFrame.fromPython(collections.OrderedDict(d2))
    py_ordered_dict_to_h2o_2.describe()
示例#12
0
def upload_file(ip, port):
    """Demonstrate creating H2OFrames after connecting to an H2O instance.

    Connects with ``h2o.init(ip, port)``, uploads the prostate CSV from a
    relative path, then builds frames with ``H2OFrame(python_obj=...)`` from
    lists, tuples, dicts and ordered dicts, calling ``describe()`` on each.

    Args:
        ip: Address passed to ``h2o.init``.
        port: Port passed to ``h2o.init``.
    """
    h2o.init(ip, port)

    a = h2o.upload_file("../../smalldata/logreg/prostate.csv")
    # A parenthesised single-argument print behaves like the Python 2 print
    # statement and also parses under Python 3.
    print(a.describe())

    from h2o import H2OFrame

    # using lists []
    py_list_to_h2o = H2OFrame(python_obj=[0, 1, 2, 3, 4])

    print(py_list_to_h2o.describe())

    py_list_to_h2o_2 = H2OFrame(python_obj=[[0, 1, 2, 3], [5, 6, "hi", "dog"]])

    print(py_list_to_h2o_2.describe())

    # using tuples ()
    py_tuple_to_h2o = H2OFrame(python_obj=(0, 1, 2, 3, 4))

    print(py_tuple_to_h2o.describe())

    py_tuple_to_h2o_2 = H2OFrame(python_obj=((0, 1, 2, 3), (5, 6, "hi", "dog")))

    print(py_tuple_to_h2o_2.describe())

    # using dicts {}
    py_dict_to_h2o = H2OFrame(python_obj={"column1": [5, 4, 3, 2, 1],
                                          "column2": (1, 2, 3, 4, 5)})

    py_dict_to_h2o.describe()

    py_dict_to_h2o_2 = H2OFrame(python_obj={"colA": ["bilbo", "baggins"], "colB": ["meow"]})

    print(py_dict_to_h2o_2.describe())

    # using collections.OrderedDict

    import collections
    d = {"colA": ["bilbo", "baggins"], "colB": ["meow"]}  # still unordered!
    py_ordered_dict_to_h2o = H2OFrame(python_obj=collections.OrderedDict(d))

    py_ordered_dict_to_h2o.describe()

    # make an ordered dictionary!
    d2 = collections.OrderedDict()
    d2["colA"] = ["bilbo", "baggins"]
    d2["colB"] = ["meow"]

    py_ordered_dict_to_h2o_2 = H2OFrame(python_obj=collections.OrderedDict(d2))
    py_ordered_dict_to_h2o_2.describe()
def upload_file():
    """Walk through the ways of creating an H2OFrame.

    Uploads the prostate CSV found through ``pyunit_utils.locate``, then
    constructs frames directly from lists, tuples, dicts and ordered dicts,
    calling ``describe()`` on every frame.
    """
    prostate = h2o.upload_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    print(prostate.describe())

    from h2o import H2OFrame

    # Lists: one row, then two rows with mixed value types.
    single_list_frame = H2OFrame([[0, 1, 2, 3, 4]])
    print(single_list_frame.describe())

    nested_list_frame = H2OFrame([[0, 1, 2, 3], [5, 6, "hi", "dog"]])
    print(nested_list_frame.describe())

    # Tuples: same shapes as above.
    single_tuple_frame = H2OFrame([(0, 1, 2, 3, 4)])
    print(single_tuple_frame.describe())

    nested_tuple_frame = H2OFrame(((0, 1, 2, 3), (5, 6, "hi", "dog")))
    print(nested_tuple_frame.describe())

    # Dicts: keys name the columns.
    numeric_columns = {"column1": [5, 4, 3, 2, 1], "column2": (1, 2, 3, 4, 5)}
    numeric_dict_frame = H2OFrame(numeric_columns)
    numeric_dict_frame.describe()

    ragged_dict_frame = H2OFrame({"colA": ["bilbo", "baggins"], "colB": ["meow"]})
    print(ragged_dict_frame.describe())

    # collections.OrderedDict built from a plain dict.
    import collections
    plain = {"colA": ["bilbo", "baggins"], "colB": ["meow"]}  # still unordered!
    from_plain_frame = H2OFrame(collections.OrderedDict(plain))
    from_plain_frame.describe()

    # An ordered dictionary filled key by key.
    ordered = collections.OrderedDict()
    ordered["colA"] = ["bilbo", "baggins"]
    ordered["colB"] = ["meow"]

    from_ordered_frame = H2OFrame(collections.OrderedDict(ordered))
    from_ordered_frame.describe()
def _assert_expr_results_eq(expr_provider, skip_expr_assert=False):
    """Assert an expression evaluates the same with and without optimizations.

    Args:
        expr_provider: Zero-argument callable returning the expression; it is
            called once with optimizations enabled and once with them disabled.
        skip_expr_assert: When true, do not require the two expressions'
            debug printouts to differ.

    Returns:
        Tuple ``(optimized_expr, unoptimized_expr)``.

    The optimization flag is restored to its initial value on exit.
    """
    def _build_and_eval(optimize):
        h2o.enable_expr_optimizations(optimize)
        expr = expr_provider()
        return expr, H2OFrame._expr(expr)

    initial_flag = h2o.is_expr_optimizations_enabled()
    try:
        opt_expr, opt_result = _build_and_eval(True)
        noopt_expr, noopt_result = _build_and_eval(False)
        if not skip_expr_assert:
            assert opt_expr._debug_print() != noopt_expr._debug_print(), "The optimization should simplify expression!"
        assert noopt_result.as_data_frame(use_pandas=False) == opt_result.as_data_frame(
            use_pandas=False), "Results with/without expression optimization should match!"
        return opt_expr, noopt_expr
    finally:
        h2o.enable_expr_optimizations(initial_flag)
示例#15
0
def add_unique_row_id(h2o_base_table: h2o.H2OFrame):
    """Return the frame with a ``row_id`` column holding 0..nrows-1 bound on.

    Args:
        h2o_base_table: Frame to extend; its row count comes from ``shape[0]``.
    """
    row_count = h2o_base_table.shape[0]
    id_frame = h2o.H2OFrame(list(range(row_count)))
    return h2o_base_table.cbind(id_frame.set_names(['row_id']))
示例#16
0
 def predict_with_probabilities(self, data):
     """Score ``data`` with the MOJO model and split labels from probabilities.

     Args:
         data: Rows to score; uploaded as an H2OFrame using
             ``self._column_names``.

     Returns:
         A one-element list holding the raw prediction array when the model
         returns a single column; otherwise a two-element list with the first
         column cast to ``str`` and the remaining columns as an array.
     """
     data_frame = H2OFrame(data, column_names=self._column_names)
     preds = self._mojo_model.predict(data_frame).as_data_frame(use_pandas=True)
     if len(preds.columns) == 1:
         return [preds.to_numpy()]
     # ``np.str`` was only an alias of the builtin ``str`` and was removed in
     # NumPy 1.24, so use the builtin directly.
     labels = preds.iloc[:, 0].to_numpy().astype(str)
     probabilities = preds.iloc[:, 1:].to_numpy()
     return [labels, probabilities]
def fold_prediction_result(x_train, y_train, x_test, y_test,
                           classification_types, basic_classifier):
    """
    The training and prediction for one fold for all the types of classifiers indicated in classification_types
    :param x_train: the training data
    :param y_train: the training classes
    :param x_test: the testing data
    :param y_test: the testing classes
    :param classification_types: the classification types to be considered
    :param basic_classifier: the basic classifier to be used either independently or for the meta classifiers
    :return: metrics_dict - dictionary containing a dictionary for every metric with data for every classifier
             training_time - dictionary with training time in seconds for every classification type
             test_time - dictionary with testing time in seconds for every classification type
    :raises Exception: if a classification type is not one of the supported kinds
    """
    metrics_dict = {metric: {} for metric in METRICS}
    training_time = {}
    test_time = {}
    for classification in classification_types:
        logger.info(classification)
        if classification in ENCODING_TYPES:
            classifier = EncodedClassifier(basic_classifier,
                                           encoding_type=classification)
        elif classification == "meta_binary_tree_classifier":
            classifier = MetaBinaryTreeClassifier(basic_classifier)
        elif classification == "standard-classifier":
            classifier = basic_classifier
        else:
            raise Exception("The Classification Method is not a valid one")

        start_time = time.time()
        if isinstance(classifier, h2o.estimators.H2OEstimator):
            classifier = fit_h2o(x_train, y_train, classifier)
        else:
            classifier.fit(x_train, y_train)
        train_time = time.time() - start_time

        if isinstance(classifier, h2o.estimators.H2OEstimator):
            # Convert into a local: rebinding ``x_test`` here would hand an
            # H2OFrame to every later classification type in this loop.
            x_test_h2o = H2OFrame(
                x_test, column_types=get_h2o_column_types(x_test.columns))
            prediction = classifier.predict(x_test_h2o)
            y_pred = np.concatenate(
                prediction['predict'].as_data_frame().values)
        else:
            y_pred = classifier.predict(x_test)
        # Time elapsed since training finished.
        prediction_time = time.time() - (start_time + train_time)

        # Calculate metrics
        for metric, f in METRICS.items():
            metrics_dict[metric][classification] = f(y_test, y_pred)
        training_time[classification] = train_time
        test_time[classification] = prediction_time

    return metrics_dict, training_time, test_time
示例#18
0
def test_pav(y, X, w):
    """Check that H2O's PAV thresholds equal scikit-learn's isotonic regression.

    Args:
        y: Target values.
        X: Feature values; flattened with ``reshape(-1)`` before use.
        w: Sample weights.
    """
    flat_x = X.reshape(-1)

    # Reference thresholds extracted from scikit-learn's isotonic regression
    fitted = IsotonicRegression().fit(flat_x, y, w)
    expected = H2OFrame(np.column_stack(get_thresholds(fitted)))
    print(expected.as_data_frame())

    # Thresholds produced by H2O PAVA
    actual = pav(y, flat_x, w)
    print(actual.as_data_frame())

    assert_frame_equal(expected.as_data_frame(), actual.as_data_frame())
示例#19
0
def pubdev_6394():
    """Check ``gsub`` on an enum column, with and without a cardinality change."""
    # JUnit tests are to be found in RapidsTest class

    def check_gsub(data, original_categories, pattern, expected_categories):
        # Build a one-column enum frame, strip ``pattern`` and verify the result.
        original_frame = H2OFrame(data, header=True, column_types=['enum'])
        assert original_frame.type('location') == 'enum'
        assert original_frame.categories() == original_categories

        transformed_frame = original_frame['location'].gsub(pattern, '')
        print(transformed_frame)

        assert transformed_frame.ncols == 1
        assert transformed_frame.nrows == original_frame.nrows
        assert transformed_frame.type('C1') == 'enum'
        assert transformed_frame['C1'].categories() == expected_categories

    # Reduce cardinality of 'location' column to 2 by reducing existing categorical values to ['X県','Y県']
    check_gsub(
        [['location'], ['X県 A市'], ['X県 B市'], ['X県 B市'], ['Y県 C市'],
         ['Y県 C市']],
        [u'X県 A市', u'X県 B市', u'Y県 C市'],
        ' .*',
        [u'X県', u'Y県'])

    # Test gsub without changing the cardinality
    check_gsub(
        [['location'], ['ab'], ['ac'], ['ad'], ['ae'], ['af']],
        ['ab', 'ac', 'ad', 'ae', 'af'],
        'a',
        ['b', 'c', 'd', 'e', 'f'])
def process_w2v(df, w2v_model):
    """Return a new frame whose text columns are replaced by word2vec features.

    Args:
        df: pandas DataFrame; it is not modified.
        w2v_model: Model exposing ``text_columns`` and ``transform``.

    Returns:
        The non-text columns concatenated column-wise with the averaged
        word2vec features, after ``reset_index()``.
    """
    print("processind data with word2vec ...")
    text_columns = w2v_model.text_columns

    text_frame = H2OFrame(df[text_columns])
    for column in text_columns:
        text_frame[column] = text_frame[column].ascharacter()

    tokens = text_frame.tokenize(" ")
    embedded = w2v_model.transform(tokens, aggregate_method = "AVERAGE")
    text_feats = embedded.as_data_frame()

    remaining = df.drop(columns=text_columns)
    combined = pd.concat([remaining, text_feats], axis=1)
    return combined.reset_index()
示例#21
0
 def predict(self, x):
     """Predict with every fitted estimator and decode the combined output.

     Each estimator in ``self.estimators_`` contributes one column of
     predictions; the table is decoded with ``decode_users`` and
     ``self.dict_code_user``.

     Args:
         x: Samples to predict; converted to an H2OFrame when
             ``self.estimator`` is an H2O estimator.

     Returns:
         numpy array of decoded predictions.
     """
     if isinstance(self.estimator, h2o.estimators.H2OEstimator):
         x = H2OFrame(x, column_types=get_h2o_column_types(x.columns))
         results = pd.DataFrame(index=range(len(x)))
         for position, estimator in enumerate(self.estimators_):
             predicted = estimator.predict(x)
             results[position] = predicted['predict'].as_data_frame().values
     else:
         per_estimator = [estimator.predict(x) for estimator in self.estimators_]
         results = pd.DataFrame(np.array(per_estimator).T)
     return np.array(decode_users(results, self.dict_code_user))
示例#22
0
    def Predict(self, request: IrisRequest, context):
        """Predict the iris species for the measurements in ``request``.

        Args:
            request: Message carrying ``SepalLength``, ``SepalWidth``,
                ``PetalLength`` and ``PetalWidth``.
            context: RPC context; receives INVALID_ARGUMENT and a detail
                message when a measurement attribute is missing.

        Returns:
            ``IrisReply`` with ``species`` looked up in ``SPECIES`` by the
            arg-max of the prediction row, or an empty ``IrisReply`` when the
            request was rejected.
        """
        required_fields = ('SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth')
        if not all(hasattr(request, field) for field in required_fields):
            context.set_details('wrong arguments for IrisRequest')
            context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
            # Stop here: falling through would read the missing attributes
            # and raise instead of reporting the status set above.
            return IrisReply()

        test_data = H2OFrame({field: getattr(request, field) for field in required_fields})
        prediction = self.model.predict(test_data).getrow()
        species = SPECIES.get(np.argmax(prediction))
        return IrisReply(species=species)
示例#23
0
def merge_ages(frame, ages):
    """Fill missing ``Age`` values from the ``predict`` column of ``ages``.

    Args:
        frame: H2O frame containing ``PassengerId``, ``Age`` and
            ``Survived_factor`` columns.
        ages: H2O frame merged onto ``frame`` (left join, ``all_x=True``);
            supplies the ``predict`` column.

    Returns:
        A deep copy of the merged frame with ``Age`` filled in and the
        ``predict`` column dropped.
    """
    merged = frame.merge(ages, all_x=True)
    df = merged.sort('PassengerId').as_data_frame()

    age_missing = df['Age'].isna()
    df.loc[age_missing, 'Age'] = df.loc[age_missing, 'predict']

    # The response must be turned back into a categorical here, before the
    # upload: per the original author, a pandas frame converted to H2O turns
    # this column into reals, and H2O cannot convert reals to factors.
    df['Survived_factor'] = df['Survived_factor'].astype('category')

    merged_frame = H2OFrame(df)
    # Original author's note: some columns get corrupted by the merge, hence
    # the deep copy and the explicit re-assignment of 'Age'.
    copy_df = h2o.deep_copy(merged_frame, 'copy_df')
    copy_df['Age'] = merged_frame.pop('Age')
    return copy_df.drop('predict')
def grouped_kfold(frame: H2OFrame, n_folds: int, src_col_name: str, dest_col_name='_kfold', seed=-1, remove_frame=True):
    """Assign k-fold ids per distinct value of a grouping column.

    Folds are drawn over the unique values of ``src_col_name`` and merged
    back onto ``frame``, so rows sharing a group value share a fold id.

    Args:
        frame: Source frame.
        n_folds: Number of folds passed to ``kfold_column``.
        src_col_name: Grouping column.
        dest_col_name: Name given to the fold-id column.
        seed: Seed passed to ``kfold_column``.
        remove_frame: When true, ``frame`` is removed from H2O afterwards.

    Returns:
        The merged frame carrying the fold-id column.
    """
    group_values = frame[src_col_name]
    distinct_groups = group_values.unique()

    print(f"kfold group unique val count:: {src_col_name}, {distinct_groups.nrows}")
    fold_ids = distinct_groups.kfold_column(n_folds, seed)
    groups_with_folds = distinct_groups.cbind(fold_ids)
    fold_lookup = groups_with_folds.set_names([src_col_name, dest_col_name])
    kfold_frame = frame.merge(fold_lookup)
    # force eval...
    print(f"merged frame id: {kfold_frame.frame_id}")

    intermediates = [group_values, distinct_groups, groups_with_folds, fold_ids, fold_lookup]
    remove_frames(intermediates)

    if remove_frame:
        h2o.remove(frame.frame_id)
    return kfold_frame
示例#25
0
def build_audit_h2o(classifier, name):
    """Fit an H2O classifier in a PMML pipeline on the audit data and store artifacts.

    Stores the fitted estimator (MOJO), the pipeline (pickle) and the renamed
    prediction frame (CSV), all under ``name``.

    Args:
        classifier: Final estimator of the pipeline.
        name: Name passed to the ``store_*`` helpers.
    """
    continuous_columns = ["Age", "Hours", "Income"]
    categorical_columns = ["Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]

    features = [([column], ContinuousDomain()) for column in continuous_columns]
    features += [([column], CategoricalDomain()) for column in categorical_columns]
    mapper = DataFrameMapper(features)

    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("uploader", H2OFrameCreator()),
        ("classifier", classifier)
    ])
    target = H2OFrame(audit_y.to_frame(), column_types = ["categorical"])
    pipeline.fit(audit_X, target)
    pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13))

    classifier = pipeline._final_estimator
    store_mojo(classifier, name)
    store_pkl(pipeline, name)

    adjusted = pipeline.predict(audit_X)
    adjusted.set_names(["h2o(Adjusted)", "probability(0)", "probability(1)"])
    store_csv(adjusted.as_data_frame(), name)
示例#26
0
def predict_row(row, node):
    """Classify one row by walking a binary classifier tree.

    At an inner node the node's classifier is applied; a first prediction of
    0 descends into ``node.left_node``, anything else into
    ``node.right_node``. A node with a falsy ``classifier`` is a leaf and
    yields the first element of ``node.classes``.

    Args:
        row: The row to classify; converted to an H2OFrame when the node's
            classifier is an H2O estimator and the row is not one already.
        node: Current tree node.

    Returns:
        The class found at the leaf that was reached.
    """
    if not node.classifier:
        return list(node.classes)[0]

    if isinstance(node.classifier, h2o.estimators.H2OEstimator):
        if not isinstance(row, H2OFrame):
            row = H2OFrame(row, column_types=get_h2o_column_types(row.columns))
        raw = node.classifier.predict(row)
        labels = np.concatenate(raw['predict'].as_data_frame().values)
    else:
        labels = node.classifier.predict(row)

    next_node = node.left_node if labels[0] == 0 else node.right_node
    return predict_row(row, next_node)
def pubdev_6534():
    """Check that all-NA columns can be converted to enum and to string."""
    rows = [["D", "E", "NA", "NA"], ["1", "A", "NA", "NA"]]
    df = H2OFrame.from_python(rows,
                              column_types=['factor'] * 4,
                              na_strings=["NA"])

    # The two all-NA columns are expected to be reported as "int" at first.
    initial_types = (("C1", "enum"), ("C2", "enum"), ("C3", "int"), ("C4", "int"))
    for column, expected in initial_types:
        assert df.type(column) == expected

    # convert empty col to enum
    df['C3'] = df['C3'].asfactor()
    # convert empty cols to char
    df['C4'] = df['C4'].ascharacter()

    print(df)
    for column, expected in (("C3", "enum"), ("C4", "string")):
        assert df.type(column) == expected
示例#28
0
def _prepare_one_hot(file, y, exclude_cols=None):
    """Import a dataset, split it 95/5 and one-hot encode its enum columns.

    The encoder is fitted on the training split only; unknown levels in the
    test split are ignored (``handle_unknown='ignore'``).

    Args:
        file: File name relative to the directory of this module.
        y: Response column; kept unencoded in the training output and left
            out of the test output.
        exclude_cols: Column names left out of both outputs.

    Returns:
        Tuple ``(train, test)``: ``train`` is an H2OFrame and ``test`` a
        pandas DataFrame, each with the non-enum columns followed by one
        ``<column>.<level>`` column per training level.
    """
    if exclude_cols is None:
        exclude_cols = []
    dir_path = os.path.dirname(os.path.realpath(__file__))
    frame = h2o.import_file(os.path.join(dir_path, file))
    train, test = frame.split_frame([0.95], seed=42)

    cols_to_encode = []
    other_cols = []
    for name, ctype in test.types.items():
        if name == y or name in exclude_cols:
            continue
        if ctype == "enum":
            cols_to_encode.append(name)
        else:
            other_cols.append(name)

    train_frame = train.as_data_frame()
    train_encode = train_frame.loc[:, cols_to_encode]
    train_other = train_frame.loc[:, other_cols + [y]]
    enc = OneHotEncoder(categories='auto', handle_unknown='ignore')
    enc.fit(train_encode)

    # str() keeps the name construction working when the levels of an enum
    # column come back from pandas as numbers rather than strings.
    colnames = [
        col + "." + str(level)
        for col, levels in zip(cols_to_encode, enc.categories_)
        for level in levels
    ]

    train_encoded = pd.DataFrame(enc.transform(train_encode.values).toarray())
    train_encoded.columns = colnames
    train = H2OFrame(train_other.join(train_encoded))

    test_frame = test.as_data_frame()
    test_encode = test_frame.loc[:, cols_to_encode]
    test_other = test_frame.loc[:, other_cols]

    test_encoded = pd.DataFrame(enc.transform(test_encode.values).toarray())
    test_encoded.columns = colnames
    test = test_other.join(test_encoded)

    return train, test
示例#29
0
    def model_performance(self, test_data=None):
        """
        Compute the binary classifier model metrics on `test_data`
        :param test_data: An H2OFrame
        :return: A H2OBinomialModelMetrics object built from the first entry
                 of the server's "model_metrics" response
        :raises ValueError: if `test_data` is missing or is not an H2OFrame
        """
        # Compare with None explicitly so the outcome does not depend on how
        # the frame type defines its own truthiness.
        if test_data is None:
            raise ValueError("Missing`test_data`.")

        if not isinstance(test_data, H2OFrame):
            # str() is required: concatenating a type object to a str raised
            # TypeError and hid the intended ValueError.
            raise ValueError("`test_data` must be of type H2OFrame. Got: " +
                             str(type(test_data)))

        fr_key = H2OFrame.send_frame(test_data)

        url_suffix = "ModelMetrics/models/" + self._key + "/frames/" + fr_key
        res = H2OConnection.post_json(url_suffix=url_suffix)
        raw_metrics = res["model_metrics"][0]
        return H2OBinomialModelMetrics(raw_metrics)
def test_fold_optimization_rbind_expr():
    """Check that nested ``rbind`` expressions are folded into a single call.

    Three nested rbinds over 3x3 matrices filled with 0, 1 and 2 must collapse
    into one ``rbind`` with five children and evaluate to a 15x3 frame.
    """
    data0 = square_matrix(3, 0)
    data1 = square_matrix(3, 1)
    data2 = square_matrix(3, 2)

    def get_expr():
        return ExprNode("rbind", ExprNode("rbind", ExprNode("rbind", data0, data1), data0, data1),
                        data2)

    (expr, _) = _assert_expr_results_eq(get_expr)

    # The messages describe the expectation that was violated; the earlier
    # wording mentioned cbind and read as if stating the failure.
    assert expr._op == "rbind", "Result operator should still be rbind"
    assert len(expr._children) == 5, "Result should have 5 arguments"

    fr = H2OFrame._expr(expr)
    assert fr.dim == [15, 3]
    # Blocks of three identical rows, in the order the operands were bound.
    expected_rows = [[str(value)] * 3 for value in (0, 1, 0, 1, 2) for _ in range(3)]
    assert fr.as_data_frame(use_pandas=False, header=False) == expected_rows
示例#31
0
def classify(x_train, y_train, estimator, x_test):
    """
    Train a copy of the estimator and predict the test set with it.
    For H2O estimators the conversion to H2OFrame is handled here.
    :param x_train: the dataset for training
    :param y_train: the classes for training
    :param estimator: the estimator to be considered (not fitted itself)
    :param x_test: the dataset for testing
    :return: - the prediction for the x_test
             - the trained estimator
    """
    if not isinstance(estimator, h2o.estimators.H2OEstimator):
        current_estimator = clone(estimator)
        current_estimator.fit(x_train, y_train)
        return current_estimator.predict(x_test), current_estimator

    current_estimator = fit_h2o(x_train, y_train, estimator)
    h2o_test = H2OFrame(x_test, column_types=get_h2o_column_types(x_test.columns))
    predicted = current_estimator.predict(h2o_test)
    labels = np.concatenate(predicted['predict'].as_data_frame().values)
    return labels, current_estimator
示例#32
0
def pubdev_6393():
    """Check that enum levels survive an ``ascharacter().asfactor()`` round trip."""
    locations = [['location'],
                 ['�X県 A市 '],  # First observation contains replacement character for unknown char
                 ['X県 B市']]
    observation_count = len(locations) - 1  # first entry is the header

    frame = H2OFrame(locations, header=True, column_types=['enum'])
    assert frame.ncols == 1
    assert frame.nrows == observation_count

    frame_categories = frame['location'].categories()
    print(frame_categories)

    frame_converted = frame['location'].ascharacter().asfactor()
    assert frame_converted.ncols == 1
    assert frame_converted.nrows == observation_count

    frame_converted_categories = frame_converted.categories()
    print(frame_converted_categories)

    # Check for the representation of categoricals to be exactly the same
    # No explicit check for any specific behavior, the behavior of Categorical and asFactor should be the same
    for position, converted_level in enumerate(frame_converted_categories):
        assert frame_categories[position] == converted_level
示例#33
0
def pubdev_6439():
    """Verify that non-ASCII enum levels survive the round trip into an H2OFrame.

    Builds a single-column ``enum`` frame named ``C1`` from two strings and
    asserts that the reported categories have the expected lengths and are
    equal to the input strings (decoded from UTF-8 first on Python 2).
    """
    rows = [
        ['C1'],
        ['X県 A市 '],  # first observation; note the trailing space
        ['X県 B市'],
    ]

    frame = H2OFrame(rows, header=True, column_types=['enum'])

    levels = frame['C1'].categories()
    print(levels)

    # Two observations, so two levels are expected
    assert len(levels) == 2
    # First observation has six characters (space at the end)
    assert len(levels[0]) == 6
    # Second observation has 5 characters (missing space at the end)
    assert len(levels[1]) == 5

    major = sys.version_info[0]
    if major not in (2, 3):
        # Only Python 2 and 3 are handled
        assert False

    def expected(row):
        # Python 2 and 3 handle strings differently: on Python 2 the joined
        # literal is decoded from UTF-8 before the comparison.
        text = ''.join(row)
        return text.decode("utf-8") if major == 2 else text

    # First categorical level equals the first observation
    assert expected(rows[1]) == levels[0]
    # Second categorical level equals the second observation
    assert expected(rows[2]) == levels[1]
def train_w2v(df, epochs=None, save_dir=None):
    """Train an H2O word2vec model on the text columns of ``df``.

    :param df: input table; its text columns are selected via ``get_text_cols``
    :param epochs: optional number of epochs, coerced with ``int`` when given
    :param save_dir: optional directory; when given, checkpoints are exported
                     to ``<save_dir>/h2o_model/``
    :return: the trained ``H2OWord2vecEstimator``; the list of text columns it
             was trained on is attached as ``text_columns``
    """
    print("training word2vec model ...")

    # Only forward the options the caller actually supplied.
    estimator_options = {}
    if epochs is not None:
        estimator_options.update(epochs=int(epochs))
    if save_dir is not None:
        estimator_options.update(
            export_checkpoints_dir=os.path.join(save_dir, "h2o_model/"))

    working_df = df.copy()
    text_columns = get_text_cols(working_df)
    print("Text columns are: ", text_columns)

    text_frame = H2OFrame(working_df[text_columns])
    for column in text_columns:
        # Convert each selected column to a character column before tokenizing
        text_frame[column] = text_frame[column].ascharacter()

    tokens = text_frame.tokenize(" ")
    model = H2OWord2vecEstimator(sent_sample_rate=0.0, **estimator_options)
    model.train(training_frame=tokens)
    model.text_columns = text_columns
    return model
示例#35
0
#!/usr/bin/env python

from h2o import H2OFrame
import h2o as h2o

# Example script: load a CSV, split it by a random column, fit a GBM and predict.
# NOTE(review): the calls used here (H2OFrame.from_csv with a connection
# argument, .ix, h2o.gbm(data=...), h2o.predict) presumably belong to an
# early/prototype h2o-py API -- verify against the targeted h2o version.
localH2O = h2o.init()
air = H2OFrame.from_csv(localH2O, "allyears_tiny.csv", index_col = False)
print(air.head())

# Add a 'RandNum' column that drives the train/validation/test split below.
air['RandNum'] = air.random.uniform()
print(air.head())

# Split on RandNum: <= 0.8 -> train, (0.8, 0.9] -> validation, > 0.9 -> test.
air_train = air.ix[air['RandNum'] <= 0.8]
air_valid = air.ix[(air['RandNum'] > 0.8) & (air['RandNum'] <= 0.9)]
air_test  = air.ix[air['RandNum'] > 0.9]

# Predictor column names and the response column name.
myX = ["Origin", "Dest", "Distance", "UniqueCarrier", "Month", "DayofMonth", "DayOfWeek"]
myY = "IsDepDelayed"

# Fit a GBM on the training split, passing the validation split alongside it.
air_gbm = h2o.gbm(x = myX, y = myY, data = air_train, validation = air_valid,
                  distribution = "multinomial",
                  n_trees = 10, interaction_depth = 3, shrinkage = 0.01,
                  importance = True)
print(air_gbm)

# Predict on the held-out test split and show the first rows.
pred = h2o.predict(air_gbm, air_test)
print(pred.head())
#!/usr/bin/env python

from h2o import H2OFrame, H2OModel
import h2o as h2o

# Example script: same airline GBM workflow written against an estimator-style
# interface (construct the model, then fit / predict).
# NOTE(review): H2OFrame.from_csv, H2OFrame.train_valid_test and H2OModel.GBM
# are presumably a prototype API -- verify against the targeted h2o version.
# NOTE(review): `.print()` as a method call only parses on Python 3, where
# print is not a statement -- confirm the intended interpreter.
localH2O = h2o.init()
air = H2OFrame.from_csv(localH2O, "allyears_tiny.csv", index_col = False)
air.head().print()

# Select the predictor columns and the response column.
X_air = air['Origin', 'Dest', 'Distance', 'UniqueCarrier', 'Month', 'DayofMonth', 'DayOfWeek']
y_air = air['IsDepDelayed']

# Split predictors and response into train/validation/test parts, with
# valid_size and test_size both set to 0.1.
X_air_train, X_air_valid, X_air_test, y_air_train, y_air_valid, y_air_test = \
  H2OFrame.train_valid_test(X_air, y_air, valid_size = 0.1, test_size = 0.1)

# Configure the GBM, then fit it on the training part with the validation part.
my_gbm = H2OModel.GBM(distribution = "multinomial", n_trees = 10,
                      interaction_depth = 3, shrinkage = 0.01,
                      importance = True)
air_gbm = my_gbm.fit(x=X_air_train, y=y_air_train, x_valid=X_air_valid, y_valid=y_air_valid)
air_gbm.print()

# Predict on the test predictors and show the first rows.
pred = air_gbm.predict(X_air_test)
pred.head().print()
示例#37
0
def temp_ctr():
    """Return whatever ``H2OFrame.temp_ctr()`` reports."""
    counter = H2OFrame.temp_ctr()
    return counter
def rest_ctr():
    """Return whatever ``h2o.H2OConnection.rest_ctr()`` reports."""
    counter = h2o.H2OConnection.rest_ctr()
    return counter