Exemplo n.º 1
0
    def _x_y_to_h2o_frame(X, y, sample_weight, params, X_valid, y_valid,
                          sample_weight_valid):
        """Convert pandas training (and optional validation) data to H2O frames.

        ``X`` and ``y`` are concatenated column-wise and wrapped in an
        ``H2OFrame``. When ``sample_weight`` is a pandas object it is appended
        to ``X`` and its name(s) are stored in ``params['offset_column']``.
        When both ``X_valid`` and ``y_valid`` are given, a validation frame is
        built the same way and stored in ``params['validation_frame']``; the
        validation weights are only appended when training weights were
        supplied as well.

        Returns:
            tuple: ``(features, target, train_set, params)``. ``features`` and
            ``target`` are column-name lists for DataFrames or the ``name``
            attribute for Series. ``params`` is the dict that was passed in,
            mutated in place.

        Raises:
            TypeError: If ``X``/``y`` (or ``X_valid``/``y_valid`` when both are
                supplied) are not pandas DataFrame or Series objects.
        """
        pandas_types = (DataFrame, Series)

        def _names(obj):
            # DataFrame -> list of column labels, Series -> its single name
            if isinstance(obj, DataFrame):
                return obj.columns.tolist()
            return obj.name

        if not (isinstance(X, pandas_types) and isinstance(y, pandas_types)):
            raise TypeError(
                'X, y are supposed to be pandas DataFrame or Series')

        features = _names(X)
        target = _names(y)
        has_train_weight = sample_weight is not None
        if has_train_weight and isinstance(sample_weight, pandas_types):
            params['offset_column'] = _names(sample_weight)
            X = concat([X, sample_weight], axis=1)
        train_set = H2OFrame(concat([X, y], axis=1))

        if X_valid is not None and y_valid is not None:
            if not (isinstance(X_valid, pandas_types)
                    and isinstance(y_valid, pandas_types)):
                raise TypeError(
                    'X_valid, y_valid are supposed to be pandas DataFrame or Series'
                )
            if (sample_weight_valid is not None
                    and isinstance(sample_weight_valid, pandas_types)
                    and has_train_weight):
                X_valid = concat([X_valid, sample_weight_valid], axis=1)
            params['validation_frame'] = H2OFrame(
                concat([X_valid, y_valid], axis=1))
        return features, target, train_set, params
Exemplo n.º 2
0
def col_names_check():
    """Check column names (and types) of H2OFrames built in several ways.

    Covers CSV import with and without a header row, ``from_python`` with
    explicit names and types, the plain constructor on a list of lists, and
    dict input with and without a ``column_names`` override.
    """

    def _assert_names(frame, expected):
        assert frame.col_names == expected, \
            "Expected {0} for column names but got {1}".format(expected, frame.col_names)

    def _assert_types(frame, expected):
        assert list(frame.types.values()) == expected, \
            "Expected {} for column types but got {}".format(expected, frame.types)

    # CSV with a header row: names come from the file
    iris_wheader = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    _assert_names(iris_wheader, ["sepal_len", "sepal_wid", "petal_len", "petal_wid", "class"])

    # CSV without a header row: generated C1..C5 names
    iris = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris.csv"))
    _assert_names(iris, ["C1", "C2", "C3", "C4", "C5"])

    # Explicit names and types
    named = H2OFrame.from_python(np.random.randn(100, 4).tolist(),
                                 column_names=list("ABCD"),
                                 column_types=["enum"] * 4)
    named.head()
    _assert_names(named, list("ABCD"))
    _assert_types(named, ["enum"] * 4)

    # Defaults for a list of lists
    unnamed = H2OFrame(np.random.randn(100, 4).tolist())
    unnamed.head()
    _assert_names(unnamed, ["C1", "C2", "C3", "C4"])
    _assert_types(unnamed, ["real"] * 4)

    # Dict input: the key becomes the column name ...
    from_dict = H2OFrame({'B': ['a', 'a', 'b', 'NA', 'NA']})
    from_dict.head()
    _assert_names(from_dict, ["B"])

    # ... unless column_names overrides it
    renamed = H2OFrame.from_python({'B': ['a', 'a', 'b', 'NA', 'NA']}, column_names=["X"])
    renamed.head()
    _assert_names(renamed, ["X"])
Exemplo n.º 3
0
def pubdev_6360():
    """Check conditional assignment into an enum column and its string workaround.

    First part: assigning string values into the enum ``testcolumn`` is
    expected to raise an error carrying a specific message. Second part: after
    converting the column with ``ascharacter()`` the same assignments succeed,
    and converting back with ``asfactor()`` yields the expected frame.
    """
    source = [[1, 'Peter', 'blah'], [2, 'Carl', ''], [3, 'Maria', 'whatever'],
              [4, 'Cindy', None]]
    expected = [[1, 'Peter', 1], [2, 'Carl', 0], [3, 'Maria', 1],
                [4, 'Cindy', 0]]
    columns = ['ID', 'Name', 'testcolumn']
    sourcePandasFrame = pd.DataFrame(source, columns=columns)
    expectedPandasFrame = pd.DataFrame(expected, columns=columns)

    h2oFrame = H2OFrame(sourcePandasFrame)
    h2oFrame[h2oFrame['testcolumn'] != '', 'testcolumn'] = '1'
    try:
        h2oFrame[h2oFrame['testcolumn'] == '', 'testcolumn'] = '0'
    except Exception as e:
        assert 'Cannot assign value 1 into a vector of type Enum.' == e.args[
            0].msg, "H2O Frame operation failed on an unexpected error"
    else:
        # Raised outside the try body so the handler above cannot swallow it
        # (an AssertionError has no ``.msg`` on ``args[0]`` and would
        # otherwise surface as a confusing AttributeError).
        raise AssertionError(
            "H2O Frame operation should fail on an enum column")

    h2oFrame = H2OFrame(sourcePandasFrame)
    h2oFrame['testcolumn'] = h2oFrame['testcolumn'].ascharacter()
    h2oFrame[h2oFrame['testcolumn'] != '', 'testcolumn'] = '1'
    h2oFrame[h2oFrame['testcolumn'] == '', 'testcolumn'] = '0'
    h2oFrame['testcolumn'] = h2oFrame['testcolumn'].asfactor()

    assert_frame_equal(h2oFrame.as_data_frame(use_pandas=True),
                       expectedPandasFrame)
Exemplo n.º 4
0
def test_sw_602_endpoints_equality():
    """Check that a ``light=True`` fetch fills the frame cache like a full fetch.

    Uploads 99 identical lists of 50000 integers, retrieves the frame
    both ways and compares the cached id, dimensions, names, data and ``_l``.
    """
    n_lists, list_len = 99, 50000
    data = [numpy.arange(0, list_len).tolist() for _ in range(n_lists)]
    fr = h2o.H2OFrame(data)
    full = H2OFrame.get_frame(fr.frame_id)
    light = H2OFrame.get_frame(fr.frame_id, light=True)

    full_cache = full._ex._cache
    light_cache = light._ex._cache
    for attr in ("_id", "_nrows", "_ncols", "_names", "_data", "_l"):
        assert getattr(full_cache, attr) == getattr(light_cache, attr)
Exemplo n.º 5
0
def test_sw_602_endpoints_equality():
    """Compare the cache of a fully fetched frame with a ``light=True`` fetch.

    The same uploaded frame (99 lists of 50000 integers) is retrieved twice;
    the cached id, row/column counts, names, data and ``_l`` must agree.
    """
    template = numpy.arange(0, 99).tolist()
    data = [numpy.arange(0, 50000).tolist() for _ in template]
    fr = h2o.H2OFrame(data)
    full = H2OFrame.get_frame(fr.frame_id)
    light = H2OFrame.get_frame(fr.frame_id, light=True)

    compared_fields = ("_id", "_nrows", "_ncols", "_names", "_data", "_l")
    full_cache, light_cache = full._ex._cache, light._ex._cache
    for field in compared_fields:
        assert getattr(full_cache, field) == getattr(light_cache, field)
Exemplo n.º 6
0
    def transform(self, is_train_or_valid, frame = None, holdout_type = None, noise = -1, seed = -1):
        """
        Apply target encoding to `te_columns` using the encoding maps produced by `TargetEncoder.fit()`.
        The encodings are stored on this object by `.fit()`, so they must not be passed in manually.

        :param bool is_train_or_valid: explicitly specify type of the data.
        :param frame frame: the frame the target encoding transformation is applied to.
        :param str holdout_type:
            Supported options:
                1) "kfold" - encodings for a fold are generated based on out-of-fold data.
                2) "loo" - leave one out. Current row's response value is subtracted from the pre-calculated per-level frequencies.
                3) "none" - nothing is held out; the whole frame is used for training.
        :param float noise: amount of noise to add to the final target encodings.
        :param int seed: set to fixed value for reproducibility.
        :returns: the H2OFrame built from the "target.encoder.transform" expression.
        """
        assert_is_type(holdout_type, "kfold", "loo", "none")

        # Encoding-map keys and frames are sent side by side, so the key order
        # must match the order of the encoded columns.
        map_columns = self._encodingMap.map_keys['string']
        assert map_columns == self._teColumns
        map_frame_names = [entry['key']['name'] for entry in self._encodingMap.frames]
        transform_node = ExprNode("target.encoder.transform", map_columns, map_frame_names, frame,
                                  self._teColumns, holdout_type,
                                  self._responseColumnName, self._foldColumnName,
                                  self._blending, self._inflectionPoint, self._smoothing,
                                  noise, seed, is_train_or_valid)
        return H2OFrame._expr(expr=transform_node)
Exemplo n.º 7
0
 def result(self):
     """
     Fetch the frame describing the model building process (e.g. for modelselection and anovaglm).

     :return: the H2OFrame that contains information about the model building process.
     """
     result_node = ExprNode("result", ASTId(self.key))
     lazy_frame = H2OFrame._expr(expr=result_node)
     return lazy_frame._frame(fill_cache=True)
Exemplo n.º 8
0
    def as_frame(self):
        """
        Return this collection of models in tabular form.

        :returns: An H2OFrame; the first columns identify the input segments, the remaining columns describe the built models.
        """
        models_node = ExprNode("segment_models_as_frame", ASTId(self._segment_models_id))
        lazy_frame = H2OFrame._expr(expr=models_node)
        return lazy_frame._frame(fill_cache=True)
Exemplo n.º 9
0
 def baseline_survival_frame(self):
     """
     Fetch the frame named under ``output.baseline_survival.name`` of the model JSON.

     :return: the H2OFrame with that name, or None when there is no model JSON or no such name.
     """
     model_json = self._model_json
     if model_json is None:
         return None
     frame_name = model_json.get("output", {}).get("baseline_survival", {}).get("name")
     if frame_name is None:
         return None
     return H2OFrame.get_frame(frame_name)
Exemplo n.º 10
0
    def __init__(self,
                 data,
                 features,
                 families="poisson",
                 max_depth=10,
                 iterations=1):
        """Train one gradient boosting model per non-constant feature.

        For every feature whose variance over ``data`` is non-zero, an
        ``H2OGradientBoostingEstimator`` is trained to predict that feature
        from all the other features. The models are stored in ``self.models``
        keyed by feature name.

        :param data: array-like with a ``shape`` attribute; ``shape[0]`` is
            stored as ``nD`` and ``shape[1]`` as ``nF``.
        :param features: feature names; its length must equal ``data.shape[1]``.
        :param families: a single family name or a sequence of them.
        :param max_depth: ``max_depth`` passed to every estimator.
        :param iterations: ``ntrees`` passed to every estimator.
        """
        self.data = data
        self.nD, self.nF = data.shape[0], data.shape[1]
        self.config = {"max_depth": max_depth, "iterations": iterations}

        # per-column statistics (axis 0)
        self.vars = numpy.var(data, 0)
        self.means = numpy.mean(data, 0)

        assert self.nF == len(features)
        self.features = features

        # NOTE(review): ``families`` is normalised to one entry per feature but
        # never read afterwards; every estimator below is built with
        # distribution="poisson". Confirm whether that is intended.
        if isinstance(families, str):
            families = [families] * self.nF

        updata = H2OFrame(numpytoordereddict(self.data, self.features))

        self.models = {}
        for idx, feature in enumerate(self.features):
            # constant columns are skipped
            if self.vars[idx] == 0:
                continue

            estimator = H2OGradientBoostingEstimator(
                distribution="poisson", ntrees=iterations, max_depth=max_depth)
            self.models[feature] = estimator
            other_features = [f for f in self.features if f != feature]
            estimator.train(x=other_features, y=feature, training_frame=updata)
Exemplo n.º 11
0
    def predict(self, X, sample_weight=None, **kwargs):
        """Predict using GLM with feature matrix X.

        With the sklearn backend, ``X`` is restricted to the model's
        ``feature_name_`` columns when the model has that attribute. With the
        h2o backend, an all-zero offset column is synthesised when the model
        was configured with an offset column and no ``sample_weight`` is
        given; the weights are appended to ``X`` before prediction.

        Args:
            X (:obj:`pd.DataFrame`, :obj:`pd.Series`): Samples.
            sample_weight (:obj:`pd.DataFrame`, :obj:`pd.Series`, optional): Test sample weights.
            **kwargs: Other parameters passed to H2OGeneralizedLinearEstimator.predict().

        Returns:
            array: Returns predicted values.

        Raises:
            NotImplementedError: If the backend/model combination is not supported.
        """
        if self.backend == 'sklearn' and isinstance(self.model, Pipeline):
            if hasattr(self.model, 'feature_name_'):
                model_input = X[self.model.feature_name_]
            else:
                model_input = X
            return self.model.predict(model_input)

        if self.backend == 'h2o' and isinstance(
                self.model, H2OGeneralizedLinearEstimator):
            offset_value = self.model.parms['offset_column']['actual_value']
            if offset_value is not None and sample_weight is None:
                # The model expects an offset column: supply zeros under its name.
                sample_weight = Series(repeat(0, len(X)),
                                       name=offset_value['column_name'],
                                       index=X.index)
            if sample_weight is not None:
                X = concat([X, sample_weight], axis=1)
            h2o_predict = X if isinstance(X, H2OFrame) else H2OFrame(X)
            raw_predictions = self.model.predict(h2o_predict, **kwargs)
            return raw_predictions.as_data_frame().values.reshape(-1)

        raise NotImplementedError(
            f'Error with the backend choice. Supported backends: {self._backends}'
        )
Exemplo n.º 12
0
 def mapping_frame(self):
     """
     Fetch the frame named under ``output.mapping_frame.name`` of the model JSON.

     :return: the H2OFrame with that name, or None when there is no model JSON or no such name.
     """
     mj = self._model_json
     if mj is None:
         return None
     frame_name = mj.get("output", {}).get("mapping_frame", {}).get("name")
     if frame_name is None:
         return None
     return H2OFrame.get_frame(frame_name)
Exemplo n.º 13
0
    def transform(self, column):
        """Encode ``column`` with the fitted ``encoder_`` and return it as a one-column H2OFrame.

        :param column: input passed to ``h2o_col_to_numpy``.
        :return: H2OFrame built from the encoded values reshaped to ``(n, 1)``.
        """
        check_is_fitted(self, 'encoder_')
        values = h2o_col_to_numpy(column)

        # The encoding is computed client-side, so the result has to be
        # uploaded again (not ideal, but required for now).
        encoded = self.encoder_.transform(values)
        single_column = encoded.reshape(values.shape[0], 1)
        return H2OFrame.from_python(single_column)
Exemplo n.º 14
0
 def _as_h2o_frame_from_RDD_Double(h2oContext,
                                   rdd,
                                   frame_name,
                                   full_cols=-1):
     """
     Convert ``rdd`` through the Java context and fetch the resulting frame by key.

     :param h2oContext: context whose ``_jhc`` performs the conversion.
     :param rdd: RDD exposing ``_to_java_object_rdd()``.
     :param frame_name: name passed to the Java conversion call.
     :param full_cols: forwarded to ``H2OFrame.get_frame``.
     :return: the H2OFrame fetched with ``light=True``.
     """
     java_context = h2oContext._jhc
     frame_key = java_context.asH2OFrameFromPythonRDDDoubleKeyString(
         rdd._to_java_object_rdd(), frame_name)
     return H2OFrame.get_frame(frame_key, full_cols=full_cols, light=True)
Exemplo n.º 15
0
    def event_log(self):
        """
        Retrieve the backend event log from an H2OAutoML object.

        :return: an H2OFrame with detailed events occurred during the AutoML training;
            an H2OFrame built from an empty list when no event log is stored.
        """
        if self._event_log is not None:
            return self._event_log
        return H2OFrame([])
Exemplo n.º 16
0
def from_pandas(X):
    """Convert a pandas dataframe to an H2OFrame, keeping its column names.

    Thin wrapper around ``H2OFrame.from_python`` that relies on the
    default arguments apart from ``header`` and ``column_names``.

    Parameters
    ----------

    X : pd.DataFrame
        The dataframe to convert.

    Returns
    -------

    H2OFrame
    """
    # The returned values are not used; the call is kept for whatever
    # checks validate_is_pd performs on X.
    _checked, _ = validate_is_pd(X, None)

    # older version of h2o are super funky with this
    is_legacy_h2o = parse_version(h2o.__version__) < parse_version('3.10.0.7')
    header_flag = 1 if is_legacy_h2o else 0

    # if h2o hasn't started, we'll let this fail through
    return H2OFrame.from_python(X, header=header_flag, column_names=X.columns.tolist())
Exemplo n.º 17
0
    def transform(self, frame=None, holdout_type=None, noise=-1, seed=-1):
        """
        Apply target encoding to `te_columns` using the encoding maps produced by `TargetEncoder.fit()`.
        The encodings are stored on this object by `.fit()`, so they must not be passed in manually.

        :param frame frame: the frame the target encoding transformation is applied to.
        :param str holdout_type: Supported options:

                1) "kfold" - encodings for a fold are generated based on out-of-fold data.
                2) "loo" - leave one out. Current row's response value is subtracted from the pre-calculated per-level frequencies.
                3) "none" - nothing is held out; the whole frame is used for training.

        :param float noise: the amount of random noise added to the target encoding, which helps prevent overfitting.
            The value is passed through unchanged; per the original documentation the effective default is 0.01 * range of y.
        :param int seed: a random seed used to generate draws from the uniform distribution for random noise. Defaults to -1.
        :returns: the H2OFrame built from the "target.encoder.transform" expression.
        """
        assert_is_type(holdout_type, "kfold", "loo", "none")

        # Encoding-map keys and frames are sent side by side, so the key order
        # must match the order of the encoded columns.
        map_columns = self._encodingMap.map_keys['string']
        assert map_columns == self._teColumns
        map_frame_names = [entry['key']['name'] for entry in self._encodingMap.frames]
        transform_node = ExprNode("target.encoder.transform", map_columns, map_frame_names, frame,
                                  self._teColumns, holdout_type,
                                  self._responseColumnName, self._foldColumnName,
                                  self._blending, self._inflectionPoint, self._smoothing,
                                  noise, seed)
        return H2OFrame._expr(expr=transform_node)
Exemplo n.º 18
0
    def transform(self, frame=None, holdout_type=None, noise=-1, seed=-1):
        """
        Target-encode `te_columns` of ``frame`` with the maps stored by `TargetEncoder.fit()`.
        Encodings are kept internally after `.fit()`; do not supply them by hand.

        :param frame frame: the frame the target encoding transformation is applied to.
        :param str holdout_type: Supported options:

                1) "kfold" - encodings for a fold are generated based on out-of-fold data.
                2) "loo" - leave one out. Current row's response value is subtracted from the pre-calculated per-level frequencies.
                3) "none" - nothing is held out; the whole frame is used for training.

        :param float noise: the amount of random noise added to the target encoding, which helps prevent overfitting.
            The value is passed through unchanged; per the original documentation the effective default is 0.01 * range of y.
        :param int seed: a random seed used to generate draws from the uniform distribution for random noise. Defaults to -1.
        :returns: the H2OFrame built from the "target.encoder.transform" expression.
        """
        assert_is_type(holdout_type, "kfold", "loo", "none")

        # Keys and frames of the encoding map travel as parallel lists, so
        # the keys have to be in the same order as the encoded columns.
        encoding_columns = self._encodingMap.map_keys['string']
        assert encoding_columns == self._teColumns
        encoding_frame_names = [
            frame_info['key']['name'] for frame_info in self._encodingMap.frames
        ]
        node_args = (
            "target.encoder.transform", encoding_columns, encoding_frame_names,
            frame, self._teColumns, holdout_type, self._responseColumnName,
            self._foldColumnName, self._blending, self._inflectionPoint,
            self._smoothing, noise, seed,
        )
        return H2OFrame._expr(expr=ExprNode(*node_args))
Exemplo n.º 19
0
def from_pandas(X):
    """Build an H2OFrame from a pandas dataframe, preserving column names.

    Delegates to ``H2OFrame.from_python`` with default arguments,
    setting only ``header`` and ``column_names``.

    Parameters
    ----------

    X : pd.DataFrame
        The dataframe to convert.

    Returns
    -------

    H2OFrame
    """
    # Return values are unused; the call is kept for the checks that
    # validate_is_pd performs on X.
    _validated, _ = validate_is_pd(X, None)

    # older version of h2o are super funky with this
    if parse_version(h2o.__version__) >= parse_version('3.10.0.7'):
        header = 0
    else:
        header = 1

    # if h2o hasn't started, we'll let this fail through
    column_names = X.columns.tolist()
    return H2OFrame.from_python(X, header=header, column_names=column_names)
Exemplo n.º 20
0
    def train(self,
              x=None,
              y=None,
              training_frame=None,
              blending_frame=None,
              **kwargs):
        """
        Train the model, optionally with a blending frame.

        A blending frame is required when no training frame is available
        (neither the argument nor ``self.training_frame``); in that case it
        also stands in as the training frame.

        :param x: forwarded to the parent ``_train``.
        :param y: forwarded to the parent ``_train``.
        :param training_frame: training data; may be None.
        :param blending_frame: validated through ``H2OFrame._validate``.
        :param kwargs: forwarded to the parent ``_train``.
        """
        no_training_frame = training_frame is None and self.training_frame is None
        blending_frame = H2OFrame._validate(blending_frame,
                                            'blending_frame',
                                            required=no_training_frame)

        if no_training_frame:
            # used to bypass default checks in super class and backend and to guarantee default metrics
            training_frame = blending_frame

        def extend_parms(parms):
            # Add the blending frame and stop ignoring the metalearner fold
            # column in the parameters sent to the backend.
            if blending_frame is not None:
                parms['blending_frame'] = blending_frame
            if self.metalearner_fold_column is not None:
                fold_column = quoted(self.metalearner_fold_column)
                parms['ignored_columns'].remove(fold_column)

        # NOTE(review): super(self.__class__, self) recurses forever if this
        # class is ever subclassed; kept as-is because the enclosing class
        # name is not visible here.
        parent = super(self.__class__, self)
        parent._train(x,
                      y,
                      training_frame,
                      extend_parms_fn=extend_parms,
                      **kwargs)
Exemplo n.º 21
0
 def from_java_h2o_frame(h2o_frame, h2o_frame_id):
     """
     Fetch the frame identified by ``h2o_frame_id`` and attach the Java frame to it.

     :param h2o_frame: Java-side frame object stored on the result.
     :param h2o_frame_id: object whose ``toString()`` yields the frame id.
     :return: the H2OFrame, flagged as backed by a Java object.
     """
     frame_key = h2o_frame_id.toString()
     frame = H2OFrame.get_frame(frame_key)
     # Keep the Java reference to the backend frame on the Python object
     frame._java_frame = h2o_frame
     frame._java_frame_sid = frame_key
     frame._backed_by_java_obj = True
     return frame
Exemplo n.º 22
0
 def fit(self, fr, **fit_params):
     """
     Send the assembly steps to the backend and apply them to ``fr``.

     :param fr: frame whose ``frame_id`` is sent with the request.
     :param fit_params: accepted but not used.
     :return: the H2OFrame named in the backend response.
     """
     rest_steps = [step[1].to_rest(step[0]) for step in self.steps]
     # double quotes inside a step are replaced before the step is quoted
     quoted_steps = [quoted(r.replace('"', "'")) for r in rest_steps]
     payload = {"steps": "[%s]" % ",".join(quoted_steps), "frame": fr.frame_id}
     j = h2o.api("POST /99/Assembly", data=payload)
     self.id = j["assembly"]["name"]
     return H2OFrame.get_frame(j["result"]["name"])
Exemplo n.º 23
0
 def from_java_h2o_frame(h2o_frame, h2o_frame_id):
     """
     Look up the Python H2OFrame for a Java frame and link the two.

     :param h2o_frame: Java-side frame object stored on the result.
     :param h2o_frame_id: object whose ``toString()`` yields the frame id.
     :return: the H2OFrame, flagged as backed by a Java object.
     """
     string_id = h2o_frame_id.toString()
     python_frame = H2OFrame.get_frame(string_id)
     # Cache the Java reference to the backend frame
     python_frame._java_frame = h2o_frame
     python_frame._java_frame_sid = string_id
     python_frame._backed_by_java_obj = True
     return python_frame
Exemplo n.º 24
0
 def _as_h2o_frame_from_dataframe(h2oContext,
                                  dataframe,
                                  frame_name,
                                  full_cols=100):
     """
     Convert a Spark DataFrame through the Java context and wrap the result.

     :param h2oContext: context whose ``_jhc`` performs the conversion.
     :param dataframe: DataFrame exposing ``_jdf``.
     :param frame_name: name passed to the Java conversion call.
     :param full_cols: forwarded to ``H2OFrame.from_java_h2o_frame``.
     :return: the resulting H2OFrame.
     """
     java_frame = h2oContext._jhc.asH2OFrame(dataframe._jdf, frame_name)
     java_frame_key = java_frame.key()
     return H2OFrame.from_java_h2o_frame(java_frame, java_frame_key, full_cols)
Exemplo n.º 25
0
 def _as_h2o_frame_from_RDD_String(h2oContext,
                                   rdd,
                                   frame_name,
                                   full_cols=100):
     """
     Convert ``rdd`` through the Java context and wrap the result.

     :param h2oContext: context whose ``_jhc`` performs the conversion.
     :param rdd: RDD exposing ``_to_java_object_rdd()``.
     :param frame_name: name passed to the Java conversion call.
     :param full_cols: forwarded to ``H2OFrame.from_java_h2o_frame``.
     :return: the resulting H2OFrame.
     """
     java_context = h2oContext._jhc
     java_frame = java_context.asH2OFrameFromRDDString(
         rdd._to_java_object_rdd(), frame_name)
     java_frame_key = java_frame.key()
     return H2OFrame.from_java_h2o_frame(java_frame, java_frame_key, full_cols)
Exemplo n.º 26
0
 def from_java_h2o_frame(h2o_frame, h2o_frame_id, full_cols=100):
     """
     Fetch the frame identified by ``h2o_frame_id`` (light mode) and attach the Java frame to it.

     :param h2o_frame: Java-side frame object; its ``numCols()`` decides the column limit.
     :param h2o_frame_id: object whose ``toString()`` yields the frame id.
     :param full_cols: column limit applied only when the frame has more columns than this.
     :return: the H2OFrame, flagged as backed by a Java object.
     """
     frame_key = h2o_frame_id.toString()
     # The limit is forwarded only for frames wider than it; otherwise -1 is
     # sent (presumably "no limit" -- confirm against H2OFrame.get_frame).
     if h2o_frame.numCols() > full_cols:
         column_limit = full_cols
     else:
         column_limit = -1
     frame = H2OFrame.get_frame(frame_key, full_cols=column_limit, light=True)
     # Keep the Java reference to the backend frame on the Python object
     frame._java_frame = h2o_frame
     frame._java_frame_sid = frame_key
     frame._backed_by_java_obj = True
     return frame
Exemplo n.º 27
0
 def from_java_h2o_frame(h2o_frame, h2o_frame_id, full_cols=100):
     """
     Look up the Python H2OFrame for a Java frame (light mode) and link the two.

     :param h2o_frame: Java-side frame object; its ``numCols()`` decides the column limit.
     :param h2o_frame_id: object whose ``toString()`` yields the frame id.
     :param full_cols: column limit applied only when the frame has more columns than this.
     :return: the H2OFrame, flagged as backed by a Java object.
     """
     string_id = h2o_frame_id.toString()
     exceeds_limit = h2o_frame.numCols() > full_cols
     # -1 is sent when the frame is not wider than the limit
     # (presumably "no limit" -- confirm against H2OFrame.get_frame).
     cols = full_cols if exceeds_limit else -1
     python_frame = H2OFrame.get_frame(string_id, full_cols=cols, light=True)
     # Cache the Java reference to the backend frame
     python_frame._java_frame = h2o_frame
     python_frame._java_frame_sid = string_id
     python_frame._backed_by_java_obj = True
     return python_frame
Exemplo n.º 28
0
 def _as_h2o_frame_from_complex_type(h2oContext,dataframe, framename):
     """
     Turn an RDD of tuple/list, a list or a pandas.DataFrame into an H2OFrame.

     The Scala backend needs a Type Tag to transform an RDD of Product, and
     Python has no counterpart of the Product class, so the input is first
     converted to a Spark DataFrame and that DataFrame is converted to an
     H2OFrame.

     :param h2oContext: context providing ``_sqlContext`` and ``_jhc``.
     :param dataframe: the data to convert.
     :param framename: NOTE(review): not used by this function -- the Java
         call receives only the DataFrame. Confirm whether it should be passed.
     :return: the resulting H2OFrame.
     """
     spark_df = h2oContext._sqlContext.createDataFrame(dataframe)
     java_frame = h2oContext._jhc.asH2OFrame(spark_df._jdf)
     java_frame_key = java_frame.key()
     return H2OFrame.from_java_h2o_frame(java_frame, java_frame_key)
Exemplo n.º 29
0
 def _as_h2o_frame_from_complex_type(h2oContext, dataframe, frame_name, full_cols=100):
     """
     Turn an RDD of tuple/list, a list or a pandas.DataFrame into an H2OFrame.

     The Scala backend needs a Type Tag to transform an RDD of Product, and
     Python has no counterpart of the Product class, so the input is first
     converted to a Spark DataFrame and that DataFrame is converted to an
     H2OFrame.

     :param h2oContext: context providing ``_spark_session`` and ``_jhc``.
     :param dataframe: the data to convert.
     :param frame_name: name passed to the Java conversion call.
     :param full_cols: forwarded to ``H2OFrame.from_java_h2o_frame``.
     :return: the resulting H2OFrame.
     """
     spark_df = h2oContext._spark_session.createDataFrame(dataframe)
     java_frame = h2oContext._jhc.asH2OFrame(spark_df._jdf, frame_name)
     java_frame_key = java_frame.key()
     return H2OFrame.from_java_h2o_frame(java_frame, java_frame_key, full_cols)
Exemplo n.º 30
0
 def transform_frame(self, fr):
     """
     Return Xnew for a new dataset.

     GLRM performs A=X*Y during training. Given a new dataset, GLRM performs Anew = Xnew*Y;
     predict returns Xnew*Y, whereas this method returns Xnew.

     :param fr: the frame to transform; its ``key`` is sent to the backend.
     :return: an H2OFrame that contains Xnew.
     """
     transform_node = ExprNode("transform", ASTId(self.key), ASTId(fr.key))
     lazy_frame = H2OFrame._expr(expr=transform_node)
     return lazy_frame._frame(fill_cache=True)
Exemplo n.º 31
0
def col_names_check():
    """Verify H2OFrame column names (and types) for several construction paths.

    Exercises CSV import with and without a header row, ``from_python`` with
    explicit names and types, the plain constructor on a list of lists, and
    dict input with and without a ``column_names`` override.
    """

    def _expect_names(frame, expected):
        assert frame.col_names == expected, \
            "Expected {0} for column names but got {1}".format(expected, frame.col_names)

    def _expect_types(frame, expected):
        assert list(frame.types.values()) == expected, \
            "Expected {} for column types but got {}".format(expected, frame.types)

    # Header row present: names are read from the file
    iris_wheader = h2o.import_file(
        pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    _expect_names(
        iris_wheader,
        ["sepal_len", "sepal_wid", "petal_len", "petal_wid", "class"])

    # No header row: generated C1..C5 names
    iris = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris.csv"))
    _expect_names(iris, ["C1", "C2", "C3", "C4", "C5"])

    # Explicit names and types
    enum_frame = H2OFrame.from_python(np.random.randn(100, 4).tolist(),
                                      column_names=list("ABCD"),
                                      column_types=["enum"] * 4)
    enum_frame.head()
    _expect_names(enum_frame, list("ABCD"))
    _expect_types(enum_frame, ["enum"] * 4)

    # Defaults for a list of lists
    real_frame = H2OFrame(np.random.randn(100, 4).tolist())
    real_frame.head()
    _expect_names(real_frame, ["C1", "C2", "C3", "C4"])
    _expect_types(real_frame, ["real"] * 4)

    # Dict input: the key becomes the column name ...
    keyed_frame = H2OFrame({'B': ['a', 'a', 'b', 'NA', 'NA']})
    keyed_frame.head()
    _expect_names(keyed_frame, ["B"])

    # ... unless column_names overrides it
    overridden_frame = H2OFrame.from_python({'B': ['a', 'a', 'b', 'NA', 'NA']},
                                            column_names=["X"])
    overridden_frame.head()
    _expect_names(overridden_frame, ["X"])
Exemplo n.º 32
0
def pubdev_5179():
    """Check that ``get_frame(full_cols=10)`` describes all columns but caches data for only 10."""
    size = 20
    data = [numpy.arange(0, size).tolist() for _ in range(size)]
    fr = h2o.H2OFrame(data)
    # only first 10 columns will be returned with data
    light = H2OFrame.get_frame(fr.frame_id, full_cols=10)

    # the light frame still knows about every column
    assert len(light.columns) == 20
    assert len(light.types) == 20
    # but only data for 10 columns is available
    cached_data = light._ex._cache._data
    assert len(cached_data) == 10
Exemplo n.º 33
0
 def _as_h2o_frame_from_dataframe(h2oContext,
                                  dataframe,
                                  frame_name,
                                  full_cols=100):
     """
     Convert a non-empty Spark DataFrame through the Java context and wrap the result.

     :param h2oContext: context whose ``_jhc`` performs the conversion.
     :param dataframe: DataFrame exposing ``count()`` and ``_jdf``.
     :param frame_name: name passed to the Java conversion call.
     :param full_cols: forwarded to ``H2OFrame.from_java_h2o_frame``.
     :return: the resulting H2OFrame.
     :raises ValueError: if ``dataframe.count()`` is 0.
     """
     row_count = dataframe.count()
     if row_count == 0:
         raise ValueError('Cannot transform empty H2OFrame')
     java_frame = h2oContext._jhc.asH2OFrame(dataframe._jdf, frame_name)
     java_frame_key = java_frame.key()
     return H2OFrame.from_java_h2o_frame(java_frame, java_frame_key, full_cols)
Exemplo n.º 34
0
 def make_prediction(self, text: str) -> float:
     """
     Run the model on a single text and return the first 'predict' value.

     0 - not a dad joke
     1 - dad joke
     """
     # one-row, one-column frame named after the expected input column
     single_row = H2OFrame([[text]])
     single_row.col_names = ["dad_joke"]
     prediction_table = self._predict(single_row).as_data_frame()
     return prediction_table['predict'][0]
Exemplo n.º 35
0
 def test_h2o_frame_2_data_frame_new(self):
     """Check that converting an H2OFrame to a Spark frame keeps row count, column count and column names."""
     hc = self._hc
     h2o_frame = H2OFrame(file_path="../examples/smalldata/prostate.csv")
     df = hc.as_spark_frame(h2o_frame)
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(df.count(), h2o_frame.nrow,
                      "Number of rows should match")
     self.assertEqual(len(df.columns), h2o_frame.ncol,
                      "Number of columns should match")
     self.assertEqual(df.columns, h2o_frame.names,
                      "Column names should match")
Exemplo n.º 36
0
 def _as_h2o_frame_from_complex_type(h2oContext,
                                     dataframe,
                                     frame_name,
                                     full_cols=-1):
     """
     Turn an RDD of tuple/list, a list or a pandas.DataFrame into an H2OFrame.

     The Scala backend needs a Type Tag to transform an RDD of Product, and
     Python has no counterpart of the Product class, so the input is first
     converted to a Spark DataFrame and that DataFrame is converted to an
     H2OFrame.

     :param h2oContext: context providing ``_spark_session`` and ``_jhc``.
     :param dataframe: the data to convert.
     :param frame_name: name passed to the Java conversion call.
     :param full_cols: forwarded to ``H2OFrame.get_frame``.
     :return: the H2OFrame fetched with ``light=True``.
     """
     spark_df = h2oContext._spark_session.createDataFrame(dataframe)
     frame_key = h2oContext._jhc.asH2OFrameKeyString(spark_df._jdf, frame_name)
     return H2OFrame.get_frame(frame_key, full_cols=full_cols, light=True)
Exemplo n.º 37
0
 def fit(self, fr):
     """
     Send the assembly steps to the backend and apply them to ``fr``.

     :param fr: H2OFrame the steps are applied to.
     :return: the H2OFrame named in the backend response.
     """
     assert_is_type(fr, H2OFrame)
     # double quotes inside a step are replaced before the step is quoted
     encoded_steps = [
         quoted(step[1].to_rest(step[0]).replace('"', "'"))
         for step in self.steps
     ]
     payload = {
         "steps": "[" + ",".join(encoded_steps) + "]",
         "frame": fr.frame_id,
     }
     response = h2o.api("POST /99/Assembly", data=payload)
     self.id = response["assembly"]["name"]
     return H2OFrame.get_frame(response["result"]["name"])
Exemplo n.º 38
0
    def train(self, x=None, y=None, training_frame=None, blending_frame=None, **kwargs):
        """
        Train the model, optionally with a blending frame.

        :param x: forwarded to the parent ``_train``.
        :param y: forwarded to the parent ``_train``.
        :param training_frame: forwarded to the parent ``_train``.
        :param blending_frame: optional; validated through ``H2OFrame._validate``.
        :param kwargs: forwarded to the parent ``_train``.
        """
        blending_frame = H2OFrame._validate(blending_frame, 'blending_frame', required=False)

        def extend_parms(parms):
            # Add the blending frame and stop ignoring the metalearner fold
            # column in the parameters sent to the backend.
            if blending_frame is not None:
                parms['blending_frame'] = blending_frame
            if self.metalearner_fold_column is not None:
                fold_column = quoted(self.metalearner_fold_column)
                parms['ignored_columns'].remove(fold_column)

        # NOTE(review): super(self.__class__, self) recurses forever if this
        # class is ever subclassed; kept as-is because the enclosing class
        # name is not visible here.
        parent = super(self.__class__, self)
        parent._train(x, y, training_frame,
                      extend_parms_fn=extend_parms,
                      **kwargs)
Exemplo n.º 39
0
    def fit(self, fr):
        """
        Perform the munging operations described by the steps on the frame ``fr``.

        :param fr: H2OFrame the munging operations are performed on.
        :return: H2OFrame after the munging operations are completed.
        """
        assert_is_type(fr, H2OFrame)
        # double quotes inside a step are replaced before the step is quoted
        encoded_steps = [quoted(step[1].to_rest(step[0]).replace('"', "'")) for step in self.steps]
        payload = {"steps": "[" + ",".join(encoded_steps) + "]", "frame": fr.frame_id}
        response = h2o.api("POST /99/Assembly", data=payload)
        self.id = response["assembly"]["name"]
        return H2OFrame.get_frame(response["result"]["name"])
Exemplo n.º 40
0
 def fit(self, fr, **fit_params):
     """
     Post the assembly steps to the backend and apply them to ``fr``.

     :param fr: frame whose ``frame_id`` is sent with the request.
     :param fit_params: accepted but not used.
     :return: the H2OFrame named in the backend response.
     """
     rest_steps = [step[1].to_rest(step[0]) for step in self.steps]
     # double quotes inside a step are replaced before the step is quoted
     quoted_steps = [quoted(r.replace('"', "'")) for r in rest_steps]
     steps_literal = "[%s]" % ",".join(quoted_steps)
     response = h2o.api("POST /99/Assembly",
                        data={
                            "steps": steps_literal,
                            "frame": fr.frame_id
                        })
     self.id = response["assembly"]["name"]
     return H2OFrame.get_frame(response["result"]["name"])
Exemplo n.º 41
0
        def rbind(*data):
            """
            Row-bind the given frames after checking they are compatible with the first one.

            :param data: frames to bind; the first one is the reference for column count, names and types.
            :return: the H2OFrame built from the "rbind" expression, with its cached row count set to the sum of all inputs.
            :raises ValueError: if a frame differs from the first in column count, column names or types.
            """
            first = data[0]
            total_rows = 0

            for candidate in data:
                if candidate.ncol != first.ncol:
                    column_counts = (candidate.ncol, first.ncol)
                    raise ValueError("Cannot row-bind a dataframe with %d columns to a data frame with %d columns: "
                                     "the columns must match" % column_counts)
                names_differ = candidate.columns != first.columns
                if names_differ or candidate.types != first.types:
                    raise ValueError("Column names and types must match for rbind() to work")
                total_rows += candidate.nrow

            bind_node = ExprNode("rbind", first, *data[1:])
            bound = H2OFrame._expr(expr=bind_node, cache=first._ex._cache)
            bound._ex._cache.nrows = total_rows
            return bound
Exemplo n.º 42
0
 def _as_h2o_frame_from_RDD_Double(h2oContext, rdd, frame_name):
     """
     Convert ``rdd`` through the Java context and wrap the result.

     :param h2oContext: context whose ``_jhc`` performs the conversion.
     :param rdd: RDD exposing ``_to_java_object_rdd()``.
     :param frame_name: name passed to the Java conversion call.
     :return: the resulting H2OFrame.
     """
     java_context = h2oContext._jhc
     java_frame = java_context.asH2OFrameFromPythonRDDDouble(rdd._to_java_object_rdd(), frame_name)
     java_frame_key = java_frame.key()
     return H2OFrame.from_java_h2o_frame(java_frame, java_frame_key)
Exemplo n.º 43
0
 def aggregated_frame(self):
     """
     Fetch the frame named under ``output.output_frame.name`` of the model JSON.

     :return: the H2OFrame with that name, or None when there is no model JSON or no such name.
     """
     model_json = self._model_json
     if model_json is None:
         return None
     out_frame_name = model_json.get("output", {}).get("output_frame", {}).get("name")
     if out_frame_name is None:
         return None
     return H2OFrame.get_frame(out_frame_name)
Exemplo n.º 44
0
 def _as_h2o_frame_from_RDD_Long(h2oContext, rdd, framename):
     """
     Convert ``rdd`` through the Java context and wrap the result.

     :param h2oContext: context whose ``_jhc`` performs the conversion.
     :param rdd: RDD exposing ``_to_java_object_rdd()``.
     :param framename: name passed to the Java conversion call.
     :return: the resulting H2OFrame.
     """
     java_context = h2oContext._jhc
     java_frame = java_context.asH2OFrameFromRDDLong(rdd._to_java_object_rdd(), framename)
     java_frame_key = java_frame.key()
     return H2OFrame.from_java_h2o_frame(java_frame, java_frame_key)
Exemplo n.º 45
0
 def _as_h2o_frame_from_RDD_String(h2oContext, rdd, frame_name, full_cols=100):
     """
     Convert ``rdd`` through the Java context and wrap the result.

     :param h2oContext: context whose ``_jhc`` performs the conversion.
     :param rdd: RDD exposing ``_to_java_object_rdd()``.
     :param frame_name: name passed to the Java conversion call.
     :param full_cols: forwarded to ``H2OFrame.from_java_h2o_frame``.
     :return: the resulting H2OFrame.
     """
     java_context = h2oContext._jhc
     java_frame = java_context.asH2OFrameFromRDDString(rdd._to_java_object_rdd(), frame_name)
     java_frame_key = java_frame.key()
     return H2OFrame.from_java_h2o_frame(java_frame, java_frame_key, full_cols)
Exemplo n.º 46
0
 def _as_h2o_frame_from_dataframe(h2oContext, dataframe, framename):
     """
     Convert a non-empty Spark DataFrame through the Java context and wrap the result.

     :param h2oContext: context whose ``_jhc`` performs the conversion.
     :param dataframe: DataFrame exposing ``count()`` and ``_jdf``.
     :param framename: name passed to the Java conversion call.
     :return: the resulting H2OFrame.
     :raises ValueError: if ``dataframe.count()`` is 0.
     """
     row_count = dataframe.count()
     if row_count == 0:
         raise ValueError('Cannot transform empty H2OFrame')
     java_frame = h2oContext._jhc.asH2OFrame(dataframe._jdf, framename)
     java_frame_key = java_frame.key()
     return H2OFrame.from_java_h2o_frame(java_frame, java_frame_key)
Exemplo n.º 47
0
 def _as_h2o_frame_from_dataframe(h2oContext, dataframe, frame_name, full_cols=100):
     """
     Convert a Spark DataFrame through the Java context and wrap the result.

     :param h2oContext: context whose ``_jhc`` performs the conversion.
     :param dataframe: DataFrame exposing ``_jdf``.
     :param frame_name: name passed to the Java conversion call.
     :param full_cols: forwarded to ``H2OFrame.from_java_h2o_frame``.
     :return: the resulting H2OFrame.
     """
     java_frame = h2oContext._jhc.asH2OFrame(dataframe._jdf, frame_name)
     java_frame_key = java_frame.key()
     return H2OFrame.from_java_h2o_frame(java_frame, java_frame_key, full_cols)
Exemplo n.º 48
0
 def from_java_h2o_frame(h2o_frame, h2o_frame_id):
     """
     Fetch the frame identified by ``h2o_frame_id`` and attach the Java frame to it.

     :param h2o_frame: Java-side frame object stored on the result.
     :param h2o_frame_id: object whose ``toString()`` yields the frame id.
     :return: the H2OFrame, flagged as backed by a Java object.
     """
     frame_key = h2o_frame_id.toString()
     frame = H2OFrame.get_frame(frame_key)
     frame._java_frame = h2o_frame
     frame._backed_by_java_obj = True
     return frame
Exemplo n.º 49
0
 def dataframe_2_h2oframe_by_id(dataframe_id):
     """
     Ask the backend to convert the DataFrame with the given id and fetch the resulting frame.

     :param dataframe_id: id of the DataFrame; URL-quoted before it is placed in the request path.
     :return: the H2OFrame named by ``h2oframe_id`` in the response.
     """
     # ``urllib.quote`` only exists on Python 2; on Python 3 the function
     # lives in ``urllib.parse``. Import locally so both interpreters work.
     try:
         from urllib.parse import quote  # Python 3
     except ImportError:
         from urllib import quote  # Python 2
     res = h2o.H2OConnection.post("dataframes/" + quote(dataframe_id) + "/h2oframe").json()
     h2oframe = H2OFrame.get_frame(res["h2oframe_id"])
     return h2oframe