示例#1
0
    def as_h2o_frame(self, dataframe, framename=None):
        """
        Transforms given Spark RDD or DataFrame to H2OFrame.

        A DataFrame is passed straight to the DataFrame converter. For an RDD
        accepted by ``_is_of_simple_type`` the converter is picked from the
        first element (str, bool, float) or from the RDD minimum/maximum
        (integers). Any other RDD goes to the complex-type converter.

        Parameters
        ----------
          dataframe : Spark RDD or DataFrame
          framename : Optional name for resulting H2OFrame

        Returns
        -------
          H2OFrame which contains data of original input Spark data structure,
          or None when the RDD is of a simple type but no converter matches

        Raises
        ------
          ValueError : if the input is neither a DataFrame nor an RDD, or if an
                       integer RDD holds values outside the Java long range
        """
        if isinstance(dataframe, DataFrame):
            return fc._as_h2o_frame_from_dataframe(self, dataframe, framename)
        if not isinstance(dataframe, RDD):
            # Fail loudly instead of silently returning None for unsupported input.
            raise ValueError('The as_h2o_frame method expects Spark DataFrame or RDD as the input only!')
        if not _is_of_simple_type(dataframe):
            return fc._as_h2o_frame_from_complex_type(self, dataframe, framename)

        # The type T in RDD[T] is one of the python "primitive" types
        # String, Boolean, Int and Double (Python Long is converted to java.lang.BigInteger)
        first = _get_first(dataframe)
        if isinstance(first, str):
            return fc._as_h2o_frame_from_RDD_String(self, dataframe, framename)
        # bool is a subclass of int, so it has to be tested before the integer branch.
        if isinstance(first, bool):
            return fc._as_h2o_frame_from_RDD_Bool(self, dataframe, framename)

        # `long` only exists on Python 2; referencing it directly on Python 3
        # raises NameError, so resolve the integer types defensively.
        try:
            integer_types = (int, long)  # noqa: F821 (Python 2 only)
        except NameError:
            integer_types = (int,)

        # min()/max() are evaluated once each and reused for both the type
        # test and the range test; max() is skipped when min() is not an integer.
        lowest = dataframe.min()
        highest = dataframe.max() if isinstance(lowest, integer_types) else None
        if isinstance(highest, integer_types):
            if lowest >= self._jvm.Integer.MIN_VALUE and highest <= self._jvm.Integer.MAX_VALUE:
                return fc._as_h2o_frame_from_RDD_Int(self, dataframe, framename)
            if lowest >= self._jvm.Long.MIN_VALUE and highest <= self._jvm.Long.MAX_VALUE:
                return fc._as_h2o_frame_from_RDD_Long(self, dataframe, framename)
            # Python integers are unbounded; anything wider than a Java long
            # cannot be represented by the Long converter.
            raise ValueError('Numbers in RDD Too Big')

        if isinstance(first, float):
            return fc._as_h2o_frame_from_RDD_Float(self, dataframe, framename)

        # No converter matched (for example the bounds are not both integers
        # and the first element is not a float).
        return None
示例#2
0
    def as_h2o_frame(self, dataframe, framename=None, full_cols=100):
        """
        Transforms given Spark RDD or DataFrame to H2OFrame.

        A DataFrame is passed straight to the DataFrame converter. For an RDD
        accepted by ``_is_of_simple_type`` the converter is picked from the
        first element (str, bool, float) or from the RDD minimum/maximum
        (integers). Any other RDD goes to the complex-type converter.

        Parameters
        ----------
          dataframe : Spark RDD or DataFrame
          framename : Optional name for resulting H2OFrame
          full_cols : number of first n columns which are sent to the client together with the data

        Returns
        -------
          H2OFrame which contains data of original input Spark data structure,
          or None when the RDD is of a simple type but no converter matches

        Raises
        ------
          ValueError : if the input is neither a DataFrame nor an RDD, or if an
                       integer RDD holds values outside the Java long range
        """
        if isinstance(dataframe, DataFrame):
            return fc._as_h2o_frame_from_dataframe(self, dataframe, framename, full_cols)
        if not isinstance(dataframe, RDD):
            raise ValueError(
                'The as_h2o_frame method expects Spark DataFrame or RDD as the input only!'
            )
        if not _is_of_simple_type(dataframe):
            return fc._as_h2o_frame_from_complex_type(self, dataframe, framename, full_cols)

        # The type T in RDD[T] is one of the python "primitive" types
        # String, Boolean, Int and Double (Python Long is converted to java.lang.BigInteger)
        first = _get_first(dataframe)

        # Make this code compatible with python 3.6 and python 2.7.
        # NOTE(review): the module-level alias is kept as-is in case other code
        # in this module relies on `long` being defined - confirm before removing.
        global long
        if sys.version_info > (3, ):
            long = int

        if isinstance(first, str):
            return fc._as_h2o_frame_from_RDD_String(self, dataframe, framename, full_cols)
        # bool is a subclass of int, so it has to be tested before the integer branch.
        if isinstance(first, bool):
            return fc._as_h2o_frame_from_RDD_Bool(self, dataframe, framename, full_cols)

        # min()/max() are evaluated once each and reused for both the type
        # test and the range tests; max() is skipped when min() is not an integer.
        integer_types = (int, long)
        lowest = dataframe.min()
        highest = dataframe.max() if isinstance(lowest, integer_types) else None
        if isinstance(highest, integer_types):
            if lowest >= self._jvm.Integer.MIN_VALUE and highest <= self._jvm.Integer.MAX_VALUE:
                return fc._as_h2o_frame_from_RDD_Int(self, dataframe, framename, full_cols)
            if lowest >= self._jvm.Long.MIN_VALUE and highest <= self._jvm.Long.MAX_VALUE:
                return fc._as_h2o_frame_from_RDD_Long(self, dataframe, framename, full_cols)
            raise ValueError('Numbers in RDD Too Big')

        if isinstance(first, float):
            return fc._as_h2o_frame_from_RDD_Float(self, dataframe, framename, full_cols)

        # No converter matched (for example the bounds are not both integers
        # and the first element is not a float).
        return None
示例#3
0
    def as_h2o_frame(self, dataframe, framename = None):
        """
        Transforms given Spark RDD or DataFrame to H2OFrame.

        A DataFrame is passed straight to the DataFrame converter. For an RDD
        accepted by ``_is_of_simple_type`` the converter is picked from the
        first element (str, bool, float) or from the type of the RDD maximum
        (integers, always converted through the Long converter). Any other
        RDD goes to the complex-type converter.

        Parameters
        ----------
          dataframe : Spark RDD or DataFrame
          framename : Optional name for resulting H2OFrame

        Returns
        -------
          H2OFrame which contains data of original input Spark data structure,
          or None when the RDD is of a simple type but no converter matches

        Raises
        ------
          ValueError : if the input is neither a DataFrame nor an RDD, or if an
                       integer RDD holds values outside the Java long range
        """
        if isinstance(dataframe, DataFrame):
            return fc._as_h2o_frame_from_dataframe(self, dataframe, framename)
        if not isinstance(dataframe, RDD):
            # Fail loudly instead of silently returning None for unsupported input.
            raise ValueError('The as_h2o_frame method expects Spark DataFrame or RDD as the input only!')
        if not _is_of_simple_type(dataframe):
            return fc._as_h2o_frame_from_complex_type(self, dataframe, framename)

        # The type T in RDD[T] is one of the python "primitive" types
        # String, Boolean, Int and Double (Python Long is converted to java.lang.BigInteger)
        first = _get_first(dataframe)
        if isinstance(first, str):
            return fc._as_h2o_frame_from_RDD_String(self, dataframe, framename)
        # bool is a subclass of int, so it has to be tested before the integer branch.
        if isinstance(first, bool):
            return fc._as_h2o_frame_from_RDD_Bool(self, dataframe, framename)

        # `long` only exists on Python 2; referencing it directly on Python 3
        # raises NameError, so resolve the integer types defensively.
        try:
            integer_types = (int, long)  # noqa: F821 (Python 2 only)
        except NameError:
            integer_types = (int,)

        highest = dataframe.max()
        if isinstance(highest, integer_types):
            # Bounds of a signed 64-bit Java long. Python integers are
            # unbounded, so both ends of the RDD must be checked before the
            # data is handed to the Long converter.
            java_long_min = -2 ** 63
            java_long_max = 2 ** 63 - 1
            if dataframe.min() >= java_long_min and highest <= java_long_max:
                return fc._as_h2o_frame_from_RDD_Long(self, dataframe, framename)
            raise ValueError('Numbers in RDD Too Big')

        if isinstance(first, float):
            return fc._as_h2o_frame_from_RDD_Float(self, dataframe, framename)

        # No converter matched (for example the maximum is not an integer and
        # the first element is not a float).
        return None
示例#4
0
    def as_h2o_frame(self, dataframe, framename=None, full_cols=100):
        """
        Transforms given Spark RDD or DataFrame to H2OFrame.

        A DataFrame is passed straight to the DataFrame converter. An empty RDD
        is converted through an empty DataFrame with an empty schema. For a
        non-empty RDD accepted by ``_is_of_simple_type`` the converter is picked
        from the first element (str, bool, float) or from the RDD
        minimum/maximum (integers). Any other RDD goes to the complex-type
        converter.

        Parameters
        ----------
          dataframe : Spark RDD or DataFrame
          framename : Optional name for resulting H2OFrame
          full_cols : number of first n columns which are sent to the client together with the data

        Returns
        -------
          H2OFrame which contains data of original input Spark data structure,
          or None when the RDD is of a simple type but no converter matches

        Raises
        ------
          ValueError : if the input is neither a DataFrame nor an RDD, or if an
                       integer RDD holds values outside the Java long range
        """
        if isinstance(dataframe, DataFrame):
            return fc._as_h2o_frame_from_dataframe(self, dataframe, framename, full_cols)
        if not isinstance(dataframe, RDD):
            raise ValueError('The as_h2o_frame method expects Spark DataFrame or RDD as the input only!')

        if dataframe.isEmpty():
            # An empty RDD is converted via an empty DataFrame with an empty schema.
            schema = StructType([])
            session = self._spark_session
            empty = session.createDataFrame(session.sparkContext.emptyRDD(), schema)
            return fc._as_h2o_frame_from_dataframe(self, empty, framename, full_cols)

        if not _is_of_simple_type(dataframe):
            return fc._as_h2o_frame_from_complex_type(self, dataframe, framename, full_cols)

        # The type T in RDD[T] is one of the python "primitive" types
        # String, Boolean, Int and Double (Python Long is converted to java.lang.BigInteger)
        first = _get_first(dataframe)

        # Make this code compatible with python 3.6 and python 2.7.
        # NOTE(review): the module-level alias is kept as-is in case other code
        # in this module relies on `long` being defined - confirm before removing.
        global long
        if sys.version_info > (3,):
            long = int

        if isinstance(first, str):
            return fc._as_h2o_frame_from_RDD_String(self, dataframe, framename, full_cols)
        # bool is a subclass of int, so it has to be tested before the integer branch.
        if isinstance(first, bool):
            return fc._as_h2o_frame_from_RDD_Bool(self, dataframe, framename, full_cols)

        # min()/max() are evaluated once each and reused for both the type
        # test and the range tests; max() is skipped when min() is not an integer.
        integer_types = (int, long)
        lowest = dataframe.min()
        highest = dataframe.max() if isinstance(lowest, integer_types) else None
        if isinstance(highest, integer_types):
            if lowest >= self._jvm.Integer.MIN_VALUE and highest <= self._jvm.Integer.MAX_VALUE:
                return fc._as_h2o_frame_from_RDD_Int(self, dataframe, framename, full_cols)
            if lowest >= self._jvm.Long.MIN_VALUE and highest <= self._jvm.Long.MAX_VALUE:
                return fc._as_h2o_frame_from_RDD_Long(self, dataframe, framename, full_cols)
            raise ValueError('Numbers in RDD Too Big')

        if isinstance(first, float):
            return fc._as_h2o_frame_from_RDD_Float(self, dataframe, framename, full_cols)

        # No converter matched (for example the bounds are not both integers
        # and the first element is not a float).
        return None