Example #1
    def read_featureframe(self, spark):
        """
        Reads a training dataset in hdf5 format from HopsFS

        Args:
            :spark: the spark session

        Returns:
            dataframe with the data of the training dataset

        Raises:
              :TrainingDatasetNotFound: if the requested training dataset could not be found
              :CouldNotConvertDataframe: if the hdf5 dataset could not be converted to a spark dataframe
              :HDF5DatasetFormatNotSupportedForExternalTrainingDatasets: if the user tries to read an
                                                                          external training dataset in the .hdf5 format.
        """
        if not hasattr(self, 'training_dataset') or \
                        self.training_dataset.training_dataset_type \
                        == constants.REST_CONFIG.JSON_TRAINING_DATASET_EXTERNAL_TYPE:
            raise HDF5DatasetFormatNotSupportedForExternalTrainingDatasets(
                "The .hdf5 dataset format is not "
                "supported for external training datasets.")
        if not hdfs.exists(
                self.path +
                constants.FEATURE_STORE.TRAINING_DATASET_HDF5_SUFFIX):
            raise TrainingDatasetNotFound(
                "Could not find a training dataset in file {}".format(
                    self.path +
                    constants.FEATURE_STORE.TRAINING_DATASET_HDF5_SUFFIX))
        tf = TemporaryFile()
        data = hdfs.load(self.path +
                         constants.FEATURE_STORE.TRAINING_DATASET_HDF5_SUFFIX)
        tf.write(data)
        tf.seek(0)
        hdf5_file = h5py.File(tf, 'r')  # open read-only; newer h5py versions require an explicit mode
        np_array = hdf5_file[self.training_dataset.name][()]
        if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_NUMPY:
            return np_array
        if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_PYTHON:
            return np_array.tolist()
        if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_SPARK \
                or self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_PANDAS:
            if np_array.ndim != 2:
                raise CouldNotConvertDataframe(
                    "Cannot convert a numpy array that does not have two dimensions to a dataframe. "
                    "The number of dimensions is: {}".format(np_array.ndim))
            num_cols = np_array.shape[1]
            dataframe_dict = {}
            for n_col in range(num_cols):
                col_name = "col_" + str(n_col)
                dataframe_dict[col_name] = np_array[:, n_col]
            pandas_df = pd.DataFrame(dataframe_dict)
            sc = spark.sparkContext
            sql_context = SQLContext(sc)
            return fs_utils._return_dataframe_type(
                sql_context.createDataFrame(pandas_df), self.dataframe_type)
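
For reference, a minimal self-contained sketch of the HDF5 buffering pattern this reader relies on: a 2D numpy array is written into an HDF5 file backed by a temporary file, read back, and repackaged with the col_N column naming used above. The dataset name "demo_dataset" is made up for illustration.

import h5py
import numpy as np
import pandas as pd
from tempfile import TemporaryFile

# Write a small 2D array into an HDF5 file backed by a temporary file,
# mirroring how the reader buffers the bytes loaded from HopsFS.
tf = TemporaryFile()
with h5py.File(tf, 'w') as hdf5_out:
    hdf5_out.create_dataset("demo_dataset", data=np.arange(6).reshape(3, 2))

# Read the array back and rebuild the col_N naming scheme before
# handing the pandas dataframe off to Spark.
tf.seek(0)
with h5py.File(tf, 'r') as hdf5_in:
    np_array = hdf5_in["demo_dataset"][()]
pandas_df = pd.DataFrame(
    {"col_" + str(i): np_array[:, i] for i in range(np_array.shape[1])})
print(pandas_df)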
Example #2
def _convert_dataframe_to_spark(dataframe):
    """
    Helper method for converting a user-provided dataframe into a spark dataframe

    Args:
        :dataframe: the input dataframe (supported types are spark rdds, spark dataframes, pandas dataframes,
                    python 2D lists, and numpy 2D arrays)

    Returns:
        the dataframe converted to a spark dataframe

    Raises:
        :CouldNotConvertDataframe: in case the provided dataframe could not be converted to a spark dataframe
    """
    spark = util._find_spark()
    if isinstance(dataframe, pd.DataFrame):
        sc = spark.sparkContext
        sql_context = SQLContext(sc)
        return sql_context.createDataFrame(dataframe)
    if isinstance(dataframe, list):
        dataframe = np.array(dataframe)
    if isinstance(dataframe, np.ndarray):
        if dataframe.ndim != 2:
            raise CouldNotConvertDataframe(
                "Cannot convert a numpy array that does not have two dimensions to a dataframe. "
                "The number of dimensions is: {}".format(dataframe.ndim))
        num_cols = dataframe.shape[1]
        dataframe_dict = {}
        for n_col in range(num_cols):
            col_name = "col_" + str(n_col)
            dataframe_dict[col_name] = dataframe[:, n_col]
        pandas_df = pd.DataFrame(dataframe_dict)
        sc = spark.sparkContext
        sql_context = SQLContext(sc)
        return sql_context.createDataFrame(pandas_df)
    if isinstance(dataframe, RDD):
        return dataframe.toDF()
    if isinstance(dataframe, DataFrame):
        return dataframe
    raise CouldNotConvertDataframe(
        "The provided dataframe type is not recognized. Supported types are: spark rdds, spark dataframes, "
        "pandas dataframes, python 2D lists, and numpy 2D arrays. The provided dataframe has type: {}"
        .format(type(dataframe)))
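
A hedged usage sketch, assuming a live Spark session that util._find_spark() can locate; the input values are illustrative only. Each supported input type should come back as a Spark dataframe:

import numpy as np
import pandas as pd

# Each call should return a Spark dataframe holding the same rows.
spark_df = _convert_dataframe_to_spark(np.array([[1, 2], [3, 4]]))          # numpy 2D array
spark_df = _convert_dataframe_to_spark([[1, 2], [3, 4]])                    # python 2D list
spark_df = _convert_dataframe_to_spark(pd.DataFrame({"a": [1], "b": [2]}))  # pandas dataframe
spark_df.show()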
Example #3
def _return_dataframe_type(dataframe, dataframe_type):
    """
    Helper method for returning the dataframe in spark/pandas/numpy/python, depending on user preferences

    Args:
        :dataframe: the spark dataframe to convert
        :dataframe_type: the type to convert to (spark,pandas,numpy,python)

    Returns:
        The dataframe converted to either spark, pandas, numpy or python.

    Raises:
        :CouldNotConvertDataframe: if the dataframe type is not supported
    """
    if dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_SPARK:
        return dataframe
    if dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_PANDAS:
        return dataframe.toPandas()
    if dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_NUMPY:
        return np.array(dataframe.collect())
    if dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_PYTHON:
        return dataframe.collect()

    raise CouldNotConvertDataframe(
        "DataFrame type not supported: {}".format(dataframe_type))
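
A brief sketch of the dispatch, assuming df is an existing Spark dataframe (the variable names are illustrative):

# df is an existing Spark dataframe (assumption for illustration).
pandas_df = _return_dataframe_type(df, constants.FEATURE_STORE.DATAFRAME_TYPE_PANDAS)  # pandas.DataFrame
np_rows = _return_dataframe_type(df, constants.FEATURE_STORE.DATAFRAME_TYPE_NUMPY)     # numpy array of Row objects
py_rows = _return_dataframe_type(df, constants.FEATURE_STORE.DATAFRAME_TYPE_PYTHON)    # list of Row objects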