Example #1
    def write_featureframe(self):
        """
        Writes a dataframe of data as a training dataset on HDFS in the .npy format

        Returns:
            None

        Raises:
              :ValueError: if the user supplied a write mode that is not supported
              :NumpyDatasetFormatNotSupportedForExternalTrainingDatasets: if the user tries to write an
                                                                          external training dataset in the .npy format.
        """
        if self.training_dataset.training_dataset_type == constants.REST_CONFIG.JSON_TRAINING_DATASET_EXTERNAL_TYPE:
            raise NumpyDatasetFormatNotSupportedForExternalTrainingDatasets("The .npy dataset format is not "
                                                                            "supported for external training datasets.")
        if self.write_mode == constants.SPARK_CONFIG.SPARK_APPEND_MODE:
            raise ValueError(
                "Append is not supported for training datasets stored in the .npy format; only overwrite is "
                "supported. Set the optional argument write_mode='overwrite'.")
        # Convert the input (Spark DataFrame/RDD, pandas dataframe, or python list)
        # into a numpy array before serializing it
        df = self.df
        if not isinstance(df, np.ndarray):
            if isinstance(df, DataFrame) or isinstance(df, RDD):
                df = np.array(df.collect())
            elif isinstance(df, pd.DataFrame):
                df = df.values
            elif isinstance(df, list):
                df = np.array(df)
        # Serialize the array to .npy bytes in a temporary file and dump the bytes to HDFS
        tf = TemporaryFile()
        np.save(tf, df)
        tf.seek(0)
        hdfs.dump(tf.read(), self.path + constants.FEATURE_STORE.TRAINING_DATASET_NPY_SUFFIX)
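
A minimal, self-contained sketch of the save path used in Example #1: the array is serialized to raw .npy bytes through a TemporaryFile with np.save, and the bytes are then handed to the blob store. The method above writes the bytes to HopsFS with hdfs.dump; here a plain local file write stands in for that call, and the path is illustrative rather than part of the library.

import numpy as np
from tempfile import TemporaryFile

# Example input: any numpy array (a 2-D array here)
data = np.arange(12, dtype=np.float64).reshape(3, 4)

# Serialize the array to .npy bytes through a temporary file
tf = TemporaryFile()
np.save(tf, data)
tf.seek(0)                      # rewind before reading the serialized bytes back
npy_bytes = tf.read()

# Stand-in for hdfs.dump(...) in the method above: write the bytes to a local file
# (illustrative path, not part of the library)
with open("/tmp/example_td.npy", "wb") as f:
    f.write(npy_bytes)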
Example #2
    def read_featureframe(self, spark):
        """
        Reads a training dataset in numpy format from HopsFS

        Args:
            :spark: the spark session

        Returns:
            dataframe with the data of the training dataset

        Raises:
              :TrainingDatasetNotFound: if the requested training dataset could not be found
              :CouldNotConvertDataframe: if the numpy dataset could not be converted to a spark dataframe
              :NumpyDatasetFormatNotSupportedForExternalTrainingDatasets: if the user tries to read an
                                                                          external training dataset in the .npy format.
        """
        if not hasattr(self, 'training_dataset') or \
                        self.training_dataset.training_dataset_type \
                        == constants.REST_CONFIG.JSON_TRAINING_DATASET_EXTERNAL_TYPE:
            raise NumpyDatasetFormatNotSupportedForExternalTrainingDatasets(
                "The .npy dataset format is not "
                "supported for external training datasets.")
        if not hdfs.exists(
                self.path +
                constants.FEATURE_STORE.TRAINING_DATASET_NPY_SUFFIX):
            raise TrainingDatasetNotFound(
                "Could not find a training dataset in file {}".format(
                    self.path +
                    constants.FEATURE_STORE.TRAINING_DATASET_NPY_SUFFIX))
        # Load the raw .npy bytes from HopsFS into a temporary file and deserialize them
        tf = TemporaryFile()
        data = hdfs.load(self.path +
                         constants.FEATURE_STORE.TRAINING_DATASET_NPY_SUFFIX)
        tf.write(data)
        tf.seek(0)
        np_array = np.load(tf)
        if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_NUMPY:
            return np_array
        if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_PYTHON:
            return np_array.tolist()
        if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_SPARK or \
                self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_PANDAS:
            if np_array.ndim != 2:
                raise CouldNotConvertDataframe(
                    "Cannot convert a numpy array that does not have two dimensions to a dataframe. "
                    "The number of dimensions is: {}".format(np_array.ndim))
            # Build a pandas dataframe with generated column names col_0..col_{n-1},
            # then convert it to a Spark dataframe
            num_cols = np_array.shape[1]
            dataframe_dict = {}
            for n_col in range(num_cols):
                col_name = "col_" + str(n_col)
                dataframe_dict[col_name] = np_array[:, n_col]
            pandas_df = pd.DataFrame(dataframe_dict)
            sc = spark.sparkContext
            sql_context = SQLContext(sc)
            return fs_utils._return_dataframe_type(
                sql_context.createDataFrame(pandas_df), self.dataframe_type)
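
A sketch of the reverse path from Example #2, under the same assumptions as the previous sketch (a local file standing in for hdfs.load, with an illustrative path and assuming the file written above exists): the .npy bytes are deserialized with np.load and, if the array is 2-D, turned into a pandas dataframe with generated col_<i> column names, which is what the method hands to Spark.

import numpy as np
import pandas as pd
from tempfile import TemporaryFile

# Stand-in for hdfs.load(...) in the method above: read the raw .npy bytes back
# (illustrative local path)
with open("/tmp/example_td.npy", "rb") as f:
    npy_bytes = f.read()

# Deserialize the bytes into a numpy array via a temporary file
tf = TemporaryFile()
tf.write(npy_bytes)
tf.seek(0)
np_array = np.load(tf)

# Only a 2-D array maps cleanly onto a tabular dataframe
if np_array.ndim != 2:
    raise ValueError("expected a 2-D array, got {} dimension(s)".format(np_array.ndim))

# Generate col_0, col_1, ... column names, mirroring the conversion in read_featureframe
pandas_df = pd.DataFrame(
    {"col_" + str(i): np_array[:, i] for i in range(np_array.shape[1])})
print(pandas_df)

# A Spark dataframe could then be obtained with spark.createDataFrame(pandas_df)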