def test_impute_range(self):
        """
        Check ``dataframe_functions.impute_dataframe_range`` with explicit replacement dictionaries.

        Covered scenarios:

        * ``NaN`` is replaced by the ``col_to_median`` entry, ``+inf`` by the ``col_to_max`` entry and
          ``-inf`` by the ``col_to_min`` entry of the respective column,
        * a ``ValueError`` is raised if a column has no entry in one of the dictionaries,
        * surplus dictionary keys are accepted,
        * a ``ValueError`` is raised if a replacement value itself is ``NaN`` or infinite.

        The assertions read ``df`` after the call without using a return value, so this test relies on
        the function modifying the frame in place.
        """
        def get_df():
            # Fresh frame per scenario: one NaN in value_a, one +inf in value_b, one -inf in value_c.
            # np.nan / np.inf are used because the np.NaN / np.PINF / np.NINF aliases no longer exist
            # in NumPy 2.0.
            return pd.DataFrame(np.transpose([[0, 1, 2, np.nan],
                                              [1, np.inf, 2, 3],
                                              [1, -3, -np.inf, 3]]),
                                columns=["value_a", "value_b", "value_c"])

        # check if values are replaced correctly
        df = get_df()
        col_to_max = {"value_a": 200, "value_b": 200, "value_c": 200}
        col_to_min = {"value_a": -134, "value_b": -134, "value_c": -134}
        col_to_median = {"value_a": 55, "value_b": 55, "value_c": 55}
        dataframe_functions.impute_dataframe_range(df, col_to_max, col_to_min, col_to_median)
        self.assertEqual(list(df.value_a), [0, 1, 2, 55])
        self.assertEqual(list(df.value_b), [1, 200, 2, 3])
        self.assertEqual(list(df.value_c), [1, -3, -134, 3])

        # check for error if column key is missing ("value_b" has no median entry)
        df = get_df()
        col_to_max = {"value_a": 200, "value_b": 200, "value_c": 200}
        col_to_min = {"value_a": -134, "value_b": -134, "value_c": -134}
        col_to_median = {"value_a": 55, "value_c": 55}
        self.assertRaises(ValueError, dataframe_functions.impute_dataframe_range,
                          df, col_to_max, col_to_min, col_to_median)

        # check for no error if column key is too much ("value_d" is not a column);
        # the frame from the previous scenario is deliberately reused here
        col_to_max = {"value_a": 200, "value_b": 200, "value_c": 200}
        col_to_min = {"value_a": -134, "value_b": -134, "value_c": -134}
        col_to_median = {"value_a": 55, "value_b": 55, "value_c": 55, "value_d": 55}
        dataframe_functions.impute_dataframe_range(df, col_to_max, col_to_min, col_to_median)

        # check for error if replacement value is not finite: NaN as maximum ...
        df = get_df()
        col_to_max = {"value_a": 200, "value_b": np.nan, "value_c": 200}
        col_to_min = {"value_a": -134, "value_b": -134, "value_c": -134}
        col_to_median = {"value_a": 55, "value_b": 55, "value_c": 55}
        self.assertRaises(ValueError, dataframe_functions.impute_dataframe_range,
                          df, col_to_max, col_to_min, col_to_median)

        # ... -inf as minimum ...
        df = get_df()
        col_to_max = {"value_a": 200, "value_b": 200, "value_c": 200}
        col_to_min = {"value_a": -134, "value_b": -np.inf, "value_c": -134}
        col_to_median = {"value_a": 55, "value_b": 55, "value_c": 55}
        self.assertRaises(ValueError, dataframe_functions.impute_dataframe_range,
                          df, col_to_max, col_to_min, col_to_median)

        # ... and +inf as median
        df = get_df()
        col_to_max = {"value_a": 200, "value_b": 200, "value_c": 200}
        col_to_min = {"value_a": -134, "value_b": -134, "value_c": -134}
        col_to_median = {"value_a": 55, "value_b": 55, "value_c": np.inf}
        self.assertRaises(ValueError, dataframe_functions.impute_dataframe_range,
                          df, col_to_max, col_to_min, col_to_median)
Пример #2
0
    def transform(self, X):
        """
        Replace every ``NaN``, ``-inf`` and ``+inf`` in `X` column by column, using the replacement
        dictionaries stored on this transformer.

        :param X: data to impute; anything that is not already a DataFrame is wrapped in one first
        :type X: pandas.DataFrame

        :return: the result of ``impute_dataframe_range`` for `X`
        :rtype: pandas.DataFrame
        :raise NotFittedError: if any of the replacement dictionaries is still ``None``.
         This can happen if the transformer was not fitted.
        """
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

        # All three dictionaries must have been set before anything can be imputed.
        is_fitted = (self._col_to_NINF_repl is not None
                     and self._col_to_PINF_repl is not None
                     and self._col_to_NAN_repl is not None)
        if not is_fitted:
            raise NotFittedError("PerColumnImputer is not fitted")

        # Passed positionally in the order: +inf, -inf, then NaN replacements.
        return impute_dataframe_range(frame,
                                      self._col_to_PINF_repl,
                                      self._col_to_NINF_repl,
                                      self._col_to_NAN_repl)
Пример #3
0
    def test_impute_range(self):
        """
        Check ``dataframe_functions.impute_dataframe_range`` with explicit replacement dictionaries.

        Covered scenarios:

        * ``NaN`` is replaced by the ``col_to_median`` entry, ``+inf`` by the ``col_to_max`` entry and
          ``-inf`` by the ``col_to_min`` entry of the respective column,
        * a ``ValueError`` is raised if a column has no entry in one of the dictionaries,
        * surplus dictionary keys are accepted,
        * a ``ValueError`` is raised if a replacement value itself is ``NaN`` or infinite,
        * a frame that contains only finite values keeps its columns and values.

        The assertions read ``df`` after the call without using a return value, so this test relies on
        the function modifying the frame in place.
        """
        def get_df():
            # Fresh frame per scenario: one NaN in value_a, one +inf in value_b, one -inf in value_c.
            # np.nan / np.inf are used because the np.NaN / np.PINF / np.NINF aliases no longer exist
            # in NumPy 2.0.
            return pd.DataFrame(np.transpose([[0, 1, 2, np.nan],
                                              [1, np.inf, 2, 3],
                                              [1, -3, -np.inf, 3]]),
                                columns=["value_a", "value_b", "value_c"])

        # check if values are replaced correctly
        df = get_df()
        col_to_max = {"value_a": 200, "value_b": 200, "value_c": 200}
        col_to_min = {"value_a": -134, "value_b": -134, "value_c": -134}
        col_to_median = {"value_a": 55, "value_b": 55, "value_c": 55}
        dataframe_functions.impute_dataframe_range(df, col_to_max, col_to_min, col_to_median)
        self.assertEqual(list(df.value_a), [0, 1, 2, 55])
        self.assertEqual(list(df.value_b), [1, 200, 2, 3])
        self.assertEqual(list(df.value_c), [1, -3, -134, 3])

        # check for error if column key is missing ("value_b" has no median entry)
        df = get_df()
        col_to_max = {"value_a": 200, "value_b": 200, "value_c": 200}
        col_to_min = {"value_a": -134, "value_b": -134, "value_c": -134}
        col_to_median = {"value_a": 55, "value_c": 55}
        self.assertRaises(ValueError, dataframe_functions.impute_dataframe_range,
                          df, col_to_max, col_to_min, col_to_median)

        # check for no error if column key is too much ("value_d" is not a column);
        # the frame from the previous scenario is deliberately reused here
        col_to_max = {"value_a": 200, "value_b": 200, "value_c": 200}
        col_to_min = {"value_a": -134, "value_b": -134, "value_c": -134}
        col_to_median = {"value_a": 55, "value_b": 55, "value_c": 55, "value_d": 55}
        dataframe_functions.impute_dataframe_range(df, col_to_max, col_to_min, col_to_median)

        # check for error if replacement value is not finite: NaN as maximum ...
        df = get_df()
        col_to_max = {"value_a": 200, "value_b": np.nan, "value_c": 200}
        col_to_min = {"value_a": -134, "value_b": -134, "value_c": -134}
        col_to_median = {"value_a": 55, "value_b": 55, "value_c": 55}
        self.assertRaises(ValueError, dataframe_functions.impute_dataframe_range,
                          df, col_to_max, col_to_min, col_to_median)

        # ... -inf as minimum ...
        df = get_df()
        col_to_max = {"value_a": 200, "value_b": 200, "value_c": 200}
        col_to_min = {"value_a": -134, "value_b": -np.inf, "value_c": -134}
        col_to_median = {"value_a": 55, "value_b": 55, "value_c": 55}
        self.assertRaises(ValueError, dataframe_functions.impute_dataframe_range,
                          df, col_to_max, col_to_min, col_to_median)

        # ... and +inf as median
        df = get_df()
        col_to_max = {"value_a": 200, "value_b": 200, "value_c": 200}
        col_to_min = {"value_a": -134, "value_b": -134, "value_c": -134}
        col_to_median = {"value_a": 55, "value_b": 55, "value_c": np.inf}
        self.assertRaises(ValueError, dataframe_functions.impute_dataframe_range,
                          df, col_to_max, col_to_min, col_to_median)

        # a frame with only finite values must keep its column labels and its values
        df = pd.DataFrame([0, 1, 2, 3, 4], columns=["test"])
        col_dict = {"test": 0}
        dataframe_functions.impute_dataframe_range(df, col_dict, col_dict, col_dict)

        # Compare plain lists: comparing a pandas Index with a list yields an element-wise boolean
        # array, whose truth value is only defined for the single-column case.
        self.assertEqual(list(df.columns), ["test"])
        self.assertListEqual(list(df["test"].values), [0, 1, 2, 3, 4])
Пример #4
0
    def fit(self, X, y):
        """
        Extract time series features for the samples in X, impute them and fit the feature selector.

        The time series previously handed over via :func:`~set_timeseries_container` is passed to the
        feature extractor, whose output is the augmented version of X (X may already hold manually
        designed features). The per-column range values of that augmented data are stored on the
        estimator and used to impute it, and the feature selector is then fitted on the imputed data
        and the target y, so that the relevant features are known for the transform step.

        If filter_only_tsfresh_features is True, the extractor only receives an empty frame carrying
        the index of X, so only the newly computed features take part in the selection. If it is False,
        the columns already present in X are part of the selection as well.

        :param X: The data frame without the time series features. The index rows should be present in the timeseries
           and in the target vector. Note that ``X.index`` is read when filter_only_tsfresh_features is True.
        :type X: pandas.DataFrame or numpy.array

        :param y: The target vector used to decide which features are relevant.
        :type y: pandas.Series or numpy.array

        :return: the fitted estimator itself
        :rtype: RelevantFeatureAugmenter
        :raise RuntimeError: if no time series container has been set.
        """
        if self.timeseries_container is None:
            raise RuntimeError("You have to provide a time series using the set_timeseries_container function before.")

        self.feature_extractor.set_timeseries_container(self.timeseries_container)

        # With filter_only_tsfresh_features the old features are kept out of the selection:
        # only the index of X is handed to the extractor.
        base_frame = pd.DataFrame(index=X.index) if self.filter_only_tsfresh_features else X
        extracted = self.feature_extractor.transform(base_frame)

        # Remember the replacement values on the estimator before imputing with them.
        range_values = get_range_values_per_column(extracted)
        self.col_to_max, self.col_to_min, self.col_to_median = range_values
        imputed = impute_dataframe_range(extracted,
                                         col_to_max=self.col_to_max,
                                         col_to_median=self.col_to_median,
                                         col_to_min=self.col_to_min)

        self.feature_selector.fit(imputed, y)
        return self
    def fit(self, X, y):
        """
        Extract time series features for the samples in X, impute them and fit the feature selector.

        The time series previously handed over via :func:`~set_timeseries_container` is passed to the
        feature extractor, whose output is the augmented version of X (X may already hold manually
        designed features). The per-column range values of that augmented data are stored on the
        estimator and used to impute it, and the feature selector is then fitted on the imputed data
        and the target y, so that the relevant features are known for the transform step.

        If filter_only_tsfresh_features is True, the extractor only receives an empty frame carrying
        the index of X, so only the newly computed features take part in the selection. If it is False,
        the columns already present in X are part of the selection as well.

        :param X: The data frame without the time series features. The index rows should be present in the timeseries
           and in the target vector. Note that ``X.index`` is read when filter_only_tsfresh_features is True.
        :type X: pandas.DataFrame or numpy.array

        :param y: The target vector used to decide which features are relevant.
        :type y: pandas.Series or numpy.array

        :return: the fitted estimator itself
        :rtype: RelevantFeatureAugmenter
        :raise RuntimeError: if no time series container has been set.
        """
        container = self.timeseries_container
        if container is None:
            raise RuntimeError(
                "You have to provide a time series using the set_timeseries_container function before."
            )

        self.feature_extractor.set_timeseries_container(self.timeseries_container)

        # With filter_only_tsfresh_features the old features are kept out of the selection:
        # only the index of X is handed to the extractor.
        if not self.filter_only_tsfresh_features:
            extractor_input = X
        else:
            extractor_input = pd.DataFrame(index=X.index)

        features = self.feature_extractor.transform(extractor_input)

        # Remember the replacement values on the estimator before imputing with them.
        per_column_ranges = get_range_values_per_column(features)
        self.col_to_max, self.col_to_min, self.col_to_median = per_column_ranges
        features = impute_dataframe_range(
            features,
            col_to_max=self.col_to_max,
            col_to_median=self.col_to_median,
            col_to_min=self.col_to_min,
        )

        self.feature_selector.fit(features, y)

        return self
Пример #6
0
        def transform(self, X):
            """
            Replace every ``NaN``, ``-inf`` and ``+inf`` in `X` column by column, using the replacement
            dictionaries stored on this transformer.

            :param X: data to impute; anything that is not already a DataFrame is wrapped in one first
            :type X: pandas.DataFrame

            :return: the result of ``impute_dataframe_range`` for `X`
            :rtype: pandas.DataFrame
            :raise NotFittedError: if any of the replacement dictionaries is still ``None``.
             This can happen if the transformer was not fitted.
            """
            frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

            # All three dictionaries must have been set before anything can be imputed.
            is_fitted = (self._col_to_NINF_repl is not None
                         and self._col_to_PINF_repl is not None
                         and self._col_to_NAN_repl is not None)
            if not is_fitted:
                raise NotFittedError("PerColumnImputer is not fitted")

            # Passed positionally in the order: +inf, -inf, then NaN replacements.
            return impute_dataframe_range(frame,
                                          self._col_to_PINF_repl,
                                          self._col_to_NINF_repl,
                                          self._col_to_NAN_repl)
Пример #7
0
    def test_impute_range(self):
        """
        Check ``dataframe_functions.impute_dataframe_range`` with and without replacement dictionaries.

        Covered scenarios:

        * explicit dictionaries: ``+inf`` is replaced by the ``col_to_max`` entry, ``-inf`` by the
          ``col_to_min`` entry and ``NaN`` by the ``col_to_median`` entry,
        * no dictionaries: the expected replacements are 1 for the ``NaN`` in value_a, 3 for the
          ``+inf`` in value_b and -3 for the ``-inf`` in value_c,
        * no dictionaries and a column made up of ``NaN`` only: that column is expected to become 0.

        The assertions read ``df`` after the call without using a return value, so this test relies on
        the function modifying the frame in place.
        """
        def make_frame(first_column):
            # value_b holds one +inf and value_c one -inf; value_a is supplied per scenario.
            # np.nan / np.inf are used because the np.NaN / np.PINF / np.NINF aliases no longer exist
            # in NumPy 2.0.
            return pd.DataFrame(np.transpose([first_column,
                                              [1, np.inf, 2, 3],
                                              [1, -3, -np.inf, 3]]),
                                columns=["value_a", "value_b", "value_c"])

        # explicit replacement values, one per affected column
        df = make_frame([0, 1, 2, np.nan])

        col_to_max = {"value_b": 200}
        col_to_min = {"value_c": -134}
        col_to_median = {"value_a": 55}

        dataframe_functions.impute_dataframe_range(df, col_to_max, col_to_min,
                                                   col_to_median)

        self.assertEqual(list(df.value_a), [0, 1, 2, 55])
        self.assertEqual(list(df.value_b), [1, 200, 2, 3])
        self.assertEqual(list(df.value_c), [1, -3, -134, 3])

        # no replacement values given
        df = make_frame([0, 1, 2, np.nan])

        dataframe_functions.impute_dataframe_range(df)

        self.assertEqual(list(df.value_a), [0, 1, 2, 1])
        self.assertEqual(list(df.value_b), [1, 3, 2, 3])
        self.assertEqual(list(df.value_c), [1, -3, -3, 3])

        # no replacement values given and value_a without a single finite entry
        df = make_frame([np.nan, np.nan, np.nan, np.nan])

        dataframe_functions.impute_dataframe_range(df)

        self.assertEqual(list(df.value_a), [0, 0, 0, 0])
        self.assertEqual(list(df.value_b), [1, 3, 2, 3])
        self.assertEqual(list(df.value_c), [1, -3, -3, 3])
    def fit(self, X, y):
        """
        Build the feature extractor and selector, extract and impute features for X and fit the selector.

        A new FeatureAugmenter and a new FeatureSelector are created from the settings stored on this
        estimator. The time series previously handed over via :func:`~set_timeseries_container` is given
        to the extractor, whose output is the augmented version of X (X may already hold manually
        designed features). The per-column range values of that augmented data are stored on the
        estimator and used to impute it, and the feature selector is then fitted on the imputed data
        and the target y, so that the relevant features are known for the transform step.

        If filter_only_tsfresh_features is True, the extractor only receives an empty frame carrying
        the index of X, so only the newly computed features take part in the selection. If it is False,
        the columns already present in X are part of the selection as well.

        :param X: The data frame without the time series features. The index rows should be present in the timeseries
           and in the target vector. Note that ``X.index`` is read when filter_only_tsfresh_features is True.
        :type X: pandas.DataFrame or numpy.array

        :param y: The target vector used to decide which features are relevant.
        :type y: pandas.Series or numpy.array

        :return: the fitted estimator itself
        :rtype: RelevantFeatureAugmenter
        :raise RuntimeError: if no time series container has been set.
        """
        if self.timeseries_container is None:
            raise RuntimeError("You have to provide a time series using the set_timeseries_container function before.")

        # Settings forwarded unchanged to the feature extractor.
        extractor_settings = {
            "default_fc_parameters": self.default_fc_parameters,
            "kind_to_fc_parameters": self.kind_to_fc_parameters,
            "column_id": self.column_id,
            "column_sort": self.column_sort,
            "column_kind": self.column_kind,
            "column_value": self.column_value,
            "timeseries_container": self.timeseries_container,
            "chunksize": self.chunksize,
            "n_jobs": self.n_jobs,
            "show_warnings": self.show_warnings,
            "disable_progressbar": self.disable_progressbar,
            "profile": self.profile,
            "profiling_filename": self.profiling_filename,
            "profiling_sorting": self.profiling_sorting,
        }
        self.feature_extractor = FeatureAugmenter(**extractor_settings)

        # Settings forwarded unchanged to the feature selector.
        selector_settings = {
            "test_for_binary_target_binary_feature": self.test_for_binary_target_binary_feature,
            "test_for_binary_target_real_feature": self.test_for_binary_target_real_feature,
            "test_for_real_target_binary_feature": self.test_for_real_target_binary_feature,
            "test_for_real_target_real_feature": self.test_for_real_target_real_feature,
            "fdr_level": self.fdr_level,
            "hypotheses_independent": self.hypotheses_independent,
            "n_jobs": self.n_jobs,
            "chunksize": self.chunksize,
            "ml_task": self.ml_task,
        }
        self.feature_selector = FeatureSelector(**selector_settings)

        # With filter_only_tsfresh_features the old features are kept out of the selection:
        # only the index of X is handed to the extractor.
        base_frame = pd.DataFrame(index=X.index) if self.filter_only_tsfresh_features else X
        extracted = self.feature_extractor.transform(base_frame)

        # Remember the replacement values on the estimator before imputing with them.
        range_values = get_range_values_per_column(extracted)
        self.col_to_max, self.col_to_min, self.col_to_median = range_values
        imputed = impute_dataframe_range(extracted,
                                         col_to_max=self.col_to_max,
                                         col_to_median=self.col_to_median,
                                         col_to_min=self.col_to_min)

        self.feature_selector.fit(imputed, y)
        return self
Пример #9
0
    def _fit_and_augment(self, X, y):
        """
        Shared worker behind :func:`~fit` and :func:`~fit_transform`; it performs most of the steps
        described in :func:`~fit`.

        A new FeatureAugmenter and a new FeatureSelector are created from the settings stored on this
        estimator, features are extracted for X, the per-column range values are stored and used for
        imputing, and the selector is fitted on the imputed data and y.

        :param X: The data frame without the time series features. The index rows should be present in the timeseries
           and in the target vector. Note that ``X.index`` is read when filter_only_tsfresh_features is True.
        :type X: pandas.DataFrame or numpy.array

        :param y: The target vector used to decide which features are relevant.
        :type y: pandas.Series or numpy.array

        :return: the imputed data sample with the extracted time series features. If filter_only_tsfresh_features
            is False the extractor is given X itself instead of an empty frame with the index of X.
        :rtype: pandas.DataFrame
        :raise RuntimeError: if no time series container has been set.
        """
        if self.timeseries_container is None:
            raise RuntimeError("You have to provide a time series using the set_timeseries_container function before.")

        # Settings forwarded unchanged to the feature extractor.
        extractor_settings = {
            "default_fc_parameters": self.default_fc_parameters,
            "kind_to_fc_parameters": self.kind_to_fc_parameters,
            "column_id": self.column_id,
            "column_sort": self.column_sort,
            "column_kind": self.column_kind,
            "column_value": self.column_value,
            "timeseries_container": self.timeseries_container,
            "chunksize": self.chunksize,
            "n_jobs": self.n_jobs,
            "show_warnings": self.show_warnings,
            "disable_progressbar": self.disable_progressbar,
            "profile": self.profile,
            "profiling_filename": self.profiling_filename,
            "profiling_sorting": self.profiling_sorting,
        }
        self.feature_extractor = FeatureAugmenter(**extractor_settings)

        # Settings forwarded unchanged to the feature selector.
        selector_settings = {
            "test_for_binary_target_binary_feature": self.test_for_binary_target_binary_feature,
            "test_for_binary_target_real_feature": self.test_for_binary_target_real_feature,
            "test_for_real_target_binary_feature": self.test_for_real_target_binary_feature,
            "test_for_real_target_real_feature": self.test_for_real_target_real_feature,
            "fdr_level": self.fdr_level,
            "hypotheses_independent": self.hypotheses_independent,
            "n_jobs": self.n_jobs,
            "chunksize": self.chunksize,
            "ml_task": self.ml_task,
            "multiclass": self.multiclass,
            "n_significant": self.n_significant,
            "multiclass_p_values": self.multiclass_p_values,
        }
        self.feature_selector = FeatureSelector(**selector_settings)

        # With filter_only_tsfresh_features the old features are kept out of the selection:
        # only the index of X is handed to the extractor.
        base_frame = pd.DataFrame(index=X.index) if self.filter_only_tsfresh_features else X
        extracted = self.feature_extractor.transform(base_frame)

        # Remember the replacement values on the estimator before imputing with them.
        range_values = get_range_values_per_column(extracted)
        self.col_to_max, self.col_to_min, self.col_to_median = range_values
        imputed = impute_dataframe_range(extracted,
                                         col_to_max=self.col_to_max,
                                         col_to_median=self.col_to_median,
                                         col_to_min=self.col_to_min)

        self.feature_selector.fit(imputed, y)
        return imputed