def test_impute_range(self):
    """Check impute_dataframe_range when explicit per-column replacement dictionaries are given."""
    columns = ["value_a", "value_b", "value_c"]

    def get_df():
        # Fresh frame with one NaN (value_a), one +inf (value_b) and one -inf (value_c).
        # The lower-case numpy spellings are used on purpose: the np.NaN / np.PINF / np.NINF
        # aliases were removed in NumPy 2.0.
        return pd.DataFrame(np.transpose([[0, 1, 2, np.nan],
                                          [1, np.inf, 2, 3],
                                          [1, -3, -np.inf, 3]]),
                            columns=columns)

    def replacements(value, **overrides):
        # Map every column to `value`, except for the explicitly overridden (or added) keys.
        mapping = dict.fromkeys(columns, value)
        mapping.update(overrides)
        return mapping

    # check if values are replaced correctly (the frame is modified in place)
    df = get_df()
    dataframe_functions.impute_dataframe_range(df, replacements(200), replacements(-134), replacements(55))
    self.assertEqual(list(df.value_a), [0, 1, 2, 55])
    self.assertEqual(list(df.value_b), [1, 200, 2, 3])
    self.assertEqual(list(df.value_c), [1, -3, -134, 3])

    # check for error if column key is missing
    df = get_df()
    col_to_median = {"value_a": 55, "value_c": 55}
    self.assertRaises(ValueError, dataframe_functions.impute_dataframe_range,
                      df, replacements(200), replacements(-134), col_to_median)

    # check for no error if column key is too much
    # (use a fresh frame so this check does not depend on whatever the failing call above left behind)
    df = get_df()
    dataframe_functions.impute_dataframe_range(df, replacements(200), replacements(-134),
                                               replacements(55, value_d=55))

    # check for error if replacement value is not finite
    non_finite_cases = [
        (replacements(200, value_b=np.nan), replacements(-134), replacements(55)),
        (replacements(200), replacements(-134, value_b=-np.inf), replacements(55)),
        (replacements(200), replacements(-134), replacements(55, value_c=np.inf)),
    ]
    for col_to_max, col_to_min, col_to_median in non_finite_cases:
        df = get_df()
        self.assertRaises(ValueError, dataframe_functions.impute_dataframe_range,
                          df, col_to_max, col_to_min, col_to_median)
def transform(self, X):
    """
    Impute `X` column by column: the data is handed to :func:`impute_dataframe_range` together with the
    replacement dictionaries stored on this transformer for ``+inf``, ``-inf`` and ``NaN`` values.

    :param X: data to impute; anything that is not already a DataFrame is wrapped in one first
    :type X: pandas.DataFrame

    :return: imputed DataFrame
    :rtype: pandas.DataFrame

    :raise NotFittedError: if any of the replacement dictionaries is still None. This can happen if the
        transformer was not fitted.
    """
    frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

    is_fitted = (self._col_to_NINF_repl is not None
                 and self._col_to_PINF_repl is not None
                 and self._col_to_NAN_repl is not None)
    if not is_fitted:
        raise NotFittedError("PerColumnImputer is not fitted")

    # Argument order: +inf replacements, -inf replacements, NaN replacements.
    return impute_dataframe_range(frame,
                                  self._col_to_PINF_repl,
                                  self._col_to_NINF_repl,
                                  self._col_to_NAN_repl)
def test_impute_range(self):
    """Check impute_dataframe_range when explicit per-column replacement dictionaries are given."""
    columns = ["value_a", "value_b", "value_c"]

    def get_df():
        # Fresh frame with one NaN (value_a), one +inf (value_b) and one -inf (value_c).
        # The lower-case numpy spellings are used on purpose: the np.NaN / np.PINF / np.NINF
        # aliases were removed in NumPy 2.0.
        return pd.DataFrame(np.transpose([[0, 1, 2, np.nan],
                                          [1, np.inf, 2, 3],
                                          [1, -3, -np.inf, 3]]),
                            columns=columns)

    def replacements(value, **overrides):
        # Map every column to `value`, except for the explicitly overridden (or added) keys.
        mapping = dict.fromkeys(columns, value)
        mapping.update(overrides)
        return mapping

    # check if values are replaced correctly (the frame is modified in place)
    df = get_df()
    dataframe_functions.impute_dataframe_range(df, replacements(200), replacements(-134), replacements(55))
    self.assertEqual(list(df.value_a), [0, 1, 2, 55])
    self.assertEqual(list(df.value_b), [1, 200, 2, 3])
    self.assertEqual(list(df.value_c), [1, -3, -134, 3])

    # check for error if column key is missing
    df = get_df()
    col_to_median = {"value_a": 55, "value_c": 55}
    self.assertRaises(ValueError, dataframe_functions.impute_dataframe_range,
                      df, replacements(200), replacements(-134), col_to_median)

    # check for no error if column key is too much
    # (use a fresh frame so this check does not depend on whatever the failing call above left behind)
    df = get_df()
    dataframe_functions.impute_dataframe_range(df, replacements(200), replacements(-134),
                                               replacements(55, value_d=55))

    # check for error if replacement value is not finite
    non_finite_cases = [
        (replacements(200, value_b=np.nan), replacements(-134), replacements(55)),
        (replacements(200), replacements(-134, value_b=-np.inf), replacements(55)),
        (replacements(200), replacements(-134), replacements(55, value_c=np.inf)),
    ]
    for col_to_max, col_to_min, col_to_median in non_finite_cases:
        df = get_df()
        self.assertRaises(ValueError, dataframe_functions.impute_dataframe_range,
                          df, col_to_max, col_to_min, col_to_median)

    # check that a frame without NaN/inf values keeps its column and its values
    df = pd.DataFrame([0, 1, 2, 3, 4], columns=["test"])
    col_dict = {"test": 0}
    dataframe_functions.impute_dataframe_range(df, col_dict, col_dict, col_dict)
    # Compare plain lists: assertEqual on a pandas Index only works through the
    # truthiness of a one-element boolean array.
    self.assertListEqual(list(df.columns), ["test"])
    self.assertListEqual(list(df["test"].values), [0, 1, 2, 3, 4])
def fit(self, X, y):
    """
    Fit the augmenter on the data sample `X` and the target `y`.

    The time series registered through :func:`~set_timeseries_container` are passed to the feature
    extractor, whose output is imputed and then used to fit the feature selector against `y`. The
    per-column max/min/median values used for the imputation are stored on the estimator.

    If filter_only_tsfresh_features is True, only the newly extracted features are subject to the
    selection. If it is False, the features already present in `X` are looked at as well.

    :param X: The data frame without the time series features. The index rows should be present in the
        timeseries and in the target vector.
    :type X: pandas.DataFrame or numpy.array

    :param y: The target vector to define, which features are relevant.
    :type y: pandas.Series or numpy.array

    :return: the fitted estimator with the information, which features are relevant.
    :rtype: RelevantFeatureAugmenter

    :raise RuntimeError: if no time series container was set beforehand.
    """
    if self.timeseries_container is None:
        raise RuntimeError("You have to provide a time series using the set_timeseries_container function before.")

    self.feature_extractor.set_timeseries_container(self.timeseries_container)

    # With filter_only_tsfresh_features the extractor starts from an empty frame that only shares
    # the index of X, so the pre-existing columns of X never reach the selector.
    # NOTE(review): this branch reads X.index, so X presumably has to be a DataFrame here.
    base_features = pd.DataFrame(index=X.index) if self.filter_only_tsfresh_features else X

    augmented = self.feature_extractor.transform(base_features)

    column_ranges = get_range_values_per_column(augmented)
    self.col_to_max, self.col_to_min, self.col_to_median = column_ranges

    augmented = impute_dataframe_range(augmented,
                                       col_to_max=self.col_to_max,
                                       col_to_min=self.col_to_min,
                                       col_to_median=self.col_to_median)

    self.feature_selector.fit(augmented, y)
    return self
def fit(self, X, y):
    """
    Fit the augmenter on the data sample `X` and the target `y`.

    The time series registered through :func:`~set_timeseries_container` are passed to the feature
    extractor, whose output is imputed and then used to fit the feature selector against `y`. The
    per-column max/min/median values used for the imputation are stored on the estimator.

    If filter_only_tsfresh_features is True, only the newly extracted features are subject to the
    selection. If it is False, the features already present in `X` are looked at as well.

    :param X: The data frame without the time series features. The index rows should be present in the
        timeseries and in the target vector.
    :type X: pandas.DataFrame or numpy.array

    :param y: The target vector to define, which features are relevant.
    :type y: pandas.Series or numpy.array

    :return: the fitted estimator with the information, which features are relevant.
    :rtype: RelevantFeatureAugmenter

    :raise RuntimeError: if no time series container was set beforehand.
    """
    if self.timeseries_container is None:
        raise RuntimeError("You have to provide a time series using the set_timeseries_container function before.")

    self.feature_extractor.set_timeseries_container(self.timeseries_container)

    # With filter_only_tsfresh_features the extractor starts from an empty frame that only shares
    # the index of X, so the pre-existing columns of X never reach the selector.
    # NOTE(review): this branch reads X.index, so X presumably has to be a DataFrame here.
    base_features = pd.DataFrame(index=X.index) if self.filter_only_tsfresh_features else X

    augmented = self.feature_extractor.transform(base_features)

    column_ranges = get_range_values_per_column(augmented)
    self.col_to_max, self.col_to_min, self.col_to_median = column_ranges

    augmented = impute_dataframe_range(augmented,
                                       col_to_max=self.col_to_max,
                                       col_to_min=self.col_to_min,
                                       col_to_median=self.col_to_median)

    self.feature_selector.fit(augmented, y)
    return self
def transform(self, X):
    """
    Impute `X` column by column: the data is handed to :func:`impute_dataframe_range` together with the
    replacement dictionaries stored on this transformer for ``+inf``, ``-inf`` and ``NaN`` values.

    :param X: data to impute; anything that is not already a DataFrame is wrapped in one first
    :type X: pandas.DataFrame

    :return: imputed DataFrame
    :rtype: pandas.DataFrame

    :raise NotFittedError: if any of the replacement dictionaries is still None. This can happen if the
        transformer was not fitted.
    """
    frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

    is_fitted = (self._col_to_NINF_repl is not None
                 and self._col_to_PINF_repl is not None
                 and self._col_to_NAN_repl is not None)
    if not is_fitted:
        raise NotFittedError("PerColumnImputer is not fitted")

    # Argument order: +inf replacements, -inf replacements, NaN replacements.
    return impute_dataframe_range(frame,
                                  self._col_to_PINF_repl,
                                  self._col_to_NINF_repl,
                                  self._col_to_NAN_repl)
def test_impute_range(self):
    """Check impute_dataframe_range with explicit replacement values and with its defaults."""

    def get_df(value_a):
        # value_b carries one +inf and value_c one -inf; value_a is supplied by the caller.
        # The lower-case numpy spellings are used on purpose: the np.NaN / np.PINF / np.NINF
        # aliases were removed in NumPy 2.0.
        return pd.DataFrame(np.transpose([value_a,
                                          [1, np.inf, 2, 3],
                                          [1, -3, -np.inf, 3]]),
                            columns=["value_a", "value_b", "value_c"])

    # explicit replacement values, given only for the column that needs them
    df = get_df([0, 1, 2, np.nan])
    col_to_max = {"value_b": 200}
    col_to_min = {"value_c": -134}
    col_to_median = {"value_a": 55}
    dataframe_functions.impute_dataframe_range(df, col_to_max, col_to_min, col_to_median)
    self.assertEqual(list(df.value_a), [0, 1, 2, 55])
    self.assertEqual(list(df.value_b), [1, 200, 2, 3])
    self.assertEqual(list(df.value_c), [1, -3, -134, 3])

    # no dictionaries given: the expected replacements equal the median / max / min
    # of the finite entries of the respective column
    df = get_df([0, 1, 2, np.nan])
    dataframe_functions.impute_dataframe_range(df)
    self.assertEqual(list(df.value_a), [0, 1, 2, 1])
    self.assertEqual(list(df.value_b), [1, 3, 2, 3])
    self.assertEqual(list(df.value_c), [1, -3, -3, 3])

    # a column consisting only of NaNs is expected to be filled with zeros
    df = get_df([np.nan, np.nan, np.nan, np.nan])
    dataframe_functions.impute_dataframe_range(df)
    self.assertEqual(list(df.value_a), [0, 0, 0, 0])
    self.assertEqual(list(df.value_b), [1, 3, 2, 3])
    self.assertEqual(list(df.value_c), [1, -3, -3, 3])
def fit(self, X, y):
    """
    Fit the augmenter on the data sample `X` and the target `y`.

    A new feature extractor and a new feature selector are built from the settings stored on this
    estimator. The time series given via :func:`~set_timeseries_container` are turned into features,
    the result is imputed, and the feature selector is fitted on it against `y`. The per-column
    max/min/median values used for the imputation are stored on the estimator.

    If filter_only_tsfresh_features is True, only the newly extracted features are subject to the
    selection. If it is False, the features already present in `X` are looked at as well.

    :param X: The data frame without the time series features. The index rows should be present in the
        timeseries and in the target vector.
    :type X: pandas.DataFrame or numpy.array

    :param y: The target vector to define, which features are relevant.
    :type y: pandas.Series or numpy.array

    :return: the fitted estimator with the information, which features are relevant.
    :rtype: RelevantFeatureAugmenter

    :raise RuntimeError: if no time series container was set beforehand.
    """
    if self.timeseries_container is None:
        raise RuntimeError("You have to provide a time series using the set_timeseries_container function before.")

    # Every constructor argument below mirrors an attribute of the same name on this estimator,
    # so the settings are collected by name (in the original argument order).
    extractor_settings = (
        "default_fc_parameters", "kind_to_fc_parameters",
        "column_id", "column_sort", "column_kind", "column_value",
        "timeseries_container", "chunksize", "n_jobs",
        "show_warnings", "disable_progressbar",
        "profile", "profiling_filename", "profiling_sorting",
    )
    self.feature_extractor = FeatureAugmenter(**{name: getattr(self, name) for name in extractor_settings})

    selector_settings = (
        "test_for_binary_target_binary_feature", "test_for_binary_target_real_feature",
        "test_for_real_target_binary_feature", "test_for_real_target_real_feature",
        "fdr_level", "hypotheses_independent",
        "n_jobs", "chunksize", "ml_task",
    )
    self.feature_selector = FeatureSelector(**{name: getattr(self, name) for name in selector_settings})

    # With filter_only_tsfresh_features the extractor starts from an empty frame that only shares
    # the index of X, so the pre-existing columns of X never reach the selector.
    # NOTE(review): this branch reads X.index, so X presumably has to be a DataFrame here.
    base_features = pd.DataFrame(index=X.index) if self.filter_only_tsfresh_features else X

    augmented = self.feature_extractor.transform(base_features)

    column_ranges = get_range_values_per_column(augmented)
    self.col_to_max, self.col_to_min, self.col_to_median = column_ranges

    augmented = impute_dataframe_range(augmented,
                                       col_to_max=self.col_to_max,
                                       col_to_min=self.col_to_min,
                                       col_to_median=self.col_to_median)

    self.feature_selector.fit(augmented, y)
    return self
def _fit_and_augment(self, X, y):
    """
    Shared worker behind :func:`~fit` and :func:`~fit_transform`; it performs most of the steps
    described in :func:`~fit`.

    A new feature extractor and a new feature selector are built from the settings stored on this
    estimator, the time series features are extracted and imputed, and the selector is fitted on them.

    :param X: The data frame without the time series features. The index rows should be present in the
        timeseries and in the target vector.
    :type X: pandas.DataFrame or numpy.array

    :param y: The target vector to define, which features are relevant.
    :type y: pandas.Series or numpy.array

    :return: a data sample with the extracted time series features. If filter_only_tsfresh_features is
        False the data sample will also include the information in X.
    :rtype: pandas.DataFrame

    :raise RuntimeError: if no time series container was set beforehand.
    """
    if self.timeseries_container is None:
        raise RuntimeError("You have to provide a time series using the set_timeseries_container function before.")

    # Every constructor argument below mirrors an attribute of the same name on this estimator,
    # so the settings are collected by name (in the original argument order).
    extractor_settings = (
        "default_fc_parameters", "kind_to_fc_parameters",
        "column_id", "column_sort", "column_kind", "column_value",
        "timeseries_container", "chunksize", "n_jobs",
        "show_warnings", "disable_progressbar",
        "profile", "profiling_filename", "profiling_sorting",
    )
    self.feature_extractor = FeatureAugmenter(**{name: getattr(self, name) for name in extractor_settings})

    selector_settings = (
        "test_for_binary_target_binary_feature", "test_for_binary_target_real_feature",
        "test_for_real_target_binary_feature", "test_for_real_target_real_feature",
        "fdr_level", "hypotheses_independent",
        "n_jobs", "chunksize", "ml_task",
        "multiclass", "n_significant", "multiclass_p_values",
    )
    self.feature_selector = FeatureSelector(**{name: getattr(self, name) for name in selector_settings})

    # With filter_only_tsfresh_features the extractor starts from an empty frame that only shares
    # the index of X, so the pre-existing columns of X never reach the selector.
    # NOTE(review): this branch reads X.index, so X presumably has to be a DataFrame here.
    base_features = pd.DataFrame(index=X.index) if self.filter_only_tsfresh_features else X

    augmented = self.feature_extractor.transform(base_features)

    column_ranges = get_range_values_per_column(augmented)
    self.col_to_max, self.col_to_min, self.col_to_median = column_ranges

    augmented = impute_dataframe_range(augmented,
                                       col_to_max=self.col_to_max,
                                       col_to_min=self.col_to_min,
                                       col_to_median=self.col_to_median)

    self.feature_selector.fit(augmented, y)
    # Unlike fit, the helper hands back the imputed feature frame rather than the estimator.
    return augmented