def test_no_finite_values_yields_0(self):
    """A column holding only NaN, +inf and -inf must report 0 as max, min and median."""
    # np.NaN / np.PINF / np.NINF were removed in NumPy 2.0; np.nan and
    # +/-np.inf are the same values and exist in every NumPy release.
    df = pd.DataFrame([np.nan, np.inf, -np.inf], columns=["value"])

    col_to_max, col_to_min, col_to_median = dataframe_functions.get_range_values_per_column(df)

    self.assertEqual(col_to_max, {"value": 0})
    self.assertEqual(col_to_min, {"value": 0})
    self.assertEqual(col_to_median, {"value": 0})
def test_ignores_non_finite_values(self):
    """NaN, +inf and -inf entries must not influence max, min or median of the finite values."""
    # np.NaN / np.PINF / np.NINF were removed in NumPy 2.0; np.nan and
    # +/-np.inf are the same values and exist in every NumPy release.
    df = pd.DataFrame([0, 1, 2, 3, np.nan, np.inf, -np.inf], columns=["value"])

    col_to_max, col_to_min, col_to_median = dataframe_functions.get_range_values_per_column(df)

    # Only the finite entries 0, 1, 2, 3 count: max 3, min 0, median 1.5.
    self.assertEqual(col_to_max, {"value": 3})
    self.assertEqual(col_to_min, {"value": 0})
    self.assertEqual(col_to_median, {"value": 1.5})
def test_range_values_correct_with_uneven_length(self):
    """For the odd-length column [0, 1, 2] expect max 2, min 0 and median 1."""
    frame = pd.DataFrame([0, 1, 2], columns=["value"])

    maxima, minima, medians = dataframe_functions.get_range_values_per_column(frame)

    # Checked in the order max, min, median.
    for observed, expected in ((maxima, 2), (minima, 0), (medians, 1)):
        self.assertEqual(observed, {"value": expected})
def test_range_values_correct_with_uneven_length(self):
    """For the odd-length column [0, 1, 2] expect max 2, min 0 and median 1."""
    column = "value"
    frame = pd.DataFrame([0, 1, 2], columns=[column])

    upper, lower, middle = dataframe_functions.get_range_values_per_column(frame)

    # Checked in the order max, min, median.
    self.assertEqual(upper, {column: 2})
    self.assertEqual(lower, {column: 0})
    self.assertEqual(middle, {column: 1})
def test_no_finite_values_yields_0(self):
    """A column holding only NaN, +inf and -inf must report 0 as max, min and median."""
    # np.NaN / np.PINF / np.NINF were removed in NumPy 2.0; np.nan and
    # +/-np.inf are the same values and exist in every NumPy release.
    df = pd.DataFrame([np.nan, np.inf, -np.inf], columns=["value"])

    col_to_max, col_to_min, col_to_median = dataframe_functions.get_range_values_per_column(df)

    self.assertEqual(col_to_max, {"value": 0})
    self.assertEqual(col_to_min, {"value": 0})
    self.assertEqual(col_to_median, {"value": 0})
def test_ignores_non_finite_values(self):
    """NaN, +inf and -inf entries must not influence max, min or median of the finite values."""
    # np.NaN / np.PINF / np.NINF were removed in NumPy 2.0; np.nan and
    # +/-np.inf are the same values and exist in every NumPy release.
    df = pd.DataFrame([0, 1, 2, 3, np.nan, np.inf, -np.inf], columns=["value"])

    col_to_max, col_to_min, col_to_median = dataframe_functions.get_range_values_per_column(df)

    # Only the finite entries 0, 1, 2, 3 count: max 3, min 0, median 1.5.
    self.assertEqual(col_to_max, {"value": 3})
    self.assertEqual(col_to_min, {"value": 0})
    self.assertEqual(col_to_median, {"value": 1.5})
def test_no_finite_values_yields_0(self):
    """
    A column holding only NaN, +inf and -inf must report 0 as max, min and median,
    and exactly one warning naming that column must be recorded.
    """
    # np.NaN / np.PINF / np.NINF were removed in NumPy 2.0; np.nan and
    # +/-np.inf are the same values and exist in every NumPy release.
    df = pd.DataFrame([np.nan, np.inf, -np.inf], columns=["value"])

    # record=True collects the warnings raised inside the block into the list w.
    with warnings.catch_warnings(record=True) as w:
        col_to_max, col_to_min, col_to_median = dataframe_functions.get_range_values_per_column(df)

        self.assertEqual(len(w), 1)
        self.assertEqual(str(w[0].message),
                         "The columns ['value'] did not have any finite values. Filling with zeros.")

    self.assertEqual(col_to_max, {"value": 0})
    self.assertEqual(col_to_min, {"value": 0})
    self.assertEqual(col_to_median, {"value": 0})
def fit(self, X, y):
    """
    Extract features from the time series registered via :func:`~set_timeseries_container`,
    attach them to the data sample X (which may already hold manually-designed features) and
    fit the feature selector on the result against the target y. The relevant features are
    stored internally so that only those are extracted in the transform step.

    If evaluate_only_added_features is True, only the newly, automatically added features are
    candidates for rejection. If it is False, the features already present in X are evaluated too.

    If the extractor's ``IMPUTE`` setting is the ``impute`` function, it is replaced by
    ``impute_dataframe_range`` bound to the max/min/median values computed on the features
    extracted here.

    :param X: The data frame without the time series features. The index rows should be present in the
        timeseries and in the target vector.
    :type X: pandas.DataFrame or numpy.array

    :param y: The target vector to define, which features are relevant.
    :type y: pandas.Series or numpy.array

    :return: the fitted estimator with the information, which features are relevant.
    :rtype: RelevantFeatureAugmenter

    :raise RuntimeError: if no time series container has been set.
    """
    if self.timeseries_container is None:
        raise RuntimeError(
            "You have to provide a time series using the set_timeseries_container function before."
        )

    self.feature_extractor.set_timeseries_container(self.timeseries_container)

    # When only the added features are evaluated, start from an empty frame that
    # shares X's index, so the old features are not merged in.
    base_frame = pd.DataFrame(index=X.index) if self.evaluate_only_added_features else X
    X_augmented = self.feature_extractor.transform(base_frame)

    settings = self.feature_extractor.settings
    if settings.IMPUTE is impute:
        maxima, minima, medians = get_range_values_per_column(X_augmented)
        settings.IMPUTE = partial(
            impute_dataframe_range,
            col_to_max=maxima,
            col_to_min=minima,
            col_to_median=medians,
        )

    self.feature_selector.fit(X_augmented, y)
    return self
def fit(self, X, y=None):
    """
    Compute the min, max and median for all columns in the DataFrame. For more information,
    please see the :func:`~tsfresh.utilities.dataframe_functions.get_range_values_per_column` function.

    The computed minima, maxima and medians become the replacement values for -inf, +inf and
    NaN respectively. Entries of a preset dictionary (when it is not None) override the
    computed value of the same column.

    :param X: DataFrame to calculate min, max and median values on
    :type X: pandas.DataFrame
    :param y: Unneeded.
    :type y: Any

    :return: the estimator with the computed min, max and median values
    :rtype: Imputer

    :raise ValueError: if a preset dictionary has keys that are not column names of X.
    """
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    col_to_max, col_to_min, col_to_median = get_range_values_per_column(X)

    # (preset attribute, computed replacements, attribute receiving the result),
    # processed in the order -inf, +inf, NaN.
    plan = (
        ("col_to_NINF_repl_preset", col_to_min, "_col_to_NINF_repl"),
        ("col_to_PINF_repl_preset", col_to_max, "_col_to_PINF_repl"),
        ("col_to_NAN_repl_preset", col_to_median, "_col_to_NAN_repl"),
    )
    for preset_name, replacements, target_name in plan:
        preset = getattr(self, preset_name)
        if preset is not None:
            # Any preset key that is not a column of X is an error.
            if set(preset.keys()) - set(X.columns):
                raise ValueError(
                    "Preset dictionary '{}' contain more keys "
                    "than the column names in X".format(preset_name))
            replacements.update(preset)
        setattr(self, target_name, replacements)

    return self
def fit(self, X, y):
    """
    Extract features from the time series registered via :func:`~set_timeseries_container`,
    attach them to the data sample X (which may already hold manually-designed features),
    impute the result and fit the feature selector on it against the target y. The relevant
    features are stored internally so that only those are extracted in the transform step.

    If filter_only_tsfresh_features is True, only the newly, automatically added features are
    candidates for rejection. If it is False, the features already present in X are evaluated too.

    The per-column max, min and median of the extracted features are stored on the estimator as
    ``col_to_max``, ``col_to_min`` and ``col_to_median``.

    :param X: The data frame without the time series features. The index rows should be present in the
        timeseries and in the target vector.
    :type X: pandas.DataFrame or numpy.array

    :param y: The target vector to define, which features are relevant.
    :type y: pandas.Series or numpy.array

    :return: the fitted estimator with the information, which features are relevant.
    :rtype: RelevantFeatureAugmenter

    :raise RuntimeError: if no time series container has been set.
    """
    if self.timeseries_container is None:
        raise RuntimeError("You have to provide a time series using the set_timeseries_container function before.")

    self.feature_extractor.set_timeseries_container(self.timeseries_container)

    # When only the tsfresh features are filtered, start from an empty frame that
    # shares X's index, so the old features are not merged in.
    base_frame = pd.DataFrame(index=X.index) if self.filter_only_tsfresh_features else X
    extracted = self.feature_extractor.transform(base_frame)

    ranges = get_range_values_per_column(extracted)
    self.col_to_max, self.col_to_min, self.col_to_median = ranges

    imputed = impute_dataframe_range(
        extracted,
        col_to_max=self.col_to_max,
        col_to_median=self.col_to_median,
        col_to_min=self.col_to_min,
    )

    self.feature_selector.fit(imputed, y)
    return self
def fit(self, X, y=None):
    """
    Compute the min, max and median for all columns in the DataFrame. For more information,
    please see the :func:`~tsfresh.utilities.dataframe_functions.get_range_values_per_column` function.

    The computed minima, maxima and medians become the replacement values for -inf, +inf and
    NaN respectively. Entries of a preset dictionary (when it is not None) override the
    computed value of the same column.

    :param X: DataFrame to calculate min, max and median values on
    :type X: pandas.DataFrame
    :param y: Unneeded.
    :type y: Any

    :return: the estimator with the computed min, max and median values
    :rtype: Imputer

    :raise ValueError: if a preset dictionary has keys that are not column names of X.
    """
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    col_to_max, col_to_min, col_to_median = get_range_values_per_column(X)

    # One row per non-finite kind, processed in the order -inf, +inf, NaN:
    # (preset attribute, computed replacements, attribute receiving the result).
    steps = [
        ("col_to_NINF_repl_preset", col_to_min, "_col_to_NINF_repl"),
        ("col_to_PINF_repl_preset", col_to_max, "_col_to_PINF_repl"),
        ("col_to_NAN_repl_preset", col_to_median, "_col_to_NAN_repl"),
    ]
    for preset_attr, computed, result_attr in steps:
        preset = getattr(self, preset_attr)
        if preset is not None:
            unknown_keys = set(preset.keys()) - set(X.columns)
            if unknown_keys:
                raise ValueError("Preset dictionary '{}' contain more keys "
                                 "than the column names in X".format(preset_attr))
            # Preset values take precedence over the computed ones.
            computed.update(preset)
        setattr(self, result_attr, computed)

    return self
def fit(self, X, y):
    """
    Build a fresh feature extractor and feature selector from this estimator's parameters,
    extract features from the time series registered via :func:`~set_timeseries_container`,
    attach them to the data sample X (which may already hold manually-designed features),
    impute the result and fit the feature selector on it against the target y. The relevant
    features are stored internally so that only those are extracted in the transform step.

    If filter_only_tsfresh_features is True, only the newly, automatically added features are
    candidates for rejection. If it is False, the features already present in X are evaluated too.

    The per-column max, min and median of the extracted features are stored on the estimator as
    ``col_to_max``, ``col_to_min`` and ``col_to_median``.

    :param X: The data frame without the time series features. The index rows should be present in the
        timeseries and in the target vector.
    :type X: pandas.DataFrame or numpy.array

    :param y: The target vector to define, which features are relevant.
    :type y: pandas.Series or numpy.array

    :return: the fitted estimator with the information, which features are relevant.
    :rtype: RelevantFeatureAugmenter

    :raise RuntimeError: if no time series container has been set.
    """
    if self.timeseries_container is None:
        raise RuntimeError(
            "You have to provide a time series using the set_timeseries_container function before."
        )

    extractor_options = dict(
        default_fc_parameters=self.default_fc_parameters,
        kind_to_fc_parameters=self.kind_to_fc_parameters,
        column_id=self.column_id,
        column_sort=self.column_sort,
        column_kind=self.column_kind,
        column_value=self.column_value,
        timeseries_container=self.timeseries_container,
        chunksize=self.chunksize,
        n_jobs=self.n_jobs,
        show_warnings=self.show_warnings,
        disable_progressbar=self.disable_progressbar,
        profile=self.profile,
        profiling_filename=self.profiling_filename,
        profiling_sorting=self.profiling_sorting,
    )
    self.feature_extractor = FeatureAugmenter(**extractor_options)

    selector_options = dict(
        test_for_binary_target_binary_feature=self.test_for_binary_target_binary_feature,
        test_for_binary_target_real_feature=self.test_for_binary_target_real_feature,
        test_for_real_target_binary_feature=self.test_for_real_target_binary_feature,
        test_for_real_target_real_feature=self.test_for_real_target_real_feature,
        fdr_level=self.fdr_level,
        hypotheses_independent=self.hypotheses_independent,
        n_jobs=self.n_jobs,
        chunksize=self.chunksize,
        ml_task=self.ml_task,
    )
    self.feature_selector = FeatureSelector(**selector_options)

    # When only the tsfresh features are filtered, start from an empty frame that
    # shares X's index, so the old features are not merged in.
    base_frame = pd.DataFrame(index=X.index) if self.filter_only_tsfresh_features else X
    extracted = self.feature_extractor.transform(base_frame)

    ranges = get_range_values_per_column(extracted)
    self.col_to_max, self.col_to_min, self.col_to_median = ranges

    imputed = impute_dataframe_range(
        extracted,
        col_to_max=self.col_to_max,
        col_to_median=self.col_to_median,
        col_to_min=self.col_to_min,
    )

    self.feature_selector.fit(imputed, y)
    return self
def _fit_and_augment(self, X, y): """ Helper for the :func:`~fit` and :func:`~fit_transform` functions, which does most of the work described in :func:`~fit`. :param X: The data frame without the time series features. The index rows should be present in the timeseries and in the target vector. :type X: pandas.DataFrame or numpy.array :param y: The target vector to define, which features are relevant. :type y: pandas.Series or numpy.array :return: a data sample with the extraced time series features. If filter_only_tsfresh_features is False the data sample will also include the information in X. :rtype: pandas.DataFrame """ if self.timeseries_container is None: raise RuntimeError( "You have to provide a time series using the set_timeseries_container function before." ) self.feature_extractor = FeatureAugmenter( default_fc_parameters=self.default_fc_parameters, kind_to_fc_parameters=self.kind_to_fc_parameters, column_id=self.column_id, column_sort=self.column_sort, column_kind=self.column_kind, column_value=self.column_value, timeseries_container=self.timeseries_container, chunksize=self.chunksize, n_jobs=self.n_jobs, show_warnings=self.show_warnings, disable_progressbar=self.disable_progressbar, profile=self.profile, profiling_filename=self.profiling_filename, profiling_sorting=self.profiling_sorting) self.feature_selector = FeatureSelector( test_for_binary_target_binary_feature=self. test_for_binary_target_binary_feature, test_for_binary_target_real_feature=self. test_for_binary_target_real_feature, test_for_real_target_binary_feature=self. test_for_real_target_binary_feature, test_for_real_target_real_feature=self. 
test_for_real_target_real_feature, fdr_level=self.fdr_level, hypotheses_independent=self.hypotheses_independent, n_jobs=self.n_jobs, chunksize=self.chunksize, ml_task=self.ml_task, multiclass=self.multiclass, n_significant=self.n_significant, multiclass_p_values=self.multiclass_p_values, ) if self.filter_only_tsfresh_features: # Do not merge the time series features to the old features X_tmp = pd.DataFrame(index=X.index) else: X_tmp = X X_augmented = self.feature_extractor.transform(X_tmp) self.col_to_max, self.col_to_min, self.col_to_median = get_range_values_per_column( X_augmented) X_augmented = impute_dataframe_range(X_augmented, col_to_max=self.col_to_max, col_to_median=self.col_to_median, col_to_min=self.col_to_min) self.feature_selector.fit(X_augmented, y) return X_augmented