class FeatureAugmenterTestCase(DataTestCase):
    """Tests for :class:`FeatureAugmenter` restricted to the ``length`` feature of the kinds "a" and "b"."""

    def setUp(self):
        self.test_df = self.create_test_data_sample()

        # Only calculate the "length" feature for the two kinds "a" and "b",
        # so the transformed frame has exactly two additional columns.
        self.settings = FeatureExtractionSettings()
        self.settings.set_default_parameters("a")
        calculation_settings_mapping = {
            "length": self.settings.kind_to_calculation_settings_mapping["a"]["length"]
        }
        self.settings.kind_to_calculation_settings_mapping = {
            "a": calculation_settings_mapping.copy(),
            "b": calculation_settings_mapping.copy()
        }

    def _create_augmenter(self):
        """Build the augmenter with the column names and settings shared by all tests."""
        return FeatureAugmenter(column_value="val", column_id="id", column_sort="sort",
                                column_kind="kind", settings=self.settings)

    def _assert_features_not_nan(self, X_transformed):
        """Assert that none of the calculated length features is NaN."""
        for _, row in X_transformed.iterrows():
            self.assertFalse(np.isnan(row["a__length"]))
            self.assertFalse(np.isnan(row["b__length"]))

    def test_fit_and_transform(self):
        augmenter = self._create_augmenter()

        # Fit should do nothing and hand back the augmenter itself
        returned_augmenter = augmenter.fit()
        self.assertEqual(returned_augmenter, augmenter)

        # Transforming without a time series container is an error
        self.assertRaises(RuntimeError, augmenter.transform, None)

        augmenter.set_timeseries_container(self.test_df)

        # Add features to all time series
        X_with_index = pd.DataFrame([{"feature_1": 1}] * 2, index=[1, 5])
        X_transformed = augmenter.transform(X_with_index)

        # Require same shape
        for i in X_transformed.index:
            self.assertIn(i, X_with_index.index)

        for i in X_with_index.index:
            self.assertIn(i, X_transformed.index)

        self.assertEqual(X_transformed.shape, (2, 3))

        # Preserve old features
        self.assertEqual(list(X_transformed.columns), ["feature_1", "a__length", "b__length"])

        # Features are not allowed to be NaN
        self._assert_features_not_nan(X_transformed)

    def test_add_features_to_only_a_part(self):
        augmenter = self._create_augmenter()
        augmenter.set_timeseries_container(self.test_df)

        X_with_not_all_ids = pd.DataFrame([{"feature_1": 1}], index=[1])
        X_transformed = augmenter.transform(X_with_not_all_ids)

        for i in X_transformed.index:
            self.assertIn(i, X_with_not_all_ids.index)

        for i in X_with_not_all_ids.index:
            self.assertIn(i, X_transformed.index)

        self.assertEqual(X_transformed.shape, (1, 3))
        # Compare plain lists: comparing a pandas Index with a list yields an
        # element-wise boolean array, whose truth value is only defined for a
        # single element.
        self.assertEqual(list(X_transformed.index), [1])

        # Features are not allowed to be NaN
        self._assert_features_not_nan(X_transformed)
def extract_features(timeseries_container, feature_extraction_settings=None,
                     column_id=None, column_sort=None, column_kind=None, column_value=None):
    """
    Extract features from

    * a :class:`pandas.DataFrame` containing the different time series

    or

    * a dictionary of :class:`pandas.DataFrame` each containing one type of time series

    In both cases a :class:`pandas.DataFrame` with the calculated features will be returned.

    For a list of all the calculated time series features, please see the
    :class:`~tsfresh.feature_extraction.settings.FeatureExtractionSettings` class,
    which is used to control which features with which parameters are calculated.

    For a detailed explanation of the different parameters and data formats please see :ref:`data-formats-label`.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features
    >>> df, _ = load_robot_execution_failures()
    >>> X = extract_features(df, column_id='id', column_sort='time')

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for, or a
            dictionary of pandas.DataFrames.
    :type timeseries_container: pandas.DataFrame or dict

    :param feature_extraction_settings: settings object that controls which features are calculated. If None,
            a fresh settings object with the default parameters for every kind is used.
    :type feature_extraction_settings: tsfresh.feature_extraction.settings.FeatureExtractionSettings

    :param column_id: The name of the id column to group by.
    :type column_id: str
    :param column_sort: The name of the sort column.
    :type column_sort: str
    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str
    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :return: The (maybe imputed) DataFrame with the extracted features.
    :rtype: pandas.DataFrame
    """
    # Always use the standardized way of storing the data.
    # See the function normalize_input_to_internal_representation for more information.
    kind_to_df_map, column_id, column_value = \
        dataframe_functions.normalize_input_to_internal_representation(timeseries_container, column_id, column_sort,
                                                                       column_kind, column_value)

    # Use the standard setting if the user did not supply ones himself.
    if feature_extraction_settings is None:
        feature_extraction_settings = FeatureExtractionSettings()
        for key in kind_to_df_map:
            feature_extraction_settings.set_default_parameters(key)

    # If requested, do profiling (advanced feature)
    if feature_extraction_settings.PROFILING:
        profiler = profiling.start_profiling()

    # Collect every id that occurs in any kind of time series; the empty frame built from
    # them fixes the row index of the final result.
    all_possible_unique_id_values = {id_value
                                     for df in kind_to_df_map.values()
                                     for id_value in df[column_id]}
    df_with_ids = pd.DataFrame(index=list(all_possible_unique_id_values))

    partial_extract_features_for_one_time_series = partial(_extract_features_for_one_time_series,
                                                           column_id=column_id,
                                                           column_value=column_value,
                                                           settings=feature_extraction_settings)

    # Extract the time series features for every type of time series in parallel.
    # The pool is always shut down again, so no worker processes are leaked - not even
    # when one of the feature calculations raises.
    pool = Pool(feature_extraction_settings.n_processes)
    try:
        extracted_features = pool.map(partial_extract_features_for_one_time_series,
                                      list(kind_to_df_map.items()))
    finally:
        pool.close()
        pool.join()

    # Add time series features to result. Reindexing on the id frame pins the row
    # order (this replaces the ``join_axes`` argument, which newer pandas versions dropped).
    result = pd.concat([df_with_ids] + extracted_features, axis=1, join='outer') \
        .reindex(df_with_ids.index) \
        .astype(np.float64)

    # Impute the result if requested (the return value of the impute function is not used)
    if feature_extraction_settings.IMPUTE is not None:
        feature_extraction_settings.IMPUTE(result)

    # Turn off profiling if it was turned on
    if feature_extraction_settings.PROFILING:
        profiling.end_profiling(profiler, filename=feature_extraction_settings.PROFILING_FILENAME,
                                sorting=feature_extraction_settings.PROFILING_SORTING)

    return result
def extract_features(timeseries_container, feature_extraction_settings=None,
                     column_id=None, column_sort=None, column_kind=None, column_value=None,
                     parallelization=None):
    """
    Extract features from

    * a :class:`pandas.DataFrame` containing the different time series

    or

    * a dictionary of :class:`pandas.DataFrame` each containing one type of time series

    In both cases a :class:`pandas.DataFrame` with the calculated features will be returned.

    For a list of all the calculated time series features, please see the
    :class:`~tsfresh.feature_extraction.settings.FeatureExtractionSettings` class,
    which is used to control which features with which parameters are calculated.

    For a detailed explanation of the different parameters and data formats please see :ref:`data-formats-label`.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features
    >>> df, _ = load_robot_execution_failures()
    >>> X = extract_features(df, column_id='id', column_sort='time')

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for, or a
            dictionary of pandas.DataFrames.
    :type timeseries_container: pandas.DataFrame or dict

    :param feature_extraction_settings: settings object that controls which features are calculated. If None,
            a fresh settings object with the default parameters for every kind is used.
    :type feature_extraction_settings: tsfresh.feature_extraction.settings.FeatureExtractionSettings

    :param column_id: The name of the id column to group by.
    :type column_id: str
    :param column_sort: The name of the sort column.
    :type column_sort: str
    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str
    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param parallelization: One of ``'per_sample'``, ``'per_kind'`` or ``'no_parallelization'``. The latter runs
            the per-kind extraction with ``serial=True``. If None, ``'per_sample'`` is chosen when half of the
            configured number of processes exceeds the number of kinds, otherwise ``'per_kind'``. See
            :func:`~tsfresh.feature_extraction.extraction._extract_features_parallel_per_sample`,
            :func:`~tsfresh.feature_extraction.extraction._extract_features_parallel_per_kind` and
            :ref:`parallelization-label` for details.
    :type parallelization: str

    :return: The (maybe imputed) DataFrame containing extracted features.
    :rtype: pandas.DataFrame

    :raise ValueError: if ``parallelization`` is none of the supported values.
    """
    import logging
    logging.basicConfig()

    # Always use the standardized way of storing the data.
    # See the function normalize_input_to_internal_representation for more information.
    kind_to_df_map, column_id, column_value = \
        dataframe_functions.normalize_input_to_internal_representation(df_or_dict=timeseries_container,
                                                                       column_id=column_id,
                                                                       column_sort=column_sort,
                                                                       column_kind=column_kind,
                                                                       column_value=column_value)

    # Use the standard setting if the user did not supply ones himself.
    if feature_extraction_settings is None:
        feature_extraction_settings = FeatureExtractionSettings()
        for key in kind_to_df_map:
            feature_extraction_settings.set_default_parameters(key)

    if parallelization is None:
        # Choose the parallelization according to a rule-of-thumb
        parallelization = 'per_sample' if (feature_extraction_settings.n_processes / 2) > len(kind_to_df_map) \
            else 'per_kind'
    elif parallelization not in ('per_kind', 'per_sample', 'no_parallelization'):
        # Reject bad input before the profiler is started, so a started profiler
        # is never left behind without being ended.
        raise ValueError("Argument parallelization must be one of: "
                         "'per_kind', 'per_sample', 'no_parallelization'")

    _logger.info('Parallelizing feature calculation %s', parallelization)

    # If requested, do profiling (advanced feature)
    if feature_extraction_settings.PROFILING:
        profiler = profiling.start_profiling()

    # Calculate the result
    if parallelization == 'per_kind':
        result = _extract_features_per_kind(kind_to_df_map, feature_extraction_settings,
                                            column_id, column_value)
    elif parallelization == 'per_sample':
        result = _extract_features_parallel_per_sample(kind_to_df_map, feature_extraction_settings,
                                                       column_id, column_value)
    else:
        # 'no_parallelization': same code path as 'per_kind', but executed serially
        result = _extract_features_per_kind(kind_to_df_map, feature_extraction_settings,
                                            column_id, column_value, serial=True)

    # Turn off profiling if it was turned on
    if feature_extraction_settings.PROFILING:
        profiling.end_profiling(profiler, filename=feature_extraction_settings.PROFILING_FILENAME,
                                sorting=feature_extraction_settings.PROFILING_SORTING)

    return result