def test_with_dictionaries_two_rows_sorted(self):
    """Dict input without a sort column: the first input row stays first."""
    rows = [{"value": 2, "id": "id_1"}, {"value": 1, "id": "id_1"}]
    frame = pd.DataFrame(rows)
    kind_to_frame = {"a": frame, "b": frame}
    normalize = dataframe_functions.normalize_input_to_internal_representation

    # Value column named explicitly.
    result_dict, column_id, column_value = normalize(kind_to_frame, "id", None, None, "value")
    self.assertEqual(column_value, "value")
    self.assertEqual(column_id, "id")
    first_row = result_dict["a"].iloc[0].to_dict()
    self.assertEqual(first_row, {"value": 2, "id": "id_1"})

    # Value column omitted: the function is expected to pick "value" itself.
    result_dict, column_id, column_value = normalize(kind_to_frame, "id", None, None, None)
    self.assertEqual(column_value, "value")
    self.assertEqual(column_id, "id")
def test_with_dictionaries_one_row(self):
    """Dict input with a single row per kind: error cases and pass-through."""
    frame = pd.DataFrame([{"value": 1, "id": "id_1"}])
    kind_to_frame = {"a": frame, "b": frame}
    normalize = dataframe_functions.normalize_input_to_internal_representation

    # Passing a kind column together with a dict must be rejected.
    with self.assertRaises(ValueError):
        normalize(kind_to_frame, "id", None, "a kind", None)

    # A value column that does not exist must be rejected.
    with self.assertRaises(ValueError):
        normalize(kind_to_frame, "id", None, None, "something other")

    # With valid arguments the data comes back as it went in.
    result_dict, column_id, column_value = normalize(kind_to_frame, "id", None, None, "value")
    self.assertEqual(column_value, "value")
    self.assertEqual(column_id, "id")
    six.assertCountEqual(self, list(kind_to_frame.keys()), list(result_dict.keys()))
    first_row = result_dict["a"].iloc[0].to_dict()
    self.assertEqual(first_row, {"value": 1, "id": "id_1"})

    # Value column omitted: the function is expected to pick "value" itself.
    result_dict, column_id, column_value = normalize(kind_to_frame, "id", None, None, None)
    self.assertEqual(column_value, "value")
    self.assertEqual(column_id, "id")
def test_with_df(self):
    """DataFrame input with all, some, and none of the optional column names given."""
    normalize = dataframe_functions.normalize_input_to_internal_representation

    def entries_for_kind(frame, kind_column, kind, column):
        # Entries of ``column`` restricted to the rows belonging to ``kind``.
        return list(frame[frame[kind_column] == kind][column])

    # Case 1: id, sort, kind and value columns are all named.
    full_df = pd.DataFrame([{"id": 0, "kind": "a", "value": 3, "sort": 1}])
    result_df, column_id, column_kind, column_value = normalize(full_df, "id", "sort", "kind", "value")

    self.assertEqual(column_id, "id")
    self.assertEqual(column_value, "value")
    self.assertEqual(column_kind, "kind")
    self.assertIn("a", set(result_df[column_kind]))
    six.assertCountEqual(self, list(result_df.columns), ["id", "value", "kind"])
    self.assertEqual(entries_for_kind(result_df, column_kind, "a", "value"), [3])
    self.assertEqual(entries_for_kind(result_df, column_kind, "a", "id"), [0])

    # Case 2: no kind column -> a "_variables" column holding "feature" is expected.
    no_kind_df = pd.DataFrame([{"id": 0, "value": 3, "sort": 1}])
    result_df, column_id, column_kind, column_value = normalize(no_kind_df, "id", "sort", None, "value")

    self.assertEqual(column_id, "id")
    self.assertEqual(column_value, "value")
    self.assertEqual(column_kind, "_variables")
    self.assertIn("feature", set(result_df[column_kind]))
    six.assertCountEqual(self, list(result_df.columns), ["id", "value", "_variables"])
    self.assertEqual(entries_for_kind(result_df, column_kind, "feature", "value"), [3])
    self.assertEqual(entries_for_kind(result_df, column_kind, "feature", "id"), [0])

    # Case 3: neither kind nor value column -> every remaining column becomes a kind.
    wide_df = pd.DataFrame([{"id": 0, "a": 3, "b": 5, "sort": 1}])
    result_df, column_id, column_kind, column_value = normalize(wide_df, "id", "sort", None, None)

    self.assertEqual(column_id, "id")
    self.assertEqual(column_value, "_values")
    self.assertEqual(column_kind, "_variables")
    self.assertIn("a", set(result_df[column_kind]))
    self.assertIn("b", set(result_df[column_kind]))
    six.assertCountEqual(self, list(result_df.columns), ["_values", "_variables", "id"])
    self.assertEqual(entries_for_kind(result_df, column_kind, "a", "_values"), [3])
    self.assertEqual(entries_for_kind(result_df, column_kind, "a", "id"), [0])
    self.assertEqual(entries_for_kind(result_df, column_kind, "b", "_values"), [5])
    self.assertEqual(entries_for_kind(result_df, column_kind, "b", "id"), [0])
def test_with_dictionaries_two_rows(self):
    """Dict input with a sort column: rows are ordered and the sort column is dropped."""
    rows = [
        {"value": 2, "sort": 2, "id": "id_1"},
        {"value": 1, "sort": 1, "id": "id_1"},
    ]
    frame = pd.DataFrame(rows)
    kind_to_frame = {"a": frame, "b": frame}
    normalize = dataframe_functions.normalize_input_to_internal_representation

    # Without a sort column two candidate value columns remain ("value" and
    # "sort"), so the value column cannot be guessed.
    with self.assertRaises(ValueError):
        normalize(kind_to_frame, "id", None, None, None)

    # With the sort column named, sorting is applied.
    result_dict, column_id, column_value = normalize(kind_to_frame, "id", "sort", None, "value")
    self.assertEqual(column_value, "value")
    self.assertEqual(column_id, "id")

    # Rows come back ordered by "sort", and the sort column itself is gone.
    sorted_frame = result_dict["a"]
    self.assertEqual(sorted_frame.iloc[0].to_dict(), {"value": 1, "id": "id_1"})
    self.assertEqual(sorted_frame.iloc[1].to_dict(), {"value": 2, "id": "id_1"})

    # With the sort column named, the value column can be found automatically.
    result_dict, column_id, column_value = normalize(kind_to_frame, "id", "sort", None, None)
    self.assertEqual(column_value, "value")
    self.assertEqual(column_id, "id")
def test_with_dictionaries_two_rows_sorted(self):
    """Dict input is turned into one frame whose kind column holds the dict keys."""
    rows = [{"value": 2, "id": "id_1"}, {"value": 1, "id": "id_1"}]
    frame = pd.DataFrame(rows)
    kind_to_frame = {"a": frame, "b": frame}

    # Value column named explicitly.
    normalized = dataframe_functions.normalize_input_to_internal_representation(
        kind_to_frame, "id", None, None, "value")
    result_df, column_id, column_kind, column_value = normalized

    self.assertEqual(column_value, "value")
    self.assertEqual(column_id, "id")

    # The first row of kind "a" is the first input row, tagged with its kind.
    rows_of_a = result_df[result_df[column_kind] == "a"]
    self.assertEqual(rows_of_a.iloc[0].to_dict(),
                     {"_variables": "a", "value": 2, "id": "id_1"})
def extract_features(timeseries_container, feature_extraction_settings=None,
                     column_id=None, column_sort=None, column_kind=None, column_value=None):
    """
    Extract features from

    * a :class:`pandas.DataFrame` containing the different time series

    or

    * a dictionary of :class:`pandas.DataFrame` each containing one type of time series

    In both cases a :class:`pandas.DataFrame` with the calculated features will be returned.

    For a list of all the calculated time series features, please see the
    :class:`~tsfresh.feature_extraction.settings.FeatureExtractionSettings` class,
    which is used to control which features with which parameters are calculated.

    For a detailed explanation of the different parameters and data formats please see :ref:`data-formats-label`.

    The per-kind calculations are distributed over a :class:`multiprocessing.Pool` with
    ``feature_extraction_settings.n_processes`` workers. The pool is always shut down before
    this function returns or raises.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features
    >>> df, _ = load_robot_execution_failures()
    >>> X = extract_features(df, column_id='id', column_sort='time')

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for, or a
            dictionary of pandas.DataFrames.
    :type timeseries_container: pandas.DataFrame or dict

    :param feature_extraction_settings: settings object that controls which features are calculated.
            If None, a default :class:`FeatureExtractionSettings` is created and its default
            parameters are set for every kind found in the input.
    :type feature_extraction_settings: tsfresh.feature_extraction.settings.FeatureExtractionSettings

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_sort: The name of the sort column.
    :type column_sort: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :return: The (maybe imputed) DataFrame with the extracted features, indexed by id and cast to float64.
    :rtype: pandas.DataFrame
    """

    # Always use the standardized way of storing the data.
    # See the function normalize_input_to_internal_representation for more information.
    kind_to_df_map, column_id, column_value = \
        dataframe_functions.normalize_input_to_internal_representation(timeseries_container, column_id,
                                                                       column_sort, column_kind, column_value)

    # Use the standard setting if the user did not supply ones himself.
    if feature_extraction_settings is None:
        feature_extraction_settings = FeatureExtractionSettings()
        for key in kind_to_df_map:
            feature_extraction_settings.set_default_parameters(key)

    # If requested, do profiling (advanced feature)
    if feature_extraction_settings.PROFILING:
        profiler = profiling.start_profiling()

    # Collect every id occurring in any kind, so that ids missing for some kinds still get a row.
    all_possible_unique_id_values = set(id_value
                                        for kind, df in kind_to_df_map.items()
                                        for id_value in df[column_id])
    df_with_ids = pd.DataFrame(index=all_possible_unique_id_values)

    partial_extract_features_for_one_time_series = partial(_extract_features_for_one_time_series,
                                                           column_id=column_id,
                                                           column_value=column_value,
                                                           settings=feature_extraction_settings)

    # Extract the time series features for every type of time series in parallel.
    # try/finally guarantees the worker processes are released even if a calculator raises.
    pool = Pool(feature_extraction_settings.n_processes)
    try:
        extracted_features = pool.map(partial_extract_features_for_one_time_series, kind_to_df_map.items())
    finally:
        pool.close()
        pool.join()

    # Add time series features to result. Reindexing onto the id frame is the replacement for
    # the ``join_axes`` argument of ``pd.concat``, which newer pandas versions no longer accept.
    result = pd.concat([df_with_ids] + extracted_features, axis=1, join='outer') \
        .reindex(df_with_ids.index) \
        .astype(np.float64)

    # Impute the result if requested. The imputer is called on the frame; its return value is not used.
    if feature_extraction_settings.IMPUTE is not None:
        feature_extraction_settings.IMPUTE(result)

    # Turn off profiling if it was turned on
    if feature_extraction_settings.PROFILING:
        profiling.end_profiling(profiler, filename=feature_extraction_settings.PROFILING_FILENAME,
                                sorting=feature_extraction_settings.PROFILING_SORTING)

    return result
def extract_features(timeseries_container, default_fc_parameters=None,
                     kind_to_fc_parameters=None,
                     column_id=None, column_sort=None, column_kind=None, column_value=None,
                     parallelization=None, chunksize=defaults.CHUNKSIZE,
                     n_processes=defaults.N_PROCESSES, show_warnings=defaults.SHOW_WARNINGS,
                     disable_progressbar=defaults.DISABLE_PROGRESSBAR,
                     impute_function=defaults.IMPUTE_FUNCTION,
                     profile=defaults.PROFILING,
                     profiling_filename=defaults.PROFILING_FILENAME,
                     profiling_sorting=defaults.PROFILING_SORTING):
    """
    Extract features from

    * a :class:`pandas.DataFrame` containing the different time series

    or

    * a dictionary of :class:`pandas.DataFrame` each containing one type of time series

    In both cases a :class:`pandas.DataFrame` with the calculated features will be returned.

    For a list of all the calculated time series features, please see the
    :class:`~tsfresh.feature_extraction.settings.ComprehensiveFCParameters` class,
    which is used to control which features with which parameters are calculated.

    For a detailed explanation of the different parameters and data formats please see :ref:`data-formats-label`.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features
    >>> df, _ = load_robot_execution_failures()
    >>> X = extract_features(df, column_id='id', column_sort='time')

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for, or a
            dictionary of pandas.DataFrames.
    :type timeseries_container: pandas.DataFrame or dict

    :param default_fc_parameters: mapping from feature calculator names to parameters. Only those names
           which are keys in this dict will be calculated. See the class:`ComprehensiveFCParameters` for
           more information. If None, a :class:`ComprehensiveFCParameters` instance is used.
    :type default_fc_parameters: dict

    :param kind_to_fc_parameters: mapping from kind names to objects of the same type as the ones for
            default_fc_parameters. If you put a kind as a key here, the fc_parameters
            object (which is the value), will be used instead of the default_fc_parameters.
    :type kind_to_fc_parameters: dict

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_sort: The name of the sort column.
    :type column_sort: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param parallelization: One of ``'per_sample'``, ``'per_kind'`` or ``'serial'``, see
            :func:`~tsfresh.feature_extraction.extraction._extract_features_parallel_per_sample`,
            :func:`~tsfresh.feature_extraction.extraction._extract_features_parallel_per_kind` and
            :ref:`parallelization-label` for details. ``'serial'`` runs the per-kind extraction with
            ``serial=True``. Choosing None makes the algorithm look for the best parallelization
            technique by applying some general assumptions (``'per_sample'`` if half the number of
            processes exceeds the number of kinds, ``'per_kind'`` otherwise).
    :type parallelization: str

    :param chunksize: The size of one chunk for the parallelisation
    :type chunksize: None or int

    :param n_processes: The number of processes to use for parallelisation.
    :type n_processes: int

    :param show_warnings: Show warnings during the feature extraction (needed for debugging of calculators).
    :type show_warnings: bool

    :param disable_progressbar: Do not show a progressbar while doing the calculation.
    :type disable_progressbar: bool

    :param impute_function: None, if no imputing should happen or the function to call for imputing.
    :type impute_function: None or function

    :param profile: Turn on profiling during feature extraction
    :type profile: bool

    :param profiling_sorting: How to sort the profiling results (see the documentation of the profiling package for
           more information)
    :type profiling_sorting: basestring

    :param profiling_filename: Where to save the profiling results.
    :type profiling_filename: basestring

    :return: The (maybe imputed) DataFrame containing extracted features.
    :rtype: pandas.DataFrame

    :raises ValueError: if ``parallelization`` is not None and not one of the supported values.
    """
    import logging
    logging.basicConfig()

    # Always use the standardized way of storing the data.
    # See the function normalize_input_to_internal_representation for more information.
    kind_to_df_map, column_id, column_value = \
        dataframe_functions.normalize_input_to_internal_representation(df_or_dict=timeseries_container,
                                                                       column_id=column_id,
                                                                       column_sort=column_sort,
                                                                       column_kind=column_kind,
                                                                       column_value=column_value)

    # Use the standard setting if the user did not supply ones himself.
    if default_fc_parameters is None:
        default_fc_parameters = ComprehensiveFCParameters()

    # Choose the parallelization according to a rule-of-thumb
    if parallelization is None:
        parallelization = 'per_sample' if n_processes / 2 > len(kind_to_df_map) else 'per_kind'

    _logger.info('Parallelizing feature calculation {}'.format(parallelization))

    # Resolve the strategy before profiling starts, so that an invalid argument
    # is rejected without leaving a started profiler behind.
    if parallelization == 'per_kind':
        calculation_function = _extract_features_per_kind
    elif parallelization == 'per_sample':
        calculation_function = _extract_features_parallel_per_sample
    elif parallelization == 'serial':
        calculation_function = partial(_extract_features_per_kind, serial=True)
    else:
        raise ValueError("Argument parallelization must be one of: 'per_kind', 'per_sample', 'serial'")

    # If requested, do profiling (advanced feature)
    if profile:
        profiler = profiling.start_profiling()

    # Calculate the result
    result = calculation_function(kind_to_df_map,
                                  default_fc_parameters=default_fc_parameters,
                                  kind_to_fc_parameters=kind_to_fc_parameters,
                                  column_id=column_id,
                                  column_value=column_value,
                                  chunksize=chunksize,
                                  n_processes=n_processes,
                                  show_warnings=show_warnings,
                                  disable_progressbar=disable_progressbar,
                                  impute_function=impute_function
                                  )

    # Turn off profiling if it was turned on
    if profile:
        profiling.end_profiling(profiler, filename=profiling_filename,
                                sorting=profiling_sorting)

    return result
def extract_features(timeseries_container, feature_extraction_settings=None,
                     column_id=None, column_sort=None, column_kind=None, column_value=None,
                     parallelization=None):
    """
    Extract features from

    * a :class:`pandas.DataFrame` containing the different time series

    or

    * a dictionary of :class:`pandas.DataFrame` each containing one type of time series

    In both cases a :class:`pandas.DataFrame` with the calculated features will be returned.

    For a list of all the calculated time series features, please see the
    :class:`~tsfresh.feature_extraction.settings.FeatureExtractionSettings` class,
    which is used to control which features with which parameters are calculated.

    For a detailed explanation of the different parameters and data formats please see :ref:`data-formats-label`.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features
    >>> df, _ = load_robot_execution_failures()
    >>> X = extract_features(df, column_id='id', column_sort='time')

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for, or a
            dictionary of pandas.DataFrames.
    :type timeseries_container: pandas.DataFrame or dict

    :param feature_extraction_settings: settings object that controls which features are calculated.
            If None, a default :class:`FeatureExtractionSettings` is created and its default
            parameters are set for every kind found in the input.
    :type feature_extraction_settings: tsfresh.feature_extraction.settings.FeatureExtractionSettings

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_sort: The name of the sort column.
    :type column_sort: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param parallelization: One of ``'per_sample'``, ``'per_kind'`` or ``'no_parallelization'``, see
            :func:`~tsfresh.feature_extraction.extraction._extract_features_parallel_per_sample`,
            :func:`~tsfresh.feature_extraction.extraction._extract_features_parallel_per_kind` and
            :ref:`parallelization-label` for details. ``'no_parallelization'`` runs the per-kind
            extraction with ``serial=True``. If None, ``'per_sample'`` is chosen when half of
            ``feature_extraction_settings.n_processes`` exceeds the number of kinds, ``'per_kind'``
            otherwise.
    :type parallelization: str

    :return: The (maybe imputed) DataFrame containing extracted features.
    :rtype: pandas.DataFrame

    :raises ValueError: if ``parallelization`` is not None and not one of the supported values.
    """
    import logging
    logging.basicConfig()

    # Always use the standardized way of storing the data.
    # See the function normalize_input_to_internal_representation for more information.
    kind_to_df_map, column_id, column_value = \
        dataframe_functions.normalize_input_to_internal_representation(df_or_dict=timeseries_container,
                                                                       column_id=column_id,
                                                                       column_sort=column_sort,
                                                                       column_kind=column_kind,
                                                                       column_value=column_value)

    # Use the standard setting if the user did not supply ones himself.
    if feature_extraction_settings is None:
        feature_extraction_settings = FeatureExtractionSettings()
        for key in kind_to_df_map:
            feature_extraction_settings.set_default_parameters(key)

    # Choose the parallelization according to a rule-of-thumb
    if parallelization is None:
        parallelization = 'per_sample' if (feature_extraction_settings.n_processes / 2) > len(kind_to_df_map) \
            else 'per_kind'

    _logger.info('Parallelizing feature calculation {}'.format(parallelization))

    # Resolve the strategy before profiling starts, so that an invalid argument
    # is rejected without leaving a started profiler behind.
    extra_kwargs = {}
    if parallelization == 'per_kind':
        calculation_function = _extract_features_per_kind
    elif parallelization == 'per_sample':
        calculation_function = _extract_features_parallel_per_sample
    elif parallelization == 'no_parallelization':
        calculation_function = _extract_features_per_kind
        extra_kwargs = {'serial': True}
    else:
        raise ValueError("Argument parallelization must be one of: 'per_kind', 'per_sample', "
                         "'no_parallelization'")

    # If requested, do profiling (advanced feature)
    if feature_extraction_settings.PROFILING:
        profiler = profiling.start_profiling()

    # Calculate the result
    result = calculation_function(kind_to_df_map, feature_extraction_settings,
                                  column_id, column_value, **extra_kwargs)

    # Turn off profiling if it was turned on
    if feature_extraction_settings.PROFILING:
        profiling.end_profiling(profiler, filename=feature_extraction_settings.PROFILING_FILENAME,
                                sorting=feature_extraction_settings.PROFILING_SORTING)

    return result
def extract_features(timeseries_container, default_fc_parameters=None,
                     kind_to_fc_parameters=None,
                     column_id=None, column_sort=None, column_kind=None, column_value=None,
                     chunksize=defaults.CHUNKSIZE,
                     n_jobs=defaults.N_PROCESSES, show_warnings=defaults.SHOW_WARNINGS,
                     disable_progressbar=defaults.DISABLE_PROGRESSBAR,
                     impute_function=defaults.IMPUTE_FUNCTION,
                     profile=defaults.PROFILING,
                     profiling_filename=defaults.PROFILING_FILENAME,
                     profiling_sorting=defaults.PROFILING_SORTING):
    """
    Calculate time series features for

    * a :class:`pandas.DataFrame` holding the different time series

    or

    * a dictionary of :class:`pandas.DataFrame`, one per type of time series.

    Either way the result is a :class:`pandas.DataFrame` of calculated features.

    Which features are calculated, and with which parameters, is controlled through objects like
    :class:`~tsfresh.feature_extraction.settings.ComprehensiveFCParameters`; see that class for
    the list of all calculated features.

    The accepted data formats and column parameters are explained in :ref:`data-formats-label`.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features
    >>> df, _ = load_robot_execution_failures()
    >>> X = extract_features(df, column_id='id', column_sort='time')

    :param timeseries_container: The time series to compute features for, as a single
            pandas.DataFrame or as a dictionary of pandas.DataFrames.
    :type timeseries_container: pandas.DataFrame or dict

    :param default_fc_parameters: mapping from feature calculator names to parameters; only the
            calculators named as keys are run. Defaults to a :class:`ComprehensiveFCParameters`
            instance when None.
    :type default_fc_parameters: dict

    :param kind_to_fc_parameters: mapping from kind names to objects of the same type as
            default_fc_parameters. For a kind listed here, its value is used instead of
            default_fc_parameters.
    :type kind_to_fc_parameters: dict

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_sort: The name of the sort column.
    :type column_sort: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param n_jobs: The number of processes to use for parallelization. If zero, no parallelization is used.
    :type n_jobs: int

    :param chunksize: The size of one chunk for the parallelisation
    :type chunksize: None or int

    :param show_warnings: Show warnings during the feature extraction (needed for debugging of calculators).
    :type show_warnings: bool

    :param disable_progressbar: Do not show a progressbar while doing the calculation.
    :type disable_progressbar: bool

    :param impute_function: None, if no imputing should happen or the function to call for imputing.
    :type impute_function: None or function

    :param profile: Turn on profiling during feature extraction
    :type profile: bool

    :param profiling_sorting: How to sort the profiling results (see the documentation of the profiling
            package for more information)
    :type profiling_sorting: basestring

    :param profiling_filename: Where to save the profiling results.
    :type profiling_filename: basestring

    :return: The (maybe imputed) DataFrame containing extracted features.
    :rtype: pandas.DataFrame
    """
    import logging
    logging.basicConfig()

    # Bring the input into the standardized internal (melted) layout first.
    # See normalize_input_to_internal_representation for details.
    normalized = dataframe_functions.normalize_input_to_internal_representation(
        timeseries_container=timeseries_container,
        column_id=column_id, column_kind=column_kind,
        column_sort=column_sort,
        column_value=column_value)
    df_melt, column_id, column_kind, column_value = normalized

    # Fall back to the comprehensive parameter set when the caller gave none.
    if default_fc_parameters is None:
        default_fc_parameters = ComprehensiveFCParameters()

    # Optional profiling (advanced feature).
    if profile:
        profiler = profiling.start_profiling()

    with warnings.catch_warnings():
        # The filter change is scoped to this block and undone on exit.
        warnings.simplefilter("default" if show_warnings else "ignore")

        result = _do_extraction(df=df_melt,
                                column_id=column_id, column_value=column_value, column_kind=column_kind,
                                n_jobs=n_jobs, chunksize=chunksize,
                                disable_progressbar=disable_progressbar,
                                default_fc_parameters=default_fc_parameters,
                                kind_to_fc_parameters=kind_to_fc_parameters)

        # Give the result index the same dtype as the id column of the input.
        id_dtype = df_melt[column_id].dtype
        result.index = result.index.astype(id_dtype)

        # The imputer is called on the frame; its return value is not used.
        if impute_function is not None:
            impute_function(result)

    # Stop profiling again if it was started above.
    if profile:
        profiling.end_profiling(profiler, filename=profiling_filename,
                                sorting=profiling_sorting)

    return result