Exemplo n.º 1
0
    def test_with_dictionaries_two_rows_sorted(self):
        """Normalize a dict of two-row frames without a sort column and check names and row order."""
        rows = [{"value": 2, "id": "id_1"}, {"value": 1, "id": "id_1"}]
        test_df = pd.DataFrame(rows)
        test_dict = {"a": test_df, "b": test_df}
        normalize = dataframe_functions.normalize_input_to_internal_representation

        # Explicit id and value column: both names must come back unchanged.
        result_dict, column_id, column_value = normalize(test_dict, "id", None, None, "value")
        self.assertEqual(column_value, "value")
        self.assertEqual(column_id, "id")

        # No sort column was passed, so the first input row is expected to stay first.
        first_row = result_dict["a"].iloc[0].to_dict()
        self.assertEqual(first_row, {"value": 2, "id": "id_1"})

        # Value column left out: the only non-id column has to be picked up.
        result_dict, column_id, column_value = normalize(test_dict, "id", None, None, None)
        self.assertEqual(column_value, "value")
        self.assertEqual(column_id, "id")
Exemplo n.º 2
0
    def test_with_dictionaries_one_row(self):
        """Check argument validation and pass-through for a dict of single-row frames."""
        test_df = pd.DataFrame([{"value": 1, "id": "id_1"}])
        test_dict = {"a": test_df, "b": test_df}
        normalize = dataframe_functions.normalize_input_to_internal_representation

        # Passing a kind column together with a dict has to be rejected.
        with self.assertRaises(ValueError):
            normalize(test_dict, "id", None, "a kind", None)

        # A value column that does not exist in the frames has to be rejected.
        with self.assertRaises(ValueError):
            normalize(test_dict, "id", None, None, "something other")

        # With valid arguments the data must come back as it went in.
        result_dict, column_id, column_value = normalize(test_dict, "id", None, None, "value")
        self.assertEqual(column_value, "value")
        self.assertEqual(column_id, "id")
        expected_kinds = list(test_dict.keys())
        actual_kinds = list(result_dict.keys())
        six.assertCountEqual(self, expected_kinds, actual_kinds)
        first_row = result_dict["a"].iloc[0].to_dict()
        self.assertEqual(first_row, {"value": 1, "id": "id_1"})

        # Without a value column the single remaining column has to be chosen.
        result_dict, column_id, column_value = normalize(test_dict, "id", None, None, None)
        self.assertEqual(column_value, "value")
        self.assertEqual(column_id, "id")
Exemplo n.º 3
0
    def test_with_df(self):
        """Normalize a flat DataFrame while giving fewer and fewer column names."""
        normalize = dataframe_functions.normalize_input_to_internal_representation

        def select(frame, kind_column, kind, column):
            # Entries of ``column`` in the rows whose ``kind_column`` equals ``kind``.
            return list(frame[frame[kind_column] == kind][column])

        # Give everything: id, sort, kind and value column.
        test_df = pd.DataFrame([{"id": 0, "kind": "a", "value": 3, "sort": 1}])
        result_df, column_id, column_kind, column_value = normalize(test_df, "id", "sort", "kind", "value")

        self.assertEqual(column_id, "id")
        self.assertEqual(column_value, "value")
        self.assertEqual(column_kind, "kind")
        self.assertIn("a", set(result_df[column_kind]))
        # The sort column is expected to be gone from the result.
        six.assertCountEqual(self, list(result_df.columns), ["id", "value", "kind"])
        self.assertEqual(select(result_df, column_kind, "a", "value"), [3])
        self.assertEqual(select(result_df, column_kind, "a", "id"), [0])

        # Give no kind: a "_variables" column holding "feature" is expected.
        test_df = pd.DataFrame([{"id": 0, "value": 3, "sort": 1}])
        result_df, column_id, column_kind, column_value = normalize(test_df, "id", "sort", None, "value")

        self.assertEqual(column_id, "id")
        self.assertEqual(column_value, "value")
        self.assertEqual(column_kind, "_variables")
        self.assertIn("feature", set(result_df[column_kind]))
        six.assertCountEqual(self, list(result_df.columns), ["id", "value", "_variables"])
        self.assertEqual(select(result_df, column_kind, "feature", "value"), [3])
        self.assertEqual(select(result_df, column_kind, "feature", "id"), [0])

        # Give neither kind nor value: every remaining column becomes its own kind.
        test_df = pd.DataFrame([{"id": 0, "a": 3, "b": 5, "sort": 1}])
        result_df, column_id, column_kind, column_value = normalize(test_df, "id", "sort", None, None)

        self.assertEqual(column_id, "id")
        self.assertEqual(column_value, "_values")
        self.assertEqual(column_kind, "_variables")
        present_kinds = set(result_df[column_kind])
        self.assertIn("a", present_kinds)
        self.assertIn("b", present_kinds)
        six.assertCountEqual(self, list(result_df.columns), ["_values", "_variables", "id"])
        self.assertEqual(select(result_df, column_kind, "a", "_values"), [3])
        self.assertEqual(select(result_df, column_kind, "a", "id"), [0])
        self.assertEqual(select(result_df, column_kind, "b", "_values"), [5])
        self.assertEqual(select(result_df, column_kind, "b", "id"), [0])
Exemplo n.º 4
0
    def test_with_dictionaries_two_rows(self):
        """Normalize a dict of two-row frames that carry a sort column."""
        rows = [
            {"value": 2, "sort": 2, "id": "id_1"},
            {"value": 1, "sort": 1, "id": "id_1"},
        ]
        test_df = pd.DataFrame(rows)
        test_dict = {"a": test_df, "b": test_df}
        normalize = dataframe_functions.normalize_input_to_internal_representation

        # Two candidate columns ("value" and "sort") and no hint: the choice is ambiguous.
        with self.assertRaises(ValueError):
            normalize(test_dict, "id", None, None, None)

        # Naming the sort column must work and hand back the given names.
        result_dict, column_id, column_value = normalize(test_dict, "id", "sort", None, "value")
        self.assertEqual(column_value, "value")
        self.assertEqual(column_id, "id")

        # Rows are expected in sort order, with the sort column removed.
        sorted_rows = result_dict["a"]
        self.assertEqual(sorted_rows.iloc[0].to_dict(), {"value": 1, "id": "id_1"})
        self.assertEqual(sorted_rows.iloc[1].to_dict(), {"value": 2, "id": "id_1"})

        # Once the sort column is named, the value column can be inferred.
        result_dict, column_id, column_value = normalize(test_dict, "id", "sort", None, None)
        self.assertEqual(column_value, "value")
        self.assertEqual(column_id, "id")
Exemplo n.º 5
0
    def test_with_dictionaries_two_rows_sorted(self):
        """Normalize a dict of two-row frames into one frame and check the first row of kind "a"."""
        rows = [{"value": 2, "id": "id_1"}, {"value": 1, "id": "id_1"}]
        test_df = pd.DataFrame(rows)
        test_dict = {"a": test_df, "b": test_df}

        # Explicit id and value column; this variant returns a frame plus three column names.
        normalized = dataframe_functions.normalize_input_to_internal_representation(
            test_dict, "id", None, None, "value")
        result_df, column_id, column_kind, column_value = normalized
        self.assertEqual(column_value, "value")
        self.assertEqual(column_id, "id")

        # The dict key is expected in the "_variables" column; row order is unchanged.
        rows_of_a = result_df[result_df[column_kind] == "a"]
        expected_first_row = {"_variables": "a", "value": 2, "id": "id_1"}
        self.assertEqual(rows_of_a.iloc[0].to_dict(), expected_first_row)
Exemplo n.º 6
0
def extract_features(timeseries_container,
                     feature_extraction_settings=None,
                     column_id=None,
                     column_sort=None,
                     column_kind=None,
                     column_value=None):
    """
    Extract features from

    * a :class:`pandas.DataFrame` containing the different time series

    or

    * a dictionary of :class:`pandas.DataFrame` each containing one type of time series

    In both cases a :class:`pandas.DataFrame` with the calculated features will be returned.

    For a list of all the calculated time series features, please see the
    :class:`~tsfresh.feature_extraction.settings.FeatureExtractionSettings` class,
    which is used to control which features with which parameters are calculated.

    For a detailed explanation of the different parameters and data formats please see :ref:`data-formats-label`.

    The per-kind extraction runs in a :class:`multiprocessing.Pool` with
    ``feature_extraction_settings.n_processes`` workers. The pool is closed and joined before
    this function returns, also when the extraction raises.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features
    >>> df, _ = load_robot_execution_failures()
    >>> X = extract_features(df, column_id='id', column_sort='time')

    which would give the same results as described above. In this case, the column_kind is not allowed.
    Except that, the same rules for leaving out the columns apply as above.

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for, or a
            dictionary of pandas.DataFrames.
    :type timeseries_container: pandas.DataFrame or dict

    :param column_id: The name of the id column to group by.
    :type column_id: str
    :param column_sort: The name of the sort column.
    :type column_sort: str
    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str
    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param feature_extraction_settings: settings object that controls which features are calculated.
            If None, a fresh FeatureExtractionSettings is created and its default parameters are set
            for every kind of time series found in the input.
    :type feature_extraction_settings: tsfresh.feature_extraction.settings.FeatureExtractionSettings

    :return: The (maybe imputed) DataFrame with the extracted features.
    :rtype: pandas.DataFrame
    """

    # Always use the standardized way of storing the data.
    # See the function normalize_input_to_internal_representation for more information.
    kind_to_df_map, column_id, column_value = \
        dataframe_functions.normalize_input_to_internal_representation(timeseries_container, column_id, column_sort,
                                                                       column_kind, column_value)

    # Use the standard setting if the user did not supply ones himself.
    if feature_extraction_settings is None:
        feature_extraction_settings = FeatureExtractionSettings()
        for key in kind_to_df_map:
            feature_extraction_settings.set_default_parameters(key)

    # If requested, do profiling (advanced feature)
    if feature_extraction_settings.PROFILING:
        profiler = profiling.start_profiling()

    # Collect every id occurring in any kind of time series; the result is indexed by these ids.
    all_possible_unique_id_values = {id_value
                                     for df in kind_to_df_map.values()
                                     for id_value in df[column_id]}
    df_with_ids = pd.DataFrame(index=all_possible_unique_id_values)

    partial_extract_features_for_one_time_series = partial(
        _extract_features_for_one_time_series,
        column_id=column_id,
        column_value=column_value,
        settings=feature_extraction_settings)

    # Extract the time series features for every type of time series in parallel.
    # The pool is always shut down afterwards so that no worker processes are left behind.
    pool = Pool(feature_extraction_settings.n_processes)
    try:
        extracted_features = pool.map(partial_extract_features_for_one_time_series,
                                      kind_to_df_map.items())
    finally:
        pool.close()
        pool.join()

    # Add time series features to result
    result = pd.concat([df_with_ids] + extracted_features, axis=1, join='outer', join_axes=[df_with_ids.index])\
        .astype(np.float64)

    # Impute the result if requested
    if feature_extraction_settings.IMPUTE is not None:
        feature_extraction_settings.IMPUTE(result)

    # Turn off profiling if it was turned on
    if feature_extraction_settings.PROFILING:
        profiling.end_profiling(
            profiler,
            filename=feature_extraction_settings.PROFILING_FILENAME,
            sorting=feature_extraction_settings.PROFILING_SORTING)

    return result
Exemplo n.º 7
0
def extract_features(timeseries_container, default_fc_parameters=None,
                     kind_to_fc_parameters=None,
                     column_id=None, column_sort=None, column_kind=None, column_value=None,
                     parallelization=None, chunksize=defaults.CHUNKSIZE,
                     n_processes=defaults.N_PROCESSES, show_warnings=defaults.SHOW_WARNINGS,
                     disable_progressbar=defaults.DISABLE_PROGRESSBAR,
                     impute_function=defaults.IMPUTE_FUNCTION,
                     profile=defaults.PROFILING,
                     profiling_filename=defaults.PROFILING_FILENAME,
                     profiling_sorting=defaults.PROFILING_SORTING):
    """
    Extract features from

    * a :class:`pandas.DataFrame` containing the different time series

    or

    * a dictionary of :class:`pandas.DataFrame` each containing one type of time series

    In both cases a :class:`pandas.DataFrame` with the calculated features will be returned.

    For a list of all the calculated time series features, please see the
    :class:`~tsfresh.feature_extraction.settings.ComprehensiveFCParameters` class,
    which is used to control which features with which parameters are calculated.

    For a detailed explanation of the different parameters and data formats please see :ref:`data-formats-label`.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features
    >>> df, _ = load_robot_execution_failures()
    >>> X = extract_features(df, column_id='id', column_sort='time')

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for, or a
            dictionary of pandas.DataFrames.
    :type timeseries_container: pandas.DataFrame or dict

    :param default_fc_parameters: mapping from feature calculator names to parameters. Only those names
           which are keys in this dict will be calculated. See the class:`ComprehensiveFCParameters` for
           more information.
    :type default_fc_parameters: dict

    :param kind_to_fc_parameters: mapping from kind names to objects of the same type as the ones for
            default_fc_parameters. If you put a kind as a key here, the fc_parameters
            object (which is the value), will be used instead of the default_fc_parameters.
    :type kind_to_fc_parameters: dict

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_sort: The name of the sort column.
    :type column_sort: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param parallelization: One of ``'per_sample'``, ``'per_kind'`` or ``'serial'``, see
                            :func:`~tsfresh.feature_extraction.extraction._extract_features_parallel_per_sample`,
                            :func:`~tsfresh.feature_extraction.extraction._extract_features_parallel_per_kind` and
                            :ref:`parallelization-label` for details.
                            ``'serial'`` runs the per-kind extraction with ``serial=True``.
                            Choosing None makes the algorithm look for the best parallelization technique by applying
                            some general assumptions.
    :type parallelization: str

    :param chunksize: The size of one chunk for the parallelisation
    :type chunksize: None or int

    :param n_processes: The number of processes to use for parallelisation.
    :type n_processes: int

    :param show_warnings: Show warnings during the feature extraction (needed for debugging of calculators).
    :type show_warnings: bool

    :param disable_progressbar: Do not show a progressbar while doing the calculation.
    :type disable_progressbar: bool

    :param impute_function: None, if no imputing should happen or the function to call for imputing.
    :type impute_function: None or function

    :param profile: Turn on profiling during feature extraction
    :type profile: bool

    :param profiling_sorting: How to sort the profiling results (see the documentation of the profiling package for
           more information)
    :type profiling_sorting: basestring

    :param profiling_filename: Where to save the profiling results.
    :type profiling_filename: basestring

    :return: The (maybe imputed) DataFrame containing extracted features.
    :rtype: pandas.DataFrame

    :raise ValueError: if parallelization is not None and not one of the supported values.
    """
    import logging
    logging.basicConfig()

    # Always use the standardized way of storing the data.
    # See the function normalize_input_to_internal_representation for more information.
    kind_to_df_map, column_id, column_value = \
        dataframe_functions.normalize_input_to_internal_representation(df_or_dict=timeseries_container,
                                                                       column_id=column_id,
                                                                       column_sort=column_sort,
                                                                       column_kind=column_kind,
                                                                       column_value=column_value)

    # Use the standard setting if the user did not supply ones himself.
    if default_fc_parameters is None:
        default_fc_parameters = ComprehensiveFCParameters()

    # Choose the parallelization according to a rule-of-thumb
    if parallelization is None:
        parallelization = 'per_sample' if n_processes / 2 > len(kind_to_df_map) else 'per_kind'

    _logger.info('Parallelizing feature calculation {}'.format(parallelization))

    # Resolve the calculation function before profiling starts, so that an invalid
    # argument does not leave a started profiler behind.
    if parallelization == 'per_kind':
        calculation_function = _extract_features_per_kind
    elif parallelization == 'per_sample':
        calculation_function = _extract_features_parallel_per_sample
    elif parallelization == 'serial':
        calculation_function = partial(_extract_features_per_kind, serial=True)
    else:
        raise ValueError("Argument parallelization must be one of: 'per_kind', 'per_sample', 'serial'")

    # If requested, do profiling (advanced feature)
    if profile:
        profiler = profiling.start_profiling()

    # Calculate the result
    result = calculation_function(kind_to_df_map,
                                  default_fc_parameters=default_fc_parameters,
                                  kind_to_fc_parameters=kind_to_fc_parameters,
                                  column_id=column_id,
                                  column_value=column_value,
                                  chunksize=chunksize,
                                  n_processes=n_processes,
                                  show_warnings=show_warnings,
                                  disable_progressbar=disable_progressbar,
                                  impute_function=impute_function
                                  )

    # Turn off profiling if it was turned on
    if profile:
        profiling.end_profiling(profiler, filename=profiling_filename,
                                sorting=profiling_sorting)

    return result
Exemplo n.º 8
0
def extract_features(timeseries_container, feature_extraction_settings=None,
                     column_id=None, column_sort=None, column_kind=None, column_value=None,
                     parallelization=None):
    """
    Extract features from

    * a :class:`pandas.DataFrame` containing the different time series

    or

    * a dictionary of :class:`pandas.DataFrame` each containing one type of time series

    In both cases a :class:`pandas.DataFrame` with the calculated features will be returned.

    For a list of all the calculated time series features, please see the
    :class:`~tsfresh.feature_extraction.settings.FeatureExtractionSettings` class,
    which is used to control which features with which parameters are calculated.

    For a detailed explanation of the different parameters and data formats please see :ref:`data-formats-label`.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features
    >>> df, _ = load_robot_execution_failures()
    >>> X = extract_features(df, column_id='id', column_sort='time')

    which would give the same results as described above. In this case, the column_kind is not allowed.
    Except that, the same rules for leaving out the columns apply as above.

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for, or a
            dictionary of pandas.DataFrames.
    :type timeseries_container: pandas.DataFrame or dict

    :param feature_extraction_settings: settings object that controls which features are calculated.
            If None, a fresh FeatureExtractionSettings is created and its default parameters are set
            for every kind of time series found in the input.
    :type feature_extraction_settings: tsfresh.feature_extraction.settings.FeatureExtractionSettings

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_sort: The name of the sort column.
    :type column_sort: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param parallelization: One of ``'per_sample'``, ``'per_kind'`` or ``'no_parallelization'``, see
                            :func:`~tsfresh.feature_extraction.extraction._extract_features_parallel_per_sample`,
                            :func:`~tsfresh.feature_extraction.extraction._extract_features_parallel_per_kind` and
                            :ref:`parallelization-label` for details.
                            ``'no_parallelization'`` runs the per-kind extraction with ``serial=True``.
                            If None, ``'per_sample'`` is chosen when half of
                            ``feature_extraction_settings.n_processes`` exceeds the number of kinds,
                            otherwise ``'per_kind'``.
    :type parallelization: str

    :return: The (maybe imputed) DataFrame containing extracted features.
    :rtype: pandas.DataFrame

    :raise ValueError: if parallelization is not None and not one of the supported values.
    """
    import logging
    logging.basicConfig()

    # Always use the standardized way of storing the data.
    # See the function normalize_input_to_internal_representation for more information.
    kind_to_df_map, column_id, column_value = \
        dataframe_functions.normalize_input_to_internal_representation(df_or_dict=timeseries_container,
                                                                       column_id=column_id,
                                                                       column_sort=column_sort,
                                                                       column_kind=column_kind,
                                                                       column_value=column_value)

    # Use the standard setting if the user did not supply ones himself.
    if feature_extraction_settings is None:
        feature_extraction_settings = FeatureExtractionSettings()
        for key in kind_to_df_map:
            feature_extraction_settings.set_default_parameters(key)

    # Choose the parallelization according to a rule-of-thumb
    if parallelization is None:
        parallelization = 'per_sample' if (feature_extraction_settings.n_processes / 2) > len(kind_to_df_map) \
            else 'per_kind'

    _logger.info('Parallelizing feature calculation {}'.format(parallelization))

    # Reject unknown modes before profiling starts, so that an invalid argument
    # does not leave a started profiler behind.
    if parallelization not in ('per_kind', 'per_sample', 'no_parallelization'):
        raise ValueError("Argument parallelization must be one of: "
                         "'per_kind', 'per_sample', 'no_parallelization'")

    # If requested, do profiling (advanced feature)
    if feature_extraction_settings.PROFILING:
        profiler = profiling.start_profiling()

    # Calculate the result
    if parallelization == 'per_kind':
        result = _extract_features_per_kind(kind_to_df_map, feature_extraction_settings,
                                            column_id, column_value)
    elif parallelization == 'per_sample':
        result = _extract_features_parallel_per_sample(kind_to_df_map, feature_extraction_settings,
                                                       column_id, column_value)
    else:
        # 'no_parallelization': same per-kind code path, executed serially.
        result = _extract_features_per_kind(kind_to_df_map, feature_extraction_settings,
                                            column_id, column_value, serial=True)

    # Turn off profiling if it was turned on
    if feature_extraction_settings.PROFILING:
        profiling.end_profiling(profiler, filename=feature_extraction_settings.PROFILING_FILENAME,
                                sorting=feature_extraction_settings.PROFILING_SORTING)

    return result
Exemplo n.º 9
0
def extract_features(timeseries_container, default_fc_parameters=None,
                     kind_to_fc_parameters=None,
                     column_id=None, column_sort=None, column_kind=None, column_value=None,
                     chunksize=defaults.CHUNKSIZE,
                     n_jobs=defaults.N_PROCESSES, show_warnings=defaults.SHOW_WARNINGS,
                     disable_progressbar=defaults.DISABLE_PROGRESSBAR,
                     impute_function=defaults.IMPUTE_FUNCTION,
                     profile=defaults.PROFILING,
                     profiling_filename=defaults.PROFILING_FILENAME,
                     profiling_sorting=defaults.PROFILING_SORTING):
    """
    Calculate time series features and return them as one :class:`pandas.DataFrame`.

    The input is either

    * a :class:`pandas.DataFrame` containing the different time series, or
    * a dictionary of :class:`pandas.DataFrame`, each containing one type of time series.

    Which features are calculated, and with which parameters, is controlled by
    :class:`~tsfresh.feature_extraction.settings.ComprehensiveFCParameters`; see that class for
    the list of all calculated features. The parameters and data formats are explained in detail
    in :ref:`data-formats-label`.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features
    >>> df, _ = load_robot_execution_failures()
    >>> X = extract_features(df, column_id='id', column_sort='time')

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for, or a
            dictionary of pandas.DataFrames.
    :type timeseries_container: pandas.DataFrame or dict

    :param default_fc_parameters: mapping from feature calculator names to parameters. Only those names
           which are keys in this dict will be calculated. If None, a ``ComprehensiveFCParameters()``
           object is used.
    :type default_fc_parameters: dict

    :param kind_to_fc_parameters: mapping from kind names to objects of the same type as the ones for
            default_fc_parameters. For a kind that is a key here, the mapped fc_parameters object is
            used instead of the default_fc_parameters.
    :type kind_to_fc_parameters: dict

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_sort: The name of the sort column.
    :type column_sort: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param n_jobs: The number of processes to use for parallelization. If zero, no parallelization is used.
    :type n_jobs: int

    :param chunksize: The size of one chunk for the parallelisation
    :type chunksize: None or int

    :param show_warnings: Show warnings during the feature extraction (needed for debugging of calculators).
    :type show_warnings: bool

    :param disable_progressbar: Do not show a progressbar while doing the calculation.
    :type disable_progressbar: bool

    :param impute_function: None, if no imputing should happen or the function to call for imputing.
    :type impute_function: None or function

    :param profile: Turn on profiling during feature extraction
    :type profile: bool

    :param profiling_sorting: How to sort the profiling results (see the documentation of the profiling package for
           more information)
    :type profiling_sorting: basestring

    :param profiling_filename: Where to save the profiling results.
    :type profiling_filename: basestring

    :return: The (maybe imputed) DataFrame containing extracted features.
    :rtype: pandas.DataFrame
    """
    import logging
    logging.basicConfig()

    # Bring the input into the internal (melted) representation first; the column names
    # returned here replace the ones passed in by the caller.
    normalized = dataframe_functions.normalize_input_to_internal_representation(
        timeseries_container=timeseries_container,
        column_id=column_id, column_kind=column_kind,
        column_sort=column_sort,
        column_value=column_value)
    df_melt, column_id, column_kind, column_value = normalized

    # Fall back to the comprehensive parameter set when none was supplied.
    if default_fc_parameters is None:
        default_fc_parameters = ComprehensiveFCParameters()

    # Optional profiling (advanced feature).
    if profile:
        profiler = profiling.start_profiling()

    # The warning filter only applies inside this block and is restored on exit.
    with warnings.catch_warnings():
        warnings.simplefilter("default" if show_warnings else "ignore")

        result = _do_extraction(df=df_melt,
                                column_id=column_id, column_value=column_value, column_kind=column_kind,
                                n_jobs=n_jobs, chunksize=chunksize,
                                disable_progressbar=disable_progressbar,
                                default_fc_parameters=default_fc_parameters,
                                kind_to_fc_parameters=kind_to_fc_parameters)

    # Give the result index the same dtype as the id column of the melted input.
    id_dtype = df_melt[column_id].dtype
    result.index = result.index.astype(id_dtype)

    # Impute if a function was given; its return value is not used.
    if impute_function is not None:
        impute_function(result)

    # Stop profiling if it was started above.
    if profile:
        profiling.end_profiling(profiler, filename=profiling_filename,
                                sorting=profiling_sorting)

    return result