Example #1
def test_pipeline_time_series():
    X_train, y_train, X_test, y_test = split_train_test(data)

    steps = get_steps()
    pipeline = Pipeline(steps)
    X_train_final, y_train_final = pipeline.fit_transform_resample(
        X_train, y_train)

    # Running the pipeline step by step
    X_train_temp, y_train_temp = X_train, y_train
    for _, transformer in steps:
        if hasattr(transformer, 'fit_transform_resample'):
            X_train_temp, y_train_temp = transformer.fit_transform_resample(
                X_train_temp, y_train_temp)
        else:
            X_train_temp = transformer.fit_transform(
                X_train_temp, y_train_temp)

    assert_almost_equal(X_train_final, X_train_temp)
    assert_almost_equal(y_train_final, y_train_temp)

    pipeline.fit(X_train, y_train)
    X_test_final, y_test_final = pipeline.transform_resample(X_test, y_test)

    X_test_temp, y_test_temp = X_test, y_test
    for _, transformer in steps:
        if hasattr(transformer, 'transform_resample'):
            X_test_temp, y_test_temp = transformer.transform_resample(
                X_test_temp, y_test_temp)
        else:
            X_test_temp = transformer.transform(X_test_temp)

    assert_almost_equal(X_test_final, X_test_temp)
    assert_almost_equal(y_test_final, y_test_temp)
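
The test relies on helpers split_train_test and get_steps that are not shown here. Below is a minimal sketch of what get_steps() might return, reusing only transformers that appear in the other examples; the giotto import paths and all parameter values are assumptions.

import giotto.time_series as ts   # assumed import paths, matching the ts/hl/diag
import giotto.homology as hl      # aliases used in Example #10; adjust to the
import giotto.diagrams as diag    # installed giotto version


def get_steps():
    # Hypothetical reconstruction of the helper used by the test: a mix of
    # resampling steps (which expose fit_transform_resample) and plain
    # transformers (which only expose fit_transform).
    return [
        ('embedding', ts.TakensEmbedding(parameters_type='fixed',
                                         dimension=2,
                                         time_delay=3,
                                         n_jobs=1)),
        ('window', ts.SlidingWindow(width=10, stride=1)),
        ('diagrams', hl.VietorisRipsPersistence(metric='euclidean',
                                                max_edge_length=10,
                                                homology_dimensions=[0, 1],
                                                n_jobs=1)),
        ('scaler', diag.Scaler()),
        ('filter', diag.Filtering(epsilon=0.))
    ]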
Example #2
def test_grid_search_time_series():
    X_train, y_train, X_test, y_test = split_train_test(data)

    steps = get_steps() + [('classification', RandomForestClassifier())]
    pipeline = Pipeline(steps)
    param_grid = get_param_grid()
    cv = TimeSeriesSplit(n_splits=2)
    grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, verbose=0)
    grid_result = grid.fit(X_train, y_train)
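
get_param_grid() is likewise not shown. A minimal sketch, assuming the grid targets the 'classification' step through scikit-learn's usual '<step name>__<parameter>' naming; the concrete values are placeholders.

def get_param_grid():
    # Hypothetical reconstruction: hyper-parameters of pipeline steps are
    # addressed as "<step name>__<parameter name>", here for the
    # RandomForestClassifier registered under 'classification'.
    return {
        'classification__n_estimators': [100, 200],
        'classification__max_depth': [None, 10],
    }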
Example #3
    def _validate_k_fold_top(self, model, x_train, y_train, x_test, y_test):
        validation_quantities = []

        for k_min in self.k_mins:
            for k_max in self.k_maxs:
                for dist_percentage in self.dist_percentages:
                    print(
                        f"k_min, k_max, dist_percentage: {k_min}, {k_max}, {dist_percentage}"
                    )
                    pipeline_list = [
                        ('extract_subspaces',
                         SubSpaceExtraction(dist_percentage=dist_percentage,
                                            k_min=k_min,
                                            k_max=k_max,
                                            metric="euclidean",
                                            n_jobs=-1)),
                        ('compute_diagrams',
                         VietorisRipsPersistence(n_jobs=-1))
                    ]
                    top_pipeline = Pipeline(pipeline_list)

                    diagrams_train, _ = top_pipeline.fit_transform_resample(
                        x_train, y_train)

                    top_features_train = extract_topological_features(
                        diagrams_train)

                    x_train_model = np.concatenate(
                        [x_train, top_features_train], axis=1)
                    model.fit(x_train_model, y_train)

                    x_test_model = extract_features_for_prediction(
                        x_train, y_train, x_test, y_test, top_pipeline)

                    score = model.score(x_test_model, y_test)
                    output_dictionary = {
                        "k_min": k_min,
                        "k_max": k_max,
                        "dist_percentage": dist_percentage,
                        "score": score
                    }
                    validation_quantities.append(output_dictionary)

        return validation_quantities
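
The helper extract_topological_features used above (and again in Example #8) is not shown. A hedged sketch consistent with how it is called (persistence diagrams in, a 2-D feature array out that can be concatenated with x_train), using the PersistenceEntropy transformer from Examples #6 and #9; the import path is assumed.

import giotto.diagrams as diag  # assumed import path for the `diag` alias used elsewhere


def extract_topological_features(diagrams):
    # Hypothetical sketch only: the original helper may compute richer features.
    # PersistenceEntropy maps diagrams of shape (n_samples, n_points, 3) to an
    # array of shape (n_samples, n_homology_dimensions), which can then be
    # concatenated with the raw feature matrix as done above.
    return diag.PersistenceEntropy(n_jobs=-1).fit_transform(diagrams)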
Example #4
    def fit(self, X, y=None):
        """Do nothing and return the estimator unchanged.

        This method is there to implement the usual scikit-learn API and hence
        work in pipelines.

        Parameters
        ----------
        X : ndarray, shape: (n_samples, n_points, n_dimensions)
            Input data. ``n_samples`` is the number of point clouds,
            ``n_points`` is the number of points per point cloud and
            ``n_dimensions`` is the number of features for each point of
            the point cloud (i.e. the dimension of the point cloud space).

        y : None
            Ignored.

        Returns
        -------
        self : object
            Returns self.

        """

        steps = [
            ('diagram', hl.VietorisRipsPersistence(
                metric=self.metric,
                max_edge_length=self.max_edge_length,
                homology_dimensions=self.homology_dimensions,
                n_jobs=self.n_jobs)),
            ('rescaler', diag.Scaler(
                metric=self.scaler_metric,
                metric_params=self.scaler_metric_params,
                function=self.function,
                n_jobs=self.n_jobs)),
            ('filter', diag.Filtering(
                epsilon=self.epsilon,
                homology_dimensions=self.homology_dimensions)),
            ('landscape', diag.PersistenceLandscape(
                n_values=self.n_values, n_layers=self.n_layers))]

        self._pipeline = Pipeline(steps).fit(X)
        return self
Example #5
    def fit(self, X, y=None):
        """Create a giotto :class:`Pipeline` object and fit it. Then, return
        the estimator.

        This method is there to implement the usual scikit-learn API and hence
        work in pipelines.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_points, n_dimensions)
            Input data. ``n_samples`` is the number of point clouds,
            ``n_points`` is the number of points per point cloud and
            ``n_dimensions`` is the number of features for each point of the
            point cloud (i.e. the dimension of the point cloud space).

        y : None
            Ignored.

        Returns
        -------
        self : object

        """

        steps = [('diagram',
                  hl.VietorisRipsPersistence(
                      metric=self.metric,
                      max_edge_length=self.max_edge_length,
                      homology_dimensions=self.homology_dimensions,
                      n_jobs=self.n_jobs)),
                 ('scaler',
                  diag.Scaler(metric=self.scaler_metric,
                              metric_params=self.scaler_metric_params,
                              function=self.scaler_function,
                              n_jobs=self.n_jobs)),
                 ('filter', diag.Filtering(epsilon=self.filter_epsilon)),
                 ('betticurve', diag.BettiCurve(n_values=self.n_values))]

        self._pipeline = Pipeline(steps).fit(X)
        return self
Example #6
class EntropyGenerator(BaseEstimator, TransformerMixin):
    r"""Meta transformer that returns the persistent entropy.

    Parameters
    ----------
    metric : string or callable, optional, default: ``'euclidean'``
        If set to ``'precomputed'``, input data is to be interpreted as a
        collection of distance matrices. Otherwise, input data is to be
        interpreted as a collection of point clouds (i.e. feature arrays),
        and ``metric`` determines a rule with which to calculate distances
        between pairs of instances (i.e. rows) in these arrays.
        If ``metric`` is a string, it must be one of the options allowed by
        scipy.spatial.distance.pdist for its metric parameter, or a metric
        listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS, including "euclidean",
        "manhattan", or "cosine".
        If ``metric`` is a callable function, it is called on each pair of
        instances and the resulting value recorded. The callable should take
        two arrays from the entry in X as input, and return a value indicating
        the distance between them.

    scaler_metric : ``'bottleneck'`` | ``'wasserstein'`` | ``'landscape'`` | \
                    ``'betti'`` | ``'heat'``, optional, default: \
                    ``'bottleneck'``
        Which notion of distance between (sub)diagrams to use:

        - ``'bottleneck'`` and ``'wasserstein'`` refer to the identically named
           perfect-matching--based notions of distance.
        - ``'landscape'`` refers to the :math:`L^p` distance between
          persistence landscapes.
        - ``'betti'`` refers to the :math:`L^p` distance between Betti curves.
        - ``'heat'`` refers to the :math:`L^p` distance between
          Gaussian-smoothed diagrams.

    scaler_metric_params : dict or None, optional, default: ``None``
        Additional keyword arguments for the norm function:

        - If ``metric == 'bottleneck'`` there are no available arguments.
        - If ``metric == 'wasserstein'`` the only argument is `p` (int,
          default: ``2``).
        - If ``metric == 'betti'`` the available arguments are `p` (float,
          default: ``2.``) and `n_values` (int, default: ``100``).
        - If ``metric == 'landscape'`` the available arguments are `p`
          (float, default: ``2.``), `n_values` (int, default: ``100``) and
          `n_layers` (int, default: ``1``).
        - If ``metric == 'heat'`` the available arguments are `p` (float,
          default: ``2.``), `sigma` (float, default: ``1.``) and `n_values`
          (int, default: ``100``).

    max_edge_length : float, optional, default: np.inf
        Upper bound on the maximum value of the Vietoris-Rips filtration
        parameter.
        Points whose distance is greater than this value will never be
        connected by an edge, and topological features at scales larger than
        this value will not be detected.

    function : callable, optional, default: numpy.max
        Function used to extract a single positive scalar from the collection
        of norms of diagrams.

    n_jobs : int or None, optional, default: None
        The number of jobs to use for the computation. ``None`` means 1
        unless in a :obj:`joblib.parallel_backend` context. ``-1`` means
        using all processors.

    homology_dimensions : list or None, optional, default: None
        When set to ``None``, all available (sub)diagrams will be filtered.
        When set to a list, it is interpreted as the list of those homology
        dimensions for which (sub)diagrams should be filtered.

    epsilon : float, optional, default: 0.
        The cutoff value controlling the amount of filtering.

    Attributes
    ----------
    _pipeline : :class:`Pipeline` object
        Internal giotto :class:`Pipeline` built and fitted when :meth:`fit`
        is called.

    Examples
    --------
    >>> from giotto.meta_transformers import EntropyGenerator as eg
    >>> import numpy as np
    >>> ent = eg()
    >>> X = np.asarray([[[1,2],[2,1],[1,1]]])
    >>> X_tr = ent.fit_transform(X)
    >>> X_tr
    array([[ 0.69314718, -0.        ]])

    """

    def __init__(self, metric='euclidean', max_edge_length=np.inf,
                 homology_dimensions=(0, 1), scaler_metric='bottleneck',
                 scaler_metric_params=None, epsilon=0., n_jobs=None,
                 function=np.max):
        self.metric = metric
        self.max_edge_length = max_edge_length
        self.homology_dimensions = homology_dimensions
        self.n_jobs = n_jobs
        self.epsilon = epsilon
        self.function = function
        self.scaler_metric = scaler_metric
        self.scaler_metric_params = scaler_metric_params

    def fit(self, X, y=None):
        """Do nothing and return the estimator unchanged.

        This method is there to implement the usual scikit-learn API and hence
        work in pipelines.

        Parameters
        ----------
        X : ndarray, shape: (n_samples, n_points, n_dimensions)
            Input data. ``n_samples`` is the number of point clouds,
            ``n_points`` is the number of points per point cloud and
            ``n_dimensions`` is the number of features for each point of the
            point cloud (i.e. the dimension of the point cloud space).

        y : None
            Ignored.

        Returns
        -------
        self : object
            Returns self.

        """

        steps = [
            ('diagram', hl.VietorisRipsPersistence(
                metric=self.metric,
                max_edge_length=self.max_edge_length,
                homology_dimensions=self.homology_dimensions,
                n_jobs=self.n_jobs)),
            ('rescaler', diag.Scaler(
                metric=self.scaler_metric,
                metric_params=self.scaler_metric_params,
                function=self.function,
                n_jobs=self.n_jobs)),
            ('filter', diag.Filtering(
                epsilon=self.epsilon,
                homology_dimensions=self.homology_dimensions)),
            ('entropy', diag.PersistenceEntropy(n_jobs=self.n_jobs))]

        self._pipeline = Pipeline(steps).fit(X)

        return self

    def transform(self, X, y=None):
        """Extract the persistent entropy from X.

        Parameters
        ----------
        X : ndarray, shape: (n_samples, n_points, n_dimensions)
            Input data. ``n_samples`` is the number of point clouds,
            ``n_points`` is the number of points per point cloud and
            ``n_dimensions`` is the number of features for each point of the
            point cloud (i.e. the dimension of the point cloud space).

        y : None
            There is no need for a target in a transformer, yet the pipeline API
            requires this parameter.

        Returns
        -------
        Xt : ndarray, shape: (n_samples, n_homology_dimensions)
            The Persistent Entropy. ``n_samples`` is not modified by the
            algorithm: the Persistent Entropy is computed per point cloud.
            ``n_homology_dimensions`` is the number of homology
            dimensions considered, i.e. the length of ``homology_dimensions``.

        """

        Xt = self._pipeline.transform(X)
        return Xt
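
A short usage sketch of the class above with a non-default scaler metric; the toy point cloud is the one from the docstring, and the {'p': 2} keyword follows the scaler_metric_params description given there.

import numpy as np
from giotto.meta_transformers import EntropyGenerator

# Same toy input as the docstring example, but scaling diagrams with the
# Wasserstein amplitude instead of the default bottleneck amplitude.
ent = EntropyGenerator(scaler_metric='wasserstein',
                       scaler_metric_params={'p': 2},
                       homology_dimensions=(0, 1),
                       n_jobs=1)
X = np.asarray([[[1, 2], [2, 1], [1, 1]]])
Xt = ent.fit_transform(X)  # shape (1, 2): one entropy per homology dimension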
Example #7
def get_pipeline(top_feat_params):
    pipeline = Pipeline([
        ('extract_point_clouds', SubSpaceExtraction(**top_feat_params)),
        ('create_diagrams', VietorisRipsPersistence(n_jobs=-1))
    ])
    return pipeline
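
A possible call of this factory, with SubSpaceExtraction keyword arguments taken from Example #3; the concrete values and the x_train/y_train arrays are placeholders.

top_feat_params = {"dist_percentage": 0.05, "k_min": 2, "k_max": 6,
                   "metric": "euclidean", "n_jobs": -1}
pipeline = get_pipeline(top_feat_params)
# As in Example #3, the fitted pipeline yields one persistence diagram per
# extracted subspace; y is passed through the resampling steps.
diagrams, _ = pipeline.fit_transform_resample(x_train, y_train)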
Example #8
    def cross_validate(self, full_x, full_y, splitting_dates):
        train_split_date = splitting_dates[0]
        val_split_date = splitting_dates[1]
        end_date = splitting_dates[2]

        train_x = full_x[(full_x.date < train_split_date) |
                         (full_x.date >= end_date)]
        train_y = full_y[(full_x.date < train_split_date) |
                         (full_x.date >= end_date)]

        val_x = full_x[(full_x.date >= train_split_date)
                       & (full_x.date < val_split_date)]
        val_y = full_y[(full_x.date >= train_split_date)
                       & (full_x.date < val_split_date)]

        test_x = full_x[(full_x.date >= val_split_date)
                        & (full_x.date < end_date)]
        test_y = full_y[(full_x.date >= val_split_date)
                        & (full_x.date < end_date)]

        train_x.pop("date")
        val_x.pop("date")
        test_x.pop("date")

        train_x = train_x.values
        train_y = train_y.values
        val_x = val_x.values
        val_y = val_y.values
        test_x = test_x.values
        test_y = test_y.values

        print("START VALIDATING MODEL")
        models_cv = self._validate_k_fold_model(train_x, train_y, val_x, val_y)
        best_model_params = best_combination(models_cv)
        best_model_params.pop("score")
        best_model = RandomForestClassifier(**best_model_params)

        best_model.fit(train_x, train_y)

        score = best_model.score(test_x, test_y)
        print(f'score no_top {score}')
        print(f'best model parameters no_top {best_model_params}')

        print("START VALIDATING PARAMS")
        topo_cv = self._validate_k_fold_top(best_model, train_x, train_y,
                                            val_x, val_y)
        best_topo = best_combination(topo_cv)
        best_topo.pop("score")
        best_topo_pipeline_list = [
            ('extract_subspaces', SubSpaceExtraction(**best_topo)),
            ('compute_diagrams', VietorisRipsPersistence(n_jobs=-1))
        ]
        best_topo_pipeline = Pipeline(best_topo_pipeline_list)

        train_x_for_test = np.concatenate([train_x, val_x], axis=0)
        train_y_for_test = np.concatenate([train_y, val_y], axis=0)

        diagrams_train, _ = best_topo_pipeline.fit_transform_resample(
            train_x_for_test, train_y_for_test)

        print("EXTRACTING TOPOLOGICAL FEATURES TRAIN")
        top_features_train = extract_topological_features(diagrams_train)

        x_train_model = np.concatenate([train_x_for_test, top_features_train],
                                       axis=1)
        best_model.fit(x_train_model, train_y_for_test)

        print("EXTRACTING TOPOLOGICAL FEATURES TEST")
        x_test_model = extract_features_for_prediction(x_train_model,
                                                       train_y_for_test,
                                                       test_x, test_y,
                                                       best_topo_pipeline)

        score_top = best_model.score(x_test_model, test_y)

        val_x_with_topo = extract_features_for_prediction(
            train_x, train_y, val_x, val_y, best_topo_pipeline)

        print('START VALIDATING MODEL WITH OPTIMAL TOPOLOGY')
        model_config_with_topo = self._validate_k_fold_model(
            x_train_model, train_y, val_x_with_topo, val_y)
        best_model_config_with_topo = best_combination(model_config_with_topo)
        best_model_config_with_topo.pop('score')

        best_model_with_topo = RandomForestClassifier(
            **best_model_config_with_topo)
        best_model_with_topo.fit(x_train_model, train_y_for_test)

        score_best_topo_and_model = best_model_with_topo.score(
            x_test_model, test_y)
        print(f'score best model and topo_feat {score_best_topo_and_model}')

        return (best_model_params, best_topo, best_model_config_with_topo,
                score, score_top, score_best_topo_and_model)
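
cross_validate relies on a best_combination helper that is not shown. Below is a minimal sketch consistent with how it is used: each entry is a result dictionary with a 'score' key next to the hyper-parameters (as built in Example #3), and the best-scoring one is returned as a copy so that 'score' can be popped afterwards.

def best_combination(validation_results):
    # Hypothetical reconstruction: each entry is a dict such as
    # {"k_min": 2, "k_max": 6, "dist_percentage": 0.05, "score": 0.87}.
    best = max(validation_results, key=lambda result: result["score"])
    return dict(best)  # copy, so popping "score" does not mutate the input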
Example #9
class EntropyGenerator(BaseEstimator, TransformerMixin):
    """Persistence entropies directly from point clouds.

    Implements a feature generation pipeline which computes persistence
    diagrams, scales and filters them, and then computes their persistence
    entropies.

    Parameters
    ----------
    metric : string or callable, optional, default: ``'euclidean'``
        If set to ``'precomputed'``, each entry in `X` along axis 0 is
        interpreted to be a distance matrix. Otherwise, entries are
        interpreted as feature arrays, and `metric` determines a rule with
        which to calculate distances between pairs of instances (i.e. rows)
        in these arrays.
        If `metric` is a string, it must be one of the options allowed by
        :obj:`scipy.spatial.distance.pdist` for its metric parameter, or a
        metric listed in :obj:`sklearn.pairwise.PAIRWISE_DISTANCE_FUNCTIONS`,
        including "euclidean", "manhattan" or "cosine".
        If `metric` is a callable function, it is called on each pair of
        instances and the resulting value recorded. The callable should take
        two arrays from the entry in `X` as input, and return a value
        indicating the distance between them.

    max_edge_length : float, optional, default: ``numpy.inf``
        Upper bound on the maximum value of the Vietoris-Rips filtration
        parameter. Points whose distance is greater than this value will
        never be connected by an edge, and topological features at scales
        larger than this value will not be detected.

    homology_dimensions : iterable, optional, default: ``(0, 1)``
        Dimensions (non-negative integers) of the topological features to be
        detected.

    scaler_metric : ``'bottleneck'`` | ``'wasserstein'`` | ``'landscape'`` | \
                    ``'betti'`` | ``'heat'``, optional, default: \
                    ``'bottleneck'``
        Distance or dissimilarity function used to define the amplitude of
        a subdiagram as its distance from the diagonal diagram:

        - ``'bottleneck'`` and ``'wasserstein'`` refer to the identically named
          perfect-matching--based notions of distance.
        - ``'landscape'`` refers to the :math:`L^p` distance between
          persistence landscapes.
        - ``'betti'`` refers to the :math:`L^p` distance between Betti curves.
        - ``'heat'`` refers to the :math:`L^p` distance between
          Gaussian-smoothed diagrams.

    scaler_metric_params : dict or None, optional, default: ``None``
        Additional keyword arguments for `scaler_metric`:

        - If ``metric == 'bottleneck'`` there are no available arguments.
        - If ``metric == 'wasserstein'`` the only argument is `p` (int,
          default: ``2``).
        - If ``metric == 'betti'`` the available arguments are `p` (float,
          default: ``2.``) and `n_values` (int, default: ``100``).
        - If ``metric == 'landscape'`` the available arguments are `p`
          (float, default: ``2.``), `n_values` (int, default: ``100``) and
          `n_layers` (int, default: ``1``).
        - If ``metric == 'heat'`` the available arguments are `p` (float,
          default: ``2.``), `sigma` (float, default: ``1.``) and `n_values`
          (int, default: ``100``).

    scaler_function : callable, optional, default: ``numpy.max``
        Function used to extract a single positive scalar from the collection
        of norms of diagrams.

    filter_epsilon : float, optional, default: ``0.``
        The cutoff value controlling the amount of filtering.

    n_jobs : int or None, optional, default: ``None``
        The number of jobs to use for the computation. ``None`` means 1
        unless in a :obj:`joblib.parallel_backend` context. ``-1`` means
        using all processors.

    Examples
    --------
    >>> from giotto.meta_transformers import EntropyGenerator as eg
    >>> import numpy as np
    >>> ent = eg()
    >>> X = np.asarray([[[1, 2], [2, 1], [1, 1]]])
    >>> Xt = ent.fit_transform(X)
    >>> print(Xt)
    [[0.69314718, -0.]]

    """
    def __init__(self,
                 metric='euclidean',
                 max_edge_length=np.inf,
                 homology_dimensions=(0, 1),
                 scaler_metric='bottleneck',
                 scaler_metric_params=None,
                 scaler_function=np.max,
                 filter_epsilon=0.,
                 n_jobs=None):
        self.metric = metric
        self.max_edge_length = max_edge_length
        self.homology_dimensions = homology_dimensions
        self.scaler_metric = scaler_metric
        self.scaler_metric_params = scaler_metric_params
        self.scaler_function = scaler_function
        self.filter_epsilon = filter_epsilon
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Do nothing and return the estimator unchanged.

        This method is there to implement the usual scikit-learn API and hence
        work in pipelines.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_points, n_dimensions)
            Input data. ``n_samples`` is the number of point clouds,
            ``n_points`` is the number of points per point cloud and
            ``n_dimensions`` is the number of features for each point of the
            point cloud (i.e. the dimension of the point cloud space).

        y : None
            Ignored.

        Returns
        -------
        self : object

        """

        steps = [('diagram',
                  hl.VietorisRipsPersistence(
                      metric=self.metric,
                      max_edge_length=self.max_edge_length,
                      homology_dimensions=self.homology_dimensions,
                      n_jobs=self.n_jobs)),
                 ('scaler',
                  diag.Scaler(metric=self.scaler_metric,
                              metric_params=self.scaler_metric_params,
                              function=self.scaler_function,
                              n_jobs=self.n_jobs)),
                 ('filter', diag.Filtering(epsilon=self.filter_epsilon)),
                 ('entropy', diag.PersistenceEntropy(n_jobs=self.n_jobs))]

        self._pipeline = Pipeline(steps).fit(X)

        return self

    def transform(self, X, y=None):
        """Extract persistence entropies from the sample point clouds in `X`.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_points, n_dimensions)
            Input data. ``n_samples`` is the number of point clouds,
            ``n_points`` is the number of points per point cloud and
            ``n_dimensions`` is the number of features for each point of the
            point cloud (i.e. the dimension of the point cloud space).

        y : None
            There is no need for a target in a transformer, yet the pipeline
            API requires this parameter.

        Returns
        -------
        Xt : ndarray, shape (n_samples, n_homology_dimensions)
            For each point cloud in `X`, one persistence entropy per homology
            dimension in `homology_dimensions`.

        """

        Xt = self._pipeline.transform(X)
        return Xt
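
Because the meta-transformer implements the scikit-learn API, it can be chained with an estimator. A minimal sketch, assuming scikit-learn's Pipeline and randomly generated point clouds:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline as SkPipeline
from giotto.meta_transformers import EntropyGenerator

# Ten random point clouds of 20 points in 3 dimensions, with binary labels.
X = np.random.random((10, 20, 3))
y = np.random.randint(0, 2, size=10)

clf = SkPipeline([
    ('entropy', EntropyGenerator(n_jobs=1)),
    ('classify', RandomForestClassifier(n_estimators=10, random_state=0)),
])
clf.fit(X, y)
print(clf.predict(X[:2]))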
Example #10
def varying_noise(n_steps, n_series, args_stable, args_aperiodic):
    # noise parameters
    min_noise = 0.0
    max_noise = 2.1
    step_size = 0.1
    std = 0.1

    parameters_type = "fixed"
    embedding_dimension = 2
    embedding_time_delay = 3
    n_jobs = 1

    window_width = 121 - ((embedding_dimension - 1) * embedding_time_delay + 1)
    # window_stride = 1

    metric = "euclidean"
    max_edge_length = 10
    homology_dimensions = [0, 1]

    epsilon = 0.0

    steps = [
        (
            "embedding",
            ts.TakensEmbedding(
                parameters_type=parameters_type,
                dimension=embedding_dimension,
                time_delay=embedding_time_delay,
                n_jobs=n_jobs,
            ),
        ),
        ("window", ts.SlidingWindow(width=window_width, stride=1)),
        (
            "diagrams",
            hl.VietorisRipsPersistence(
                metric=metric,
                max_edge_length=max_edge_length,
                homology_dimensions=homology_dimensions,
                n_jobs=n_jobs,
            ),
        ),
        ("diagrams_scaler", diag.Scaler()),
        ("diagrams_filter", diag.Filtering(epsilon=epsilon)),
    ]

    pipeline = Pipeline(steps)

    # maximal number of repetitions per noise level (for confidence intervals)
    max_itr = 5

    # data frames to save performance
    perf_train = pd.DataFrame(
        columns=["Score", "Type", "Mean Standard Deviation of Noise"]
    )
    perf_test = pd.DataFrame(
        columns=["Score", "Type", "Mean Standard Deviation of Noise"]
    )

    mb = master_bar(np.arange(min_noise, max_noise, step_size))
    for noise in mb:
        for _ in progress_bar(range(max_itr), parent=mb):
            mb.child.comment = "Repetitions per noise level"
            data = simulate_data(
                noise, std, n_steps, n_series, args_stable, args_aperiodic
            )
            # group data by type and series id
            grouped_data = data.groupby(["type", "series_id"])

            y_true = np.repeat([1, 0], n_series)
            id_train, id_test, y_train, y_test = train_test_split(
                range(2 * n_series), y_true, train_size=0.7, random_state=0
            )

            # classical k-means ###########################################################
            X = data["adults"].values.reshape((2 * n_series, -1))
            # train/test data
            X_train = X[id_train, :]
            X_test = X[id_test, :]

            # k means
            kmeans = KMeans(n_clusters=2, random_state=0)
            kmeans.fit(X_train)

            perf_train = perf_train.append(
                {
                    "Score": homogeneity_score(y_train, kmeans.labels_),
                    "Type": "Classic",
                    "Mean Standard Deviation of Noise": noise,
                },
                ignore_index=True,
            )

            perf_test = perf_test.append(
                {
                    "Score": homogeneity_score(y_test, kmeans.predict(X_test)),
                    "Type": "Classic",
                    "Mean Standard Deviation of Noise": noise,
                },
                ignore_index=True,
            )

            # threshold to determine whether a hole is relevant or not
            frac = 0.7

            # TDA k-means
            features = []
            for name, _ in grouped_data:
                X_filtered = pipeline.fit_transform(
                    grouped_data.get_group(name)["adults"].values
                )
                n_windows, n_points, _ = X_filtered.shape
                features.append(
                    get_mean_lifetime(X_filtered, n_windows, n_points)
                    + get_n_rel_holes(X_filtered, n_windows, n_points, frac=frac)
                    + get_n_rel_holes(X_filtered, n_windows, n_points, frac=0.0)
                    + get_max_lifetime(X_filtered, n_windows, n_points)
                    + get_amplitude(X_filtered)
                )

            # define data matrix for k-means
            X_tda = np.array(features)

            X_tda_train = X_tda[id_train, :]
            X_tda_test = X_tda[id_test, :]

            # k means
            kmeans_tda = KMeans(n_clusters=2, random_state=0)
            kmeans_tda.fit(X_tda_train)

            perf_train = perf_train.append(
                {
                    "Score": homogeneity_score(y_train, kmeans_tda.labels_),
                    "Type": "TDA",
                    "Mean Standard Deviation of Noise": noise,
                },
                ignore_index=True,
            )

            perf_test = perf_test.append(
                {
                    "Score": homogeneity_score(y_test, kmeans_tda.predict(X_tda_test)),
                    "Type": "TDA",
                    "Mean Standard Deviation of Noise": noise,
                },
                ignore_index=True,
            )
        mb.first_bar.comment = "Noise level"

    # write performance metrics to disk
    with open("models/performance_metrics_train.pkl", "wb") as file:
        pickle.dump(perf_train, file)

    with open("models/performance_metrics_test.pkl", "wb") as file:
        pickle.dump(perf_test, file)
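
The two performance frames written above can later be read back, e.g. for plotting:

import pickle

with open("models/performance_metrics_train.pkl", "rb") as file:
    perf_train = pickle.load(file)
with open("models/performance_metrics_test.pkl", "rb") as file:
    perf_test = pickle.load(file)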