예제 #1
0
def get_steps():
    """Return the ordered ``(name, transformer)`` pairs of the processing chain.

    The transformers are instantiated in the order they appear in the
    returned list: Takens embedding, sliding window (width 5, stride 1),
    Vietoris-Rips persistence, diagram scaling, diagram filtering with
    ``epsilon=0.1``, persistence entropy and finally min-max scaling.

    Returns
    -------
    steps : list of (str, object) tuples
        Step names paired with freshly constructed transformer instances.

    """
    embedding = ts.TakensEmbedding()
    window = ts.SlidingWindow(width=5, stride=1)
    diagram = hl.VietorisRipsPersistence()
    rescaler = diag.Scaler()
    diagram_filter = diag.Filtering(epsilon=0.1)
    entropy = diag.PersistenceEntropy()
    scaling = skprep.MinMaxScaler(copy=True)

    return [
        ('embedding', embedding),
        ('window', window),
        ('diagram', diagram),
        ('rescaler', rescaler),
        ('filter', diagram_filter),
        ('entropy', entropy),
        ('scaling', scaling),
    ]
예제 #2
0
    def __init__(
        self,
        output_name: str,
        takens_parameters_type: str = "search",
        takens_dimension: int = 5,
        takens_stride: int = 1,
        takens_time_delay: int = 1,
        takens_n_jobs: int = 1,
        sliding_window_width: int = 10,
        sliding_stride: int = 1,
        diags_metric: str = "euclidean",
        diags_coeff: int = 2,
        diags_max_edge_length: float = np.inf,
        diags_homology_dimensions: Iterable = (0, 1, 2),
        diags_infinity_values: float = None,
        diags_n_jobs: int = 1,
    ):
        """Create the embedding, windowing and persistence transformers.

        Parameters
        ----------
        output_name : str
            Passed unchanged to the parent class constructor.
        takens_parameters_type : str, default "search"
            Forwarded to ``TakensEmbedding`` as ``parameters_type``.
        takens_dimension : int, default 5
            Forwarded to ``TakensEmbedding`` as ``dimension``; also stored
            on the instance.
        takens_stride : int, default 1
            Forwarded to ``TakensEmbedding`` as ``stride``; also stored on
            the instance.
        takens_time_delay : int, default 1
            Forwarded to ``TakensEmbedding`` as ``time_delay``; also stored
            on the instance.
        takens_n_jobs : int, default 1
            Forwarded to ``TakensEmbedding`` as ``n_jobs``.
        sliding_window_width : int, default 10
            Forwarded to ``SlidingWindow`` as ``width``; also stored on the
            instance.
        sliding_stride : int, default 1
            Forwarded to ``SlidingWindow`` as ``stride``; also stored on the
            instance.
        diags_metric : str, default "euclidean"
            Forwarded to ``VietorisRipsPersistence`` as ``metric``.
        diags_coeff : int, default 2
            Forwarded to ``VietorisRipsPersistence`` as ``coeff``.
        diags_max_edge_length : float, default ``np.inf``
            Forwarded to ``VietorisRipsPersistence`` as ``max_edge_length``.
        diags_homology_dimensions : Iterable, default (0, 1, 2)
            Forwarded to ``VietorisRipsPersistence`` as
            ``homology_dimensions``.
        diags_infinity_values : float or None, default None
            Forwarded to ``VietorisRipsPersistence`` as ``infinity_values``.
        diags_n_jobs : int, default 1
            Forwarded to ``VietorisRipsPersistence`` as ``n_jobs``.

        """
        super().__init__(output_name)

        self._takens_embedding = TakensEmbedding(
            parameters_type=takens_parameters_type,
            dimension=takens_dimension,
            stride=takens_stride,
            time_delay=takens_time_delay,
            n_jobs=takens_n_jobs,
        )
        # Keep the embedding settings on the instance; each one is assigned
        # exactly once.
        self.takens_dimension = takens_dimension
        self.takens_stride = takens_stride
        self.takens_time_delay = takens_time_delay

        self.sliding_window = SlidingWindow(
            width=sliding_window_width, stride=sliding_stride
        )
        self.sliding_window_width = sliding_window_width
        self.sliding_stride = sliding_stride

        self.vietoris_rips_persistence = hl.VietorisRipsPersistence(
            metric=diags_metric,
            coeff=diags_coeff,
            max_edge_length=diags_max_edge_length,
            homology_dimensions=diags_homology_dimensions,
            infinity_values=diags_infinity_values,
            n_jobs=diags_n_jobs,
        )
예제 #3
0
    def fit(self, X, y=None):
        """Build the internal :class:`Pipeline`, fit it on `X` and return
        the estimator.

        The pipeline chains, in order, Vietoris-Rips persistence, diagram
        scaling, diagram filtering and persistence landscapes, each
        configured from this estimator's attributes. The fitted pipeline is
        stored in ``self._pipeline``.

        This method is there to implement the usual scikit-learn API and hence
        work in pipelines.

        Parameters
        ----------
        X : ndarray, shape: (n_samples, n_points, n_dimensions)
            Input data. ``n_samples`` is the number of point clouds,
            ``n_points`` is the number of points per point cloud and
            ``n_dimensions`` is the number of features for each point of
            the point cloud (i.e. the dimension of the point cloud space).

        y : None
            Ignored.

        Returns
        -------
        self : object
            Returns self.

        """
        persistence = hl.VietorisRipsPersistence(
            metric=self.metric,
            max_edge_length=self.max_edge_length,
            homology_dimensions=self.homology_dimensions,
            n_jobs=self.n_jobs,
        )
        rescaler = diag.Scaler(
            metric=self.scaler_metric,
            metric_params=self.scaler_metric_params,
            function=self.function,
            n_jobs=self.n_jobs,
        )
        diagram_filter = diag.Filtering(
            epsilon=self.epsilon,
            homology_dimensions=self.homology_dimensions,
        )
        landscape = diag.PersistenceLandscape(
            n_values=self.n_values,
            n_layers=self.n_layers,
        )

        pipeline = Pipeline([
            ('diagram', persistence),
            ('rescaler', rescaler),
            ('filter', diagram_filter),
            ('landscape', landscape),
        ])
        self._pipeline = pipeline.fit(X)
        return self
예제 #4
0
    def fit(self, X, y=None):
        """Assemble a :class:`Pipeline` from this estimator's settings, fit
        it on `X` and return the estimator.

        The pipeline runs Vietoris-Rips persistence, diagram scaling,
        diagram filtering and Betti curve computation, in that order. The
        fitted pipeline is kept in ``self._pipeline``.

        This method is there to implement the usual scikit-learn API and hence
        work in pipelines.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_points, n_dimensions)
            Input data. ``n_samples`` is the number of point clouds,
            ``n_points`` is the number of points per point cloud and
            ``n_dimensions`` is the number of features for each point of the
            point cloud (i.e. the dimension of the point cloud space)

        y : None
            Ignored.

        Returns
        -------
        self : object
            The estimator itself.

        """
        diagram_step = hl.VietorisRipsPersistence(
            metric=self.metric,
            max_edge_length=self.max_edge_length,
            homology_dimensions=self.homology_dimensions,
            n_jobs=self.n_jobs,
        )
        scaler_step = diag.Scaler(
            metric=self.scaler_metric,
            metric_params=self.scaler_metric_params,
            function=self.scaler_function,
            n_jobs=self.n_jobs,
        )
        filter_step = diag.Filtering(epsilon=self.filter_epsilon)
        betti_step = diag.BettiCurve(n_values=self.n_values)

        named_steps = list(zip(
            ('diagram', 'scaler', 'filter', 'betticurve'),
            (diagram_step, scaler_step, filter_step, betti_step),
        ))

        fitted = Pipeline(named_steps).fit(X)
        self._pipeline = fitted
        return self
예제 #5
0
def tda_diagrams(path,
                 embedding_time_delay,
                 embedding_dimension,
                 window_width,
                 window_stride,
                 homology_dim=2,
                 return_betti_surface=False):
    """
    Compute scaled persistence diagrams and Betti curves for a dataset.

    The ``coord_0`` column of the dataset is Takens-embedded (with a
    parameter search bounded by the given delay and dimension), cut into
    sliding windows, turned into Vietoris-Rips persistence diagrams and
    rescaled. Betti curves are then computed from the scaled diagrams.

    INPUT:
        path: int (number to OpenML dataset)
        embedding_time_delay: int, passed to the Takens embedding as
            ``time_delay``
        embedding_dimension: int, passed to the Takens embedding as
            ``dimension``
        window_width: int
        window_stride: int
        homology_dim: int
            NOTE: currently unused; homology dimensions 0, 1 and 2 are
            always computed.
        return_betti_surface: boolean

    OUTPUT:
        X_scaled: persistence diagrams
        df_betti_list: List of Betti curve DataFrames, one per homology
            dimension
        X_betti_curves: raw Betti curve array (only returned when
            ``return_betti_surface`` is true)
    """

    df = get_dataset(path).get_data()[0]
    df = df.rename(columns={'label': 'y', 'coord_0': 'x'})

    embedder = ts.TakensEmbedding(parameters_type='search',
                                  dimension=embedding_dimension,
                                  time_delay=embedding_time_delay,
                                  n_jobs=-1)
    embedder.fit(df['x'])
    # Values actually selected by the embedder's parameter search.
    embedder_time_delay = embedder.time_delay_
    embedder_dimension = embedder.dimension_

    print('Optimal embedding time delay based on mutual information: ',
          embedder_time_delay)
    print('Optimal embedding dimension based on false nearest neighbors: ',
          embedder_dimension)

    X_embedded, y_embedded = embedder.transform_resample(df['x'], df['y'])
    sliding_window = ts.SlidingWindow(width=window_width, stride=window_stride)
    sliding_window.fit(X_embedded, y_embedded)

    # The resampled labels are not needed further down.
    X_windows, _ = sliding_window.transform_resample(X_embedded, y_embedded)

    homology_dimensions = [0, 1, 2]
    persistence_diagram = hl.VietorisRipsPersistence(
        metric='euclidean',
        max_edge_length=10,
        homology_dimensions=homology_dimensions,
        n_jobs=-1)

    X_diagrams = persistence_diagram.fit_transform(X_windows)

    diagram_scaler = diag.Scaler()
    diagram_scaler.fit(X_diagrams)
    X_scaled = diagram_scaler.transform(X_diagrams)

    betti_curves = diag.BettiCurve()
    betti_curves.fit(X_scaled)
    X_betti_curves = betti_curves.transform(X_scaled)

    # One DataFrame per homology dimension, sliced along the second axis.
    df_betti_list = [pd.DataFrame(X_betti_curves[:, dim, :])
                     for dim in homology_dimensions]

    if return_betti_surface:
        return (X_scaled, df_betti_list, X_betti_curves)
    return (X_scaled, df_betti_list)
예제 #6
0
def _performance_row(score, model_type, noise):
    """Return one performance record keyed by the result column names."""
    return {
        "Score": score,
        "Type": model_type,
        "Mean Standard Deviation of Noise": noise,
    }


def varying_noise(n_steps, n_series, args_stable, args_aperiodic):
    """Compare classic and TDA-based k-means clustering across noise levels.

    For every noise level in ``np.arange(0.0, 2.1, 0.1)`` the data is
    simulated five times. Each time, a two-cluster k-means is fitted once on
    the raw ``"adults"`` series and once on features derived from
    persistence diagrams, and the homogeneity score on the train and test
    split is recorded. The collected scores are pickled to
    ``models/performance_metrics_train.pkl`` and
    ``models/performance_metrics_test.pkl``.

    Parameters
    ----------
    n_steps : forwarded to ``simulate_data``.
    n_series : int
        Forwarded to ``simulate_data``. The simulated data is treated as
        ``2 * n_series`` series, the first ``n_series`` labelled 1 and the
        remaining ones labelled 0.
    args_stable : forwarded to ``simulate_data``.
    args_aperiodic : forwarded to ``simulate_data``.

    Returns
    -------
    None
        Results are written to disk only.

    """
    # noise parameters
    min_noise = 0.0
    max_noise = 2.1
    step_size = 0.1
    std = 0.1

    parameters_type = "fixed"
    embedding_dimension = 2
    embedding_time_delay = 3
    n_jobs = 1

    # NOTE(review): 121 is presumably the length of each simulated series,
    # so that one window spans the whole embedded series -- TODO confirm.
    window_width = 121 - ((embedding_dimension - 1) * embedding_time_delay + 1)

    metric = "euclidean"
    max_edge_length = 10
    homology_dimensions = [0, 1]

    epsilon = 0.0

    steps = [
        (
            "embedding",
            ts.TakensEmbedding(
                parameters_type=parameters_type,
                dimension=embedding_dimension,
                time_delay=embedding_time_delay,
                n_jobs=n_jobs,
            ),
        ),
        ("window", ts.SlidingWindow(width=window_width, stride=1)),
        (
            "diagrams",
            hl.VietorisRipsPersistence(
                metric=metric,
                max_edge_length=max_edge_length,
                homology_dimensions=homology_dimensions,
                n_jobs=n_jobs,
            ),
        ),
        ("diagrams_scaler", diag.Scaler()),
        ("diagrams_filter", diag.Filtering(epsilon=epsilon)),
    ]

    pipeline = Pipeline(steps)

    # maximal number of repetitions per noise level (for confidence intervals)
    max_itr = 5

    # threshold to determine whether a hole is relevant or not
    frac = 0.7

    # Performance records are gathered in plain lists and turned into data
    # frames once at the end; growing a DataFrame row by row is quadratic
    # and DataFrame.append no longer exists in recent pandas.
    columns = ["Score", "Type", "Mean Standard Deviation of Noise"]
    train_rows = []
    test_rows = []

    # The split is seeded and does not depend on the simulated values, so it
    # is identical for every repetition and can be computed once.
    y_true = np.repeat([1, 0], n_series)
    id_train, id_test, y_train, y_test = train_test_split(
        range(2 * n_series), y_true, train_size=0.7, random_state=0
    )

    mb = master_bar(np.arange(min_noise, max_noise, step_size))
    for noise in mb:
        for _ in progress_bar(range(max_itr), parent=mb):
            mb.child.comment = "Repetitions per noise level"
            data = simulate_data(
                noise, std, n_steps, n_series, args_stable, args_aperiodic
            )
            # group data by type and series id
            grouped_data = data.groupby(["type", "series_id"])

            # classical k-means: one row per series
            X = data["adults"].values.reshape((2 * n_series, -1))
            X_train = X[id_train, :]
            X_test = X[id_test, :]

            kmeans = KMeans(n_clusters=2, random_state=0)
            kmeans.fit(X_train)

            train_rows.append(_performance_row(
                homogeneity_score(y_train, kmeans.labels_), "Classic", noise
            ))
            test_rows.append(_performance_row(
                homogeneity_score(y_test, kmeans.predict(X_test)),
                "Classic",
                noise,
            ))

            # TDA k-means: one feature vector per (type, series_id) group
            features = []
            for _, group in grouped_data:
                X_filtered = pipeline.fit_transform(group["adults"].values)
                n_windows, n_points, _ = X_filtered.shape
                features.append(
                    get_mean_lifetime(X_filtered, n_windows, n_points)
                    + get_n_rel_holes(X_filtered, n_windows, n_points, frac=frac)
                    + get_n_rel_holes(X_filtered, n_windows, n_points, frac=0.0)
                    + get_max_lifetime(X_filtered, n_windows, n_points)
                    + get_amplitude(X_filtered)
                )

            # define data matrix for k-means
            X_tda = np.array(features)
            X_tda_train = X_tda[id_train, :]
            X_tda_test = X_tda[id_test, :]

            kmeans_tda = KMeans(n_clusters=2, random_state=0)
            kmeans_tda.fit(X_tda_train)

            train_rows.append(_performance_row(
                homogeneity_score(y_train, kmeans_tda.labels_), "TDA", noise
            ))
            test_rows.append(_performance_row(
                homogeneity_score(y_test, kmeans_tda.predict(X_tda_test)),
                "TDA",
                noise,
            ))
        mb.first_bar.comment = "Noise level"

    # An explicit list (not a set) keeps the column order deterministic.
    perf_train = pd.DataFrame(train_rows, columns=columns)
    perf_test = pd.DataFrame(test_rows, columns=columns)

    # write performance metrics to disk
    with open("models/performance_metrics_train.pkl", "wb") as file:
        pickle.dump(perf_train, file)

    with open("models/performance_metrics_test.pkl", "wb") as file:
        pickle.dump(perf_test, file)