예제 #1
0
def get_steps():
    """Return the named steps of the time-series TDA pipeline.

    Every estimator is created with its default settings, except for the
    sliding window (``width=5``, ``stride=1``), the diagram filter
    (``epsilon=0.1``) and the min-max scaler (``copy=True``).

    Returns
    -------
    steps : list of (str, estimator) tuples
        Steps in execution order: Takens embedding, sliding window,
        Vietoris-Rips persistence, diagram rescaling, diagram filtering,
        persistence entropy and min-max scaling.

    """
    embedding = ts.TakensEmbedding()
    window = ts.SlidingWindow(width=5, stride=1)
    diagram = hl.VietorisRipsPersistence()
    rescaler = diag.Scaler()
    diagram_filter = diag.Filtering(epsilon=0.1)
    entropy = diag.PersistenceEntropy()
    scaling = skprep.MinMaxScaler(copy=True)

    return [
        ('embedding', embedding),
        ('window', window),
        ('diagram', diagram),
        ('rescaler', rescaler),
        ('filter', diagram_filter),
        ('entropy', entropy),
        ('scaling', scaling),
    ]
예제 #2
0
    def _compute_persistence_diagrams(self, X: pd.DataFrame) -> np.ndarray:
        """Turn ``X`` into rescaled persistence diagrams.

        ``X`` is passed through the estimator's Takens embedding, sliding
        window and Vietoris-Rips transformers (each one is re-fitted on its
        input), and the resulting diagrams are rescaled with a freshly fitted
        ``diag.Scaler``.

        As a side effect, the shape of the embedded data is stored in
        ``self.X_embedded_dims_``.

        Parameters
        ----------
        X : pd.DataFrame
            Input data handed to the Takens embedding.

        Returns
        -------
        np.ndarray
            Output of ``diag.Scaler().transform`` on the computed diagrams.

        """
        embedded = self._takens_embedding.fit_transform(X)
        # Keep the embedded shape around for later use by the estimator.
        self.X_embedded_dims_ = embedded.shape

        diagrams = self.vietoris_rips_persistence.fit_transform(
            self.sliding_window.fit_transform(embedded)
        )

        scaler = diag.Scaler()
        scaler.fit(diagrams)
        return scaler.transform(diagrams)
예제 #3
0
    def fit(self, X, y=None):
        """Build the point-cloud-to-landscape pipeline and fit it on ``X``.

        A :class:`Pipeline` chaining Vietoris-Rips persistence, diagram
        rescaling, diagram filtering and persistence landscapes is created
        from the estimator's hyperparameters, fitted on ``X`` and stored in
        ``self._pipeline``.

        This method is there to implement the usual scikit-learn API and hence
        work in pipelines.

        Parameters
        ----------
        X : ndarray, shape: (n_samples, n_points, n_dimensions)
            Input data. ``n_samples`` is the number of point clouds,
            ``n_points`` is the number of points per point cloud and
            ``n_dimensions`` is the number of features for each point of
            the point cloud (i.e. the dimension of the point cloud space).

        y : None
            Ignored.

        Returns
        -------
        self : object
            Returns self.

        """
        persistence = hl.VietorisRipsPersistence(
            metric=self.metric,
            max_edge_length=self.max_edge_length,
            homology_dimensions=self.homology_dimensions,
            n_jobs=self.n_jobs,
        )
        rescaler = diag.Scaler(
            metric=self.scaler_metric,
            metric_params=self.scaler_metric_params,
            function=self.function,
            n_jobs=self.n_jobs,
        )
        diagram_filter = diag.Filtering(
            epsilon=self.epsilon,
            homology_dimensions=self.homology_dimensions,
        )
        landscape = diag.PersistenceLandscape(
            n_values=self.n_values,
            n_layers=self.n_layers,
        )

        pipeline = Pipeline([
            ('diagram', persistence),
            ('rescaler', rescaler),
            ('filter', diagram_filter),
            ('landscape', landscape),
        ])
        self._pipeline = pipeline.fit(X)
        return self
예제 #4
0
    def fit(self, X, y=None):
        """Assemble the Betti-curve :class:`Pipeline`, fit it and return
        the estimator.

        The pipeline chains Vietoris-Rips persistence, diagram rescaling,
        diagram filtering and Betti curves, each configured from the
        estimator's hyperparameters. The fitted pipeline is stored in
        ``self._pipeline``.

        This method is there to implement the usual scikit-learn API and hence
        work in pipelines.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_points, n_dimensions)
            Input data. ``n_samples`` is the number of point clouds,
            ``n_points`` is the number of points per point cloud and
            ``n_dimensions`` is the number of features for each point of the
            point cloud (i.e. the dimension of the point cloud space)

        y : None
            Ignored.

        Returns
        -------
        self : object

        """
        steps = []
        steps.append((
            'diagram',
            hl.VietorisRipsPersistence(
                metric=self.metric,
                max_edge_length=self.max_edge_length,
                homology_dimensions=self.homology_dimensions,
                n_jobs=self.n_jobs,
            ),
        ))
        steps.append((
            'scaler',
            diag.Scaler(
                metric=self.scaler_metric,
                metric_params=self.scaler_metric_params,
                function=self.scaler_function,
                n_jobs=self.n_jobs,
            ),
        ))
        steps.append(('filter', diag.Filtering(epsilon=self.filter_epsilon)))
        steps.append(('betticurve', diag.BettiCurve(n_values=self.n_values)))

        fitted_pipeline = Pipeline(steps).fit(X)
        self._pipeline = fitted_pipeline
        return self
예제 #5
0
def get_pd_from_molecule(molecule_name, structures):
    """
    Compute the rescaled persistence diagram of a single molecule.

    INPUT:
        molecule_name: name of the molecule as given in the structures file
        structures: structures table with a 'molecule_name' column and the
            'x', 'y', 'z' coordinate columns for all molecules

    OUTPUT:
        X_scaled: scaled persistence diagrams (homology dimensions 0, 1, 2)
    """
    belongs_to_molecule = structures['molecule_name'] == molecule_name
    coordinates = structures[belongs_to_molecule][['x', 'y', 'z']].to_numpy()

    # Add a leading axis of length 1 so the molecule forms a one-sample batch.
    point_cloud = coordinates.reshape(
        (1, coordinates.shape[0], coordinates.shape[1]))

    vietoris_rips = VietorisRipsPersistence(
        metric='euclidean', homology_dimensions=[0, 1, 2], n_jobs=1)
    vietoris_rips.fit(point_cloud)
    diagrams = vietoris_rips.transform(point_cloud)

    scaler = diag.Scaler()
    scaler.fit(diagrams)
    return scaler.transform(diagrams)
예제 #6
0
def tda_diagrams(path,
                 embedding_time_delay,
                 embedding_dimension,
                 window_width,
                 window_stride,
                 homology_dim=2,
                 return_betti_surface=False):
    """
    Compute scaled persistence diagrams and Betti curves for a dataset.

    The 'coord_0' column of the dataset is Takens-embedded (parameters found
    by search, bounded by the given delay and dimension), cut into sliding
    windows, turned into Vietoris-Rips persistence diagrams for homology
    dimensions 0, 1 and 2, and rescaled. Betti curves are then computed from
    the scaled diagrams. The embedding parameters selected by the search are
    printed.

    INPUT:
        path: int (number to OpenML dataset), passed to ``get_dataset``
        embedding_time_delay: int, ``time_delay`` given to the embedder
        embedding_dimension: int, ``dimension`` given to the embedder
        window_width: int
        window_stride: int
        homology_dim: int, currently unused -- the homology dimensions are
            fixed to [0, 1, 2]
        return_betti_surface: boolean, also return the raw Betti curve array

    OUTPUT:
        X_scaled: persistence diagrams
        df_betti_list: List of Betti curve DataFrames, one per homology
            dimension
        X_betti_curves: raw Betti curve array, only returned when
            ``return_betti_surface`` is true
    """

    df = get_dataset(path)
    df = df.get_data()[0]
    df.rename({'label': 'y', 'coord_0': 'x'}, axis='columns', inplace=True)
    # NOTE(review): 'idx' is not read anywhere below; kept because ``df`` is
    # mutated in place and may be observed outside this function -- confirm.
    df['idx'] = np.arange(len(df))

    embedder = ts.TakensEmbedding(parameters_type='search',
                                  dimension=embedding_dimension,
                                  time_delay=embedding_time_delay,
                                  n_jobs=-1)
    embedder.fit(df['x'])
    embedder_time_delay = embedder.time_delay_
    embedder_dimension = embedder.dimension_

    print('Optimal embedding time delay based on mutual information: ',
          embedder_time_delay)
    print('Optimal embedding dimension based on false nearest neighbors: ',
          embedder_dimension)

    X_embedded, y_embedded = embedder.transform_resample(df['x'], df['y'])
    sliding_window = ts.SlidingWindow(width=window_width, stride=window_stride)
    sliding_window.fit(X_embedded, y_embedded)

    # The resampled labels are not needed for the diagrams.
    X_windows, _ = sliding_window.transform_resample(X_embedded, y_embedded)

    homology_dimensions = [0, 1, 2]
    persistence_diagram = hl.VietorisRipsPersistence(
        metric='euclidean',
        max_edge_length=10,
        homology_dimensions=homology_dimensions,
        n_jobs=-1)

    X_diagrams = persistence_diagram.fit_transform(X_windows)
    diagram_scaler = diag.Scaler()
    diagram_scaler.fit(X_diagrams)
    X_scaled = diagram_scaler.transform(X_diagrams)

    betti_curves = diag.BettiCurve()
    betti_curves.fit(X_scaled)
    X_betti_curves = betti_curves.transform(X_scaled)

    # Axis 1 of the Betti curve array is indexed by homology dimension here,
    # which works because the dimensions are exactly 0, 1, 2.
    df_betti_list = [
        pd.DataFrame(X_betti_curves[:, dim, :]) for dim in homology_dimensions
    ]

    if return_betti_surface:
        return (X_scaled, df_betti_list, X_betti_curves)
    return (X_scaled, df_betti_list)
예제 #7
0
def _perf_record(score, method, noise):
    """Build one row of a performance table."""
    return {
        "Score": score,
        "Type": method,
        "Mean Standard Deviation of Noise": noise,
    }


def _tda_features(pipeline, series, frac):
    """Fit the diagram pipeline on one series and combine its TDA features."""
    X_filtered = pipeline.fit_transform(series)
    n_windows, n_points, _ = X_filtered.shape
    return (
        get_mean_lifetime(X_filtered, n_windows, n_points)
        + get_n_rel_holes(X_filtered, n_windows, n_points, frac=frac)
        + get_n_rel_holes(X_filtered, n_windows, n_points, frac=0.0)
        + get_max_lifetime(X_filtered, n_windows, n_points)
        + get_amplitude(X_filtered)
    )


def varying_noise(n_steps, n_series, args_stable, args_aperiodic):
    """Compare classic and TDA-based k-means over a range of noise levels.

    For every noise level in ``np.arange(0.0, 2.1, 0.1)`` the data is
    simulated five times. On each simulation two 2-cluster k-means models
    are fitted -- one on the raw "adults" series, one on features derived
    from persistence diagrams -- and their homogeneity scores on a fixed
    70/30 train/test split are recorded.

    The resulting score tables (columns "Score", "Type" and
    "Mean Standard Deviation of Noise") are pickled to
    ``models/performance_metrics_train.pkl`` and
    ``models/performance_metrics_test.pkl``; the ``models`` directory is
    created if it does not exist.

    Parameters
    ----------
    n_steps, args_stable, args_aperiodic
        Forwarded unchanged to ``simulate_data``.
    n_series : int
        Forwarded to ``simulate_data``; also the number of series per class.
        The first ``n_series`` series are labelled 1 and the next
        ``n_series`` are labelled 0 (assumes ``simulate_data`` returns the
        two classes in that order -- TODO confirm).

    Returns
    -------
    None

    """
    import os  # local import: only needed to create the output directory

    # noise parameters
    min_noise = 0.0
    max_noise = 2.1
    step_size = 0.1
    std = 0.1

    parameters_type = "fixed"
    embedding_dimension = 2
    embedding_time_delay = 3
    n_jobs = 1

    # NOTE(review): 121 is presumably the expected series length, so that a
    # single window spans the whole embedded series -- confirm against
    # n_steps.
    window_width = 121 - ((embedding_dimension - 1) * embedding_time_delay + 1)

    metric = "euclidean"
    max_edge_length = 10
    homology_dimensions = [0, 1]

    epsilon = 0.0

    # threshold to determine whether a hole is relevant or not
    frac = 0.7

    steps = [
        (
            "embedding",
            ts.TakensEmbedding(
                parameters_type=parameters_type,
                dimension=embedding_dimension,
                time_delay=embedding_time_delay,
                n_jobs=n_jobs,
            ),
        ),
        ("window", ts.SlidingWindow(width=window_width, stride=1)),
        (
            "diagrams",
            hl.VietorisRipsPersistence(
                metric=metric,
                max_edge_length=max_edge_length,
                homology_dimensions=homology_dimensions,
                n_jobs=n_jobs,
            ),
        ),
        ("diagrams_scaler", diag.Scaler()),
        ("diagrams_filter", diag.Filtering(epsilon=epsilon)),
    ]

    pipeline = Pipeline(steps)

    # maximal number of repetitions per noise level (for confidence intervals)
    max_itr = 5

    # Rows are collected in plain lists and converted to data frames once at
    # the end; an ordered column list keeps the table layout deterministic.
    columns = ["Score", "Type", "Mean Standard Deviation of Noise"]
    train_records = []
    test_records = []

    # Labels and split depend only on n_series and a fixed random_state, so
    # they are identical for every repetition and computed once.
    y_true = np.repeat([1, 0], n_series)
    id_train, id_test, y_train, y_test = train_test_split(
        range(2 * n_series), y_true, train_size=0.7, random_state=0
    )

    mb = master_bar(np.arange(min_noise, max_noise, step_size))
    for noise in mb:
        for _ in progress_bar(range(max_itr), parent=mb):
            mb.child.comment = "Repetitions per noise level"
            data = simulate_data(
                noise, std, n_steps, n_series, args_stable, args_aperiodic
            )
            # group data by type and series id
            grouped_data = data.groupby(["type", "series_id"])

            # classical k-means: one row per series of raw values
            X = data["adults"].values.reshape((2 * n_series, -1))
            X_train = X[id_train, :]
            X_test = X[id_test, :]

            kmeans = KMeans(n_clusters=2, random_state=0)
            kmeans.fit(X_train)

            train_records.append(
                _perf_record(
                    homogeneity_score(y_train, kmeans.labels_),
                    "Classic",
                    noise,
                )
            )
            test_records.append(
                _perf_record(
                    homogeneity_score(y_test, kmeans.predict(X_test)),
                    "Classic",
                    noise,
                )
            )

            # TDA k-means: one feature vector per (type, series_id) group
            features = [
                _tda_features(pipeline, group["adults"].values, frac)
                for _, group in grouped_data
            ]

            # define data matrix for k-means
            X_tda = np.array(features)
            X_tda_train = X_tda[id_train, :]
            X_tda_test = X_tda[id_test, :]

            kmeans_tda = KMeans(n_clusters=2, random_state=0)
            kmeans_tda.fit(X_tda_train)

            train_records.append(
                _perf_record(
                    homogeneity_score(y_train, kmeans_tda.labels_),
                    "TDA",
                    noise,
                )
            )
            test_records.append(
                _perf_record(
                    homogeneity_score(y_test, kmeans_tda.predict(X_tda_test)),
                    "TDA",
                    noise,
                )
            )
        mb.first_bar.comment = "Noise level"

    perf_train = pd.DataFrame(train_records, columns=columns)
    perf_test = pd.DataFrame(test_records, columns=columns)

    # write performance metrics to disk; make sure the target directory
    # exists so the results of the simulation are not lost
    os.makedirs("models", exist_ok=True)

    with open("models/performance_metrics_train.pkl", "wb") as file:
        pickle.dump(perf_train, file)

    with open("models/performance_metrics_test.pkl", "wb") as file:
        pickle.dump(perf_test, file)