def get_steps():
    steps = [
        ('embedding', ts.TakensEmbedding()),
        ('window', ts.SlidingWindow(width=5, stride=1)),
        ('diagram', hl.VietorisRipsPersistence()),
        ('rescaler', diag.Scaler()),
        ('filter', diag.Filtering(epsilon=0.1)),
        ('entropy', diag.PersistenceEntropy()),
        ('scaling', skprep.MinMaxScaler(copy=True)),
    ]
    return steps
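# Usage sketch for get_steps(). This is a minimal, illustrative example,
# not part of the original code: the aliases ts, hl, diag and skprep match
# those used throughout this module, and the import paths below assume
# giotto-learn (the library whose API these snippets use); adjust them if
# using the newer gtda package. The toy signal is made up.
import numpy as np
import pandas as pd
import giotto.time_series as ts
import giotto.homology as hl
import giotto.diagrams as diag
import sklearn.preprocessing as skprep
from giotto.pipeline import Pipeline

signal = np.sin(np.linspace(0, 10 * np.pi, 500))  # toy univariate series
pipeline = Pipeline(get_steps())
# One min-max-scaled persistence-entropy vector per sliding window.
X_features = pipeline.fit_transform(signal)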
def _compute_persistence_diagrams(self, X: pd.DataFrame) -> np.ndarray:
    X_embedded = self._takens_embedding.fit_transform(X)
    self.X_embedded_dims_ = X_embedded.shape
    X_windows = self.sliding_window.fit_transform(X_embedded)
    X_diagrams = self.vietoris_rips_persistence.fit_transform(X_windows)
    diagram_scaler = diag.Scaler()
    diagram_scaler.fit(X_diagrams)
    return diagram_scaler.transform(X_diagrams)
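# Minimal owner-class sketch for _compute_persistence_diagrams above. The
# attribute names (_takens_embedding, sliding_window,
# vietoris_rips_persistence) are the ones the method reads; the parameter
# values are illustrative and the aliases ts/hl follow the rest of this
# module.
class DiagramComputerSketch:
    def __init__(self):
        self._takens_embedding = ts.TakensEmbedding()
        self.sliding_window = ts.SlidingWindow(width=10, stride=1)
        self.vietoris_rips_persistence = hl.VietorisRipsPersistence(
            homology_dimensions=[0, 1], n_jobs=1)

    # Bind the module-level function above as a method; the class-body
    # assignment resolves the name to the module-level definition.
    _compute_persistence_diagrams = _compute_persistence_diagrams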
def fit(self, X, y=None):
    """Create a giotto :class:`Pipeline` object and fit it. Then, return
    the estimator.

    This method is there to implement the usual scikit-learn API and
    hence work in pipelines.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_points, n_dimensions)
        Input data. ``n_samples`` is the number of point clouds,
        ``n_points`` is the number of points per point cloud and
        ``n_dimensions`` is the number of features for each point of
        the point cloud (i.e. the dimension of the point cloud space).

    y : None
        Ignored.

    Returns
    -------
    self : object
        Returns self.

    """
    steps = [
        ('diagram', hl.VietorisRipsPersistence(
            metric=self.metric,
            max_edge_length=self.max_edge_length,
            homology_dimensions=self.homology_dimensions,
            n_jobs=self.n_jobs)),
        ('rescaler', diag.Scaler(
            metric=self.scaler_metric,
            metric_params=self.scaler_metric_params,
            function=self.function,
            n_jobs=self.n_jobs)),
        ('filter', diag.Filtering(
            epsilon=self.epsilon,
            homology_dimensions=self.homology_dimensions)),
        ('landscape', diag.PersistenceLandscape(
            n_values=self.n_values,
            n_layers=self.n_layers))]
    self._pipeline = Pipeline(steps).fit(X)
    return self
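# Self-contained sketch of the landscape pipeline that the fit() above
# assembles, with fixed values standing in for the self.* attributes. All
# parameter values here are illustrative, and the aliases/imports are the
# ones assumed in the sketch after get_steps().
X_clouds = np.random.random((3, 20, 2))  # 3 toy point clouds, 20 points in 2D
landscape_pipeline = Pipeline([
    ('diagram', hl.VietorisRipsPersistence(homology_dimensions=[0, 1])),
    ('rescaler', diag.Scaler()),
    ('filter', diag.Filtering(epsilon=0.1, homology_dimensions=[0, 1])),
    ('landscape', diag.PersistenceLandscape(n_values=100, n_layers=1)),
]).fit(X_clouds)
X_landscapes = landscape_pipeline.transform(X_clouds)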
def fit(self, X, y=None):
    """Create a giotto :class:`Pipeline` object and fit it. Then, return
    the estimator.

    This method is there to implement the usual scikit-learn API and
    hence work in pipelines.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_points, n_dimensions)
        Input data. ``n_samples`` is the number of point clouds,
        ``n_points`` is the number of points per point cloud and
        ``n_dimensions`` is the number of features for each point of
        the point cloud (i.e. the dimension of the point cloud space).

    y : None
        Ignored.

    Returns
    -------
    self : object

    """
    steps = [
        ('diagram', hl.VietorisRipsPersistence(
            metric=self.metric,
            max_edge_length=self.max_edge_length,
            homology_dimensions=self.homology_dimensions,
            n_jobs=self.n_jobs)),
        ('scaler', diag.Scaler(
            metric=self.scaler_metric,
            metric_params=self.scaler_metric_params,
            function=self.scaler_function,
            n_jobs=self.n_jobs)),
        ('filter', diag.Filtering(epsilon=self.filter_epsilon)),
        ('betticurve', diag.BettiCurve(n_values=self.n_values))]
    self._pipeline = Pipeline(steps).fit(X)
    return self
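# Matching sketch for the Betti-curve variant of fit(): same layout, with
# BettiCurve as the final step. Reuses X_clouds and the assumptions from
# the landscape sketch above; parameter values remain illustrative.
betti_pipeline = Pipeline([
    ('diagram', hl.VietorisRipsPersistence(homology_dimensions=[0, 1])),
    ('scaler', diag.Scaler()),
    ('filter', diag.Filtering(epsilon=0.1)),
    ('betticurve', diag.BettiCurve(n_values=100)),
]).fit(X_clouds)
# One Betti curve per homology dimension per sample:
# shape (n_samples, n_homology_dimensions, n_values).
X_betti = betti_pipeline.transform(X_clouds)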
def get_pd_from_molecule(molecule_name, structures):
    """
    INPUT:
        molecule_name: name of the molecule as given in the structures file
        structures: structures file containing information (x, y, z
            coordinates) for all molecules
    OUTPUT:
        X_scaled: scaled persistence diagrams
    """
    m = structures[structures['molecule_name'] == molecule_name][[
        'x', 'y', 'z'
    ]].to_numpy()
    # VietorisRipsPersistence expects a collection of point clouds, so
    # reshape to (1, n_points, n_dimensions).
    m = m.reshape((1, m.shape[0], m.shape[1]))
    homology_dimensions = [0, 1, 2]
    persistence_diagram = VietorisRipsPersistence(
        metric='euclidean',
        homology_dimensions=homology_dimensions,
        n_jobs=1)
    persistence_diagram.fit(m)
    X_diagrams = persistence_diagram.transform(m)
    diagram_scaler = diag.Scaler()
    diagram_scaler.fit(X_diagrams)
    X_scaled = diagram_scaler.transform(X_diagrams)
    return X_scaled
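# Toy call of get_pd_from_molecule, with a minimal structures DataFrame in
# the format the function expects (column names as used above; molecule
# name and coordinates are made up). The bare VietorisRipsPersistence name
# in the function suggests it was imported directly from giotto.homology.
toy_structures = pd.DataFrame({
    'molecule_name': ['methane'] * 5,
    'x': np.random.random(5),
    'y': np.random.random(5),
    'z': np.random.random(5),
})
toy_diagrams = get_pd_from_molecule('methane', toy_structures)
# toy_diagrams[0] lists (birth, death, homology_dimension) triples.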
def tda_diagrams(path, embedding_time_delay, embedding_dimension,
                 window_width, window_stride, homology_dim=2,
                 return_betti_surface=False):
    """
    INPUT:
        path: int (id of an OpenML dataset)
        embedding_time_delay: int
        embedding_dimension: int
        window_width: int
        window_stride: int
        homology_dim: int
        return_betti_surface: boolean
    OUTPUT:
        X_scaled: persistence diagrams
        df_betti_list: list of Betti curve DataFrames
        X_betti_curves: Betti surface (only if return_betti_surface is True)
    """
    df = get_dataset(path)
    df = df.get_data()[0]
    df.rename({'label': 'y', 'coord_0': 'x'}, axis='columns', inplace=True)
    df['idx'] = np.arange(len(df))

    # Takens embedding with parameter search for time delay and dimension
    embedder = ts.TakensEmbedding(parameters_type='search',
                                  dimension=embedding_dimension,
                                  time_delay=embedding_time_delay,
                                  n_jobs=-1)
    embedder.fit(df['x'])
    embedder_time_delay = embedder.time_delay_
    embedder_dimension = embedder.dimension_
    print('Optimal embedding time delay based on mutual information: ',
          embedder_time_delay)
    print('Optimal embedding dimension based on false nearest neighbors: ',
          embedder_dimension)

    X_embedded, y_embedded = embedder.transform_resample(df['x'], df['y'])

    sliding_window = ts.SlidingWindow(width=window_width,
                                      stride=window_stride)
    sliding_window.fit(X_embedded, y_embedded)
    X_windows, y_windows = sliding_window.transform_resample(
        X_embedded, y_embedded)

    # note: homology_dim is currently unused; the dimensions are fixed here
    homology_dimensions = [0, 1, 2]
    persistence_diagram = hl.VietorisRipsPersistence(
        metric='euclidean',
        max_edge_length=10,
        homology_dimensions=homology_dimensions,
        n_jobs=-1)
    X_diagrams = persistence_diagram.fit_transform(X_windows)

    diagram_scaler = diag.Scaler()
    diagram_scaler.fit(X_diagrams)
    X_scaled = diagram_scaler.transform(X_diagrams)

    # persistence entropy is computed here but not returned
    persistent_entropy = diag.PersistenceEntropy()
    X_persistent_entropy = persistent_entropy.fit_transform(X_scaled)

    betti_curves = diag.BettiCurve()
    betti_curves.fit(X_scaled)
    X_betti_curves = betti_curves.transform(X_scaled)

    df_betti_list = []
    for i in homology_dimensions:
        df_betti_list.append(pd.DataFrame(X_betti_curves[:, i, :]))

    if return_betti_surface:
        return (X_scaled, df_betti_list, X_betti_curves)
    return (X_scaled, df_betti_list)
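# Illustrative invocation of tda_diagrams. The dataset id 554 is a
# placeholder, and the embedding/window values are not taken from the
# original experiments; fetching the dataset requires the openml-style
# get_dataset helper the function relies on.
X_scaled, betti_dfs = tda_diagrams(path=554,
                                   embedding_time_delay=1,
                                   embedding_dimension=5,
                                   window_width=40,
                                   window_stride=5)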
def varying_noise(n_steps, n_series, args_stable, args_aperiodic):
    # noise parameters
    min_noise = 0.0
    max_noise = 2.1
    step_size = 0.1
    std = 0.1

    # embedding and persistence parameters
    parameters_type = "fixed"
    embedding_dimension = 2
    embedding_time_delay = 3
    n_jobs = 1
    window_width = 121 - ((embedding_dimension - 1) * embedding_time_delay + 1)
    # window_stride = 1
    metric = "euclidean"
    max_edge_length = 10
    homology_dimensions = [0, 1]
    epsilon = 0.0

    steps = [
        (
            "embedding",
            ts.TakensEmbedding(
                parameters_type=parameters_type,
                dimension=embedding_dimension,
                time_delay=embedding_time_delay,
                n_jobs=n_jobs,
            ),
        ),
        ("window", ts.SlidingWindow(width=window_width, stride=1)),
        (
            "diagrams",
            hl.VietorisRipsPersistence(
                metric=metric,
                max_edge_length=max_edge_length,
                homology_dimensions=homology_dimensions,
                n_jobs=n_jobs,
            ),
        ),
        ("diagrams_scaler", diag.Scaler()),
        ("diagrams_filter", diag.Filtering(epsilon=epsilon)),
    ]
    pipeline = Pipeline(steps)

    # maximum number of repetitions per noise level (for confidence intervals)
    max_itr = 5

    # data frames to save performance; columns given as a list (not a set)
    # so that the column order is deterministic
    perf_train = pd.DataFrame(
        columns=["Score", "Type", "Mean Standard Deviation of Noise"]
    )
    perf_test = pd.DataFrame(
        columns=["Score", "Type", "Mean Standard Deviation of Noise"]
    )

    mb = master_bar(np.arange(min_noise, max_noise, step_size))
    for noise in mb:
        for _ in progress_bar(range(max_itr), parent=mb):
            mb.child.comment = "Repetitions per noise level"
            data = simulate_data(
                noise, std, n_steps, n_series, args_stable, args_aperiodic
            )
            # group data by type and series id
            grouped_data = data.groupby(["type", "series_id"])

            y_true = np.repeat([1, 0], n_series)
            id_train, id_test, y_train, y_test = train_test_split(
                range(2 * n_series), y_true, train_size=0.7, random_state=0
            )

            # classical k-means
            ###################################################################
            X = data["adults"].values.reshape((2 * n_series, -1))
            # train/test data
            X_train = X[id_train, :]
            X_test = X[id_test, :]
            # k-means
            kmeans = KMeans(n_clusters=2, random_state=0)
            kmeans.fit(X_train)
            perf_train = perf_train.append(
                {
                    "Score": homogeneity_score(y_train, kmeans.labels_),
                    "Type": "Classic",
                    "Mean Standard Deviation of Noise": noise,
                },
                ignore_index=True,
            )
            perf_test = perf_test.append(
                {
                    "Score": homogeneity_score(y_test, kmeans.predict(X_test)),
                    "Type": "Classic",
                    "Mean Standard Deviation of Noise": noise,
                },
                ignore_index=True,
            )

            # threshold to determine whether a hole is relevant or not
            frac = 0.7

            # TDA k-means
            features = []
            for name, _ in grouped_data:
                X_filtered = pipeline.fit_transform(
                    grouped_data.get_group(name)["adults"].values
                )
                n_windows, n_points, _ = X_filtered.shape
                features.append(
                    get_mean_lifetime(X_filtered, n_windows, n_points)
                    + get_n_rel_holes(X_filtered, n_windows, n_points, frac=frac)
                    + get_n_rel_holes(X_filtered, n_windows, n_points, frac=0.0)
                    + get_max_lifetime(X_filtered, n_windows, n_points)
                    + get_amplitude(X_filtered)
                )

            # define data matrix for k-means
            X_tda = np.array(features)
            X_tda_train = X_tda[id_train, :]
            X_tda_test = X_tda[id_test, :]
            # k-means
            kmeans_tda = KMeans(n_clusters=2, random_state=0)
            kmeans_tda.fit(X_tda_train)
            perf_train = perf_train.append(
                {
                    "Score": homogeneity_score(y_train, kmeans_tda.labels_),
                    "Type": "TDA",
                    "Mean Standard Deviation of Noise": noise,
                },
                ignore_index=True,
            )
            perf_test = perf_test.append(
                {
                    "Score": homogeneity_score(
                        y_test, kmeans_tda.predict(X_tda_test)
                    ),
                    "Type": "TDA",
                    "Mean Standard Deviation of Noise": noise,
                },
                ignore_index=True,
            )
        mb.first_bar.comment = "Noise level"

    # write performance metrics to disk
    with open("models/performance_metrics_train.pkl", "wb") as file:
        pickle.dump(perf_train, file)
    with open("models/performance_metrics_test.pkl", "wb") as file:
        pickle.dump(perf_test, file)