def get_steps():
    """Return the (name, transformer) steps of the TDA feature-extraction pipeline."""
    steps = [
        ('embedding', ts.TakensEmbedding()),
        ('window', ts.SlidingWindow(width=5, stride=1)),
        ('diagram', hl.VietorisRipsPersistence()),
        ('rescaler', diag.Scaler()),
        ('filter', diag.Filtering(epsilon=0.1)),
        ('entropy', diag.PersistenceEntropy()),
        ('scaling', skprep.MinMaxScaler(copy=True)),
    ]
    return steps
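# Usage sketch (illustrative, not from the original repo): assemble the steps
# from get_steps() into a pipeline and extract window-level persistence-entropy
# features from a toy univariate signal. Assumes the module-level aliases used
# above (ts, hl, diag, skprep) and the same Pipeline class are already imported;
# the toy signal is a placeholder.
import numpy as np

toy_signal = np.sin(np.arange(0, 50, 0.1))        # toy univariate time series
entropy_pipeline = Pipeline(get_steps())          # steps defined above
# One rescaled persistence-entropy vector per sliding window.
toy_features = entropy_pipeline.fit_transform(toy_signal)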
def fit(self, X, y=None):
    """Create a giotto :class:`Pipeline` object and fit it. Then, return
    the estimator.

    This method is there to implement the usual scikit-learn API and hence
    work in pipelines.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_points, n_dimensions)
        Input data. ``n_samples`` is the number of point clouds,
        ``n_points`` is the number of points per point cloud and
        ``n_dimensions`` is the number of features for each point of the
        point cloud (i.e. the dimension of the point cloud space).

    y : None
        Ignored.

    Returns
    -------
    self : object
        Returns self.

    """
    steps = [
        ('diagram', hl.VietorisRipsPersistence(
            metric=self.metric,
            max_edge_length=self.max_edge_length,
            homology_dimensions=self.homology_dimensions,
            n_jobs=self.n_jobs)),
        ('rescaler', diag.Scaler(
            metric=self.scaler_metric,
            metric_params=self.scaler_metric_params,
            function=self.function,
            n_jobs=self.n_jobs)),
        ('filter', diag.Filtering(
            epsilon=self.epsilon,
            homology_dimensions=self.homology_dimensions)),
        ('landscape', diag.PersistenceLandscape(
            n_values=self.n_values,
            n_layers=self.n_layers))]
    self._pipeline = Pipeline(steps).fit(X)
    return self
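# Illustrative sketch (not from the original source): fit above stores the
# fitted giotto Pipeline in self._pipeline, so a matching transform would
# typically just delegate to it. The method below is a hypothetical
# counterpart, not code from the repo.
def transform(self, X, y=None):
    """Map point clouds to persistence landscapes via the pipeline fitted in fit."""
    return self._pipeline.transform(X)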
def fit(self, X, y=None):
    """Create a giotto :class:`Pipeline` object and fit it. Then, return
    the estimator.

    This method is there to implement the usual scikit-learn API and hence
    work in pipelines.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_points, n_dimensions)
        Input data. ``n_samples`` is the number of point clouds,
        ``n_points`` is the number of points per point cloud and
        ``n_dimensions`` is the number of features for each point of the
        point cloud (i.e. the dimension of the point cloud space).

    y : None
        Ignored.

    Returns
    -------
    self : object

    """
    steps = [
        ('diagram', hl.VietorisRipsPersistence(
            metric=self.metric,
            max_edge_length=self.max_edge_length,
            homology_dimensions=self.homology_dimensions,
            n_jobs=self.n_jobs)),
        ('scaler', diag.Scaler(
            metric=self.scaler_metric,
            metric_params=self.scaler_metric_params,
            function=self.scaler_function,
            n_jobs=self.n_jobs)),
        ('filter', diag.Filtering(epsilon=self.filter_epsilon)),
        ('betticurve', diag.BettiCurve(n_values=self.n_values))]
    self._pipeline = Pipeline(steps).fit(X)
    return self
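# Hypothetical end-to-end usage (the class name BettiCurveFeaturizer is an
# assumption for illustration; only the fit method above is from the source):
# fit on a small batch of toy point clouds and read Betti curves off the
# internal pipeline.
import numpy as np

point_clouds = np.random.random((10, 100, 3))     # 10 toy point clouds in R^3
featurizer = BettiCurveFeaturizer().fit(point_clouds)
betti_curves = featurizer._pipeline.transform(point_clouds)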
def varying_noise(n_steps, n_series, args_stable, args_aperiodic):
    """Compare classical k-means with TDA-based k-means over a range of noise
    levels and pickle the homogeneity scores for the train and test splits."""
    # noise parameters
    min_noise = 0.0
    max_noise = 2.1
    step_size = 0.1
    std = 0.1

    parameters_type = "fixed"
    embedding_dimension = 2
    embedding_time_delay = 3
    n_jobs = 1
    window_width = 121 - ((embedding_dimension - 1) * embedding_time_delay + 1)
    # window_stride = 1
    metric = "euclidean"
    max_edge_length = 10
    homology_dimensions = [0, 1]
    epsilon = 0.0

    steps = [
        (
            "embedding",
            ts.TakensEmbedding(
                parameters_type=parameters_type,
                dimension=embedding_dimension,
                time_delay=embedding_time_delay,
                n_jobs=n_jobs,
            ),
        ),
        ("window", ts.SlidingWindow(width=window_width, stride=1)),
        (
            "diagrams",
            hl.VietorisRipsPersistence(
                metric=metric,
                max_edge_length=max_edge_length,
                homology_dimensions=homology_dimensions,
                n_jobs=n_jobs,
            ),
        ),
        ("diagrams_scaler", diag.Scaler()),
        ("diagrams_filter", diag.Filtering(epsilon=epsilon)),
    ]
    pipeline = Pipeline(steps)

    # maximal number of repetitions per noise level (for confidence intervals)
    max_itr = 5

    # data frames to save performance
    perf_train = pd.DataFrame(
        columns={"Score", "Type", "Mean Standard Deviation of Noise"}
    )
    perf_test = pd.DataFrame(
        columns={"Score", "Type", "Mean Standard Deviation of Noise"}
    )

    mb = master_bar(np.arange(min_noise, max_noise, step_size))
    for noise in mb:
        for _ in progress_bar(range(max_itr), parent=mb):
            mb.child.comment = "Repetitions per noise level"
            data = simulate_data(
                noise, std, n_steps, n_series, args_stable, args_aperiodic
            )
            # group data by type and series id
            grouped_data = data.groupby(["type", "series_id"])

            y_true = np.repeat([1, 0], n_series)
            id_train, id_test, y_train, y_test = train_test_split(
                range(2 * n_series), y_true, train_size=0.7, random_state=0
            )

            # classical k-means
            ###########################################################
            X = data["adults"].values.reshape((2 * n_series, -1))

            # train/test data
            X_train = X[id_train, :]
            X_test = X[id_test, :]

            # k-means
            kmeans = KMeans(n_clusters=2, random_state=0)
            kmeans.fit(X_train)

            perf_train = perf_train.append(
                {
                    "Score": homogeneity_score(y_train, kmeans.labels_),
                    "Type": "Classic",
                    "Mean Standard Deviation of Noise": noise,
                },
                ignore_index=True,
            )
            perf_test = perf_test.append(
                {
                    "Score": homogeneity_score(y_test, kmeans.predict(X_test)),
                    "Type": "Classic",
                    "Mean Standard Deviation of Noise": noise,
                },
                ignore_index=True,
            )

            # threshold to determine whether a hole is relevant or not
            frac = 0.7

            # TDA k-means
            features = []
            for name, _ in grouped_data:
                X_filtered = pipeline.fit_transform(
                    grouped_data.get_group(name)["adults"].values
                )
                n_windows, n_points, _ = X_filtered.shape
                features.append(
                    get_mean_lifetime(X_filtered, n_windows, n_points)
                    + get_n_rel_holes(X_filtered, n_windows, n_points, frac=frac)
                    + get_n_rel_holes(X_filtered, n_windows, n_points, frac=0.0)
                    + get_max_lifetime(X_filtered, n_windows, n_points)
                    + get_amplitude(X_filtered)
                )

            # define data matrix for k-means
            X_tda = np.array(features)
            X_tda_train = X_tda[id_train, :]
            X_tda_test = X_tda[id_test, :]

            # k-means
            kmeans_tda = KMeans(n_clusters=2, random_state=0)
            kmeans_tda.fit(X_tda_train)

            perf_train = perf_train.append(
                {
                    "Score": homogeneity_score(y_train, kmeans_tda.labels_),
                    "Type": "TDA",
                    "Mean Standard Deviation of Noise": noise,
                },
                ignore_index=True,
            )
            perf_test = perf_test.append(
                {
                    "Score": homogeneity_score(
                        y_test, kmeans_tda.predict(X_tda_test)
                    ),
                    "Type": "TDA",
                    "Mean Standard Deviation of Noise": noise,
                },
                ignore_index=True,
            )

        mb.first_bar.comment = "Noise level"

    # write performance metrics to disk
    with open("models/performance_metrics_train.pkl", "wb") as file:
        pickle.dump(perf_train, file)
    with open("models/performance_metrics_test.pkl", "wb") as file:
        pickle.dump(perf_test, file)