def euclidean_barycenter(X, weights=None):
    """Standard Euclidean barycenter computed from a set of time series.

    Parameters
    ----------
    X : array-like, shape=(n_ts, sz, d)
        Time series dataset.

    weights : None or array
        Weights of each X[i]. Must be the same size as len(X).

    Returns
    -------
    numpy.array of shape (sz, d)
        Barycenter of the provided time series dataset.

    Notes
    -----
    This method requires a dataset of equal-sized time series.

    Examples
    --------
    >>> time_series = [[1, 2, 3, 4], [1, 2, 4, 5]]
    >>> bar = euclidean_barycenter(time_series)
    >>> bar.shape
    (4, 1)
    >>> bar  # doctest: +ELLIPSIS
    array([[ 1. ],
           [ 2. ],
           [ 3.5],
           [ 4.5]])
    """
    X_ = to_time_series_dataset(X)
    weights = _set_weights(weights, X_.shape[0])
    return numpy.average(X_, axis=0, weights=weights)
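
# Usage sketch (hedged): a minimal demonstration of the `weights` parameter,
# which the doctest above does not cover. Data is made up for illustration;
# it assumes euclidean_barycenter and its helpers are in scope.
def _demo_weighted_euclidean_barycenter():
    import numpy
    time_series = [[1, 2, 3, 4], [1, 2, 4, 5]]
    # Weight the second series twice as heavily as the first.
    bar = euclidean_barycenter(time_series, weights=numpy.array([1., 2.]))
    print(bar.ravel())  # approximately [1., 2., 3.667, 4.667]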
def predict(self, X):
    """Predict class for a given set of time series.

    Parameters
    ----------
    X : array-like of shape=(n_ts, sz, d)
        Time series dataset.

    Returns
    -------
    array of shape=(n_ts, ) or (n_ts, n_classes), depending on the shape
    of the label vector provided at training time.
        Index of the class each sample belongs to, or class probability
        matrix, depending on what was provided at training time.
    """
    X_ = to_time_series_dataset(X)
    n_ts, sz, d = X_.shape
    categorical_preds = self.model.predict(
        [X_[:, :, di].reshape((n_ts, sz, 1)) for di in range(self.d)],
        batch_size=self.batch_size,
        verbose=self.verbose_level)
    if self.categorical_y:
        return categorical_preds
    else:
        if categorical_preds.shape[1] == 2:
            categorical_preds = categorical_preds[:, 0]
        return self.label_binarizer.inverse_transform(categorical_preds)
def fit(self, X, y):
    """Fit the model using X as training data and y as target values.

    Parameters
    ----------
    X : array-like, shape (n_ts, sz, d)
        Training data.

    y : array-like, shape (n_ts, ) or (n_ts, dim_y)
        Target values.

    Returns
    -------
    KNeighborsTimeSeriesRegressor
        The fitted estimator
    """
    if self.metric in TSLEARN_VALID_METRICS:
        self._ts_metric = self.metric
        self.metric = "precomputed"

    X = check_array(X,
                    allow_nd=True,
                    force_all_finite=(self.metric != "precomputed"))
    X = to_time_series_dataset(X)
    X = check_dims(X)
    if self.metric == "precomputed" and hasattr(self, '_ts_metric'):
        self._ts_fit = X
        self._d = X.shape[2]
        self._X_fit = numpy.zeros(
            (self._ts_fit.shape[0], self._ts_fit.shape[0]))
    else:
        self._X_fit, self._d = to_sklearn_dataset(X, return_dim=True)
    super().fit(self._X_fit, y)
    if hasattr(self, '_ts_metric'):
        self.metric = self._ts_metric
    return self
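
# Usage sketch (hedged): assumes the `fit` above belongs to tslearn's
# KNeighborsTimeSeriesRegressor and that the import path below is correct.
# Data and the regression target are fabricated for illustration.
def _demo_knn_regressor_fit():
    import numpy
    from tslearn.neighbors import KNeighborsTimeSeriesRegressor
    X = numpy.random.randn(20, 16, 1)                # 20 series of length 16
    y = X[:, -1, 0] + 0.1 * numpy.random.randn(20)   # toy regression target
    reg = KNeighborsTimeSeriesRegressor(n_neighbors=3, metric="dtw")
    reg.fit(X, y)   # "dtw" dispatches through the precomputed-distance path
    print(reg.predict(X[:2]))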
def fit(self, X):
    """Compute the barycenter from a dataset of time series.

    Parameters
    ----------
    X : array-like, shape=(n_ts, sz, d)
        Time series dataset.

    Returns
    -------
    numpy.array of shape (barycenter_size, d) or (sz, d) if
    barycenter_size is None
        DBA barycenter of the provided time series dataset.
    """
    X_ = to_time_series_dataset(X)
    if self.barycenter_size is None:
        self.barycenter_size = X_.shape[1]
    self.weights = _set_weights(self.weights, X_.shape[0])
    if self.init_barycenter is None:
        barycenter = self._init_avg(X_)
    else:
        barycenter = self.init_barycenter
    cost_prev, cost = numpy.inf, numpy.inf
    for it in range(self.max_iter):
        assign = self._petitjean_assignment(X_, barycenter)
        cost = self._petitjean_cost(X_, barycenter, assign)
        if self.verbose:
            print("[DBA] epoch %d, cost: %.3f" % (it + 1, cost))
        barycenter = self._petitjean_update_barycenter(X_, assign)
        if abs(cost_prev - cost) < self.tol:
            break
        elif cost_prev < cost:
            warnings.warn("DBA loss is increasing while it should not be.",
                          ConvergenceWarning)
        else:
            cost_prev = cost
    return barycenter
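
# Usage sketch (hedged): assumes the method above belongs to a
# DTWBarycenterAveraging-style class (as in early tslearn) whose constructor
# exposes the attributes the method reads (max_iter, tol, barycenter_size,
# weights, init_barycenter, verbose) and whose fit returns the barycenter.
# The import path is an assumption.
def _demo_dba_fit():
    from tslearn.barycenters import DTWBarycenterAveraging
    X = [[1., 2., 3., 4.], [1., 2., 2., 3., 4.]]  # variable lengths are fine
    dba = DTWBarycenterAveraging(max_iter=10, barycenter_size=4)
    bar = dba.fit(X)
    print(bar.shape)  # (4, 1): barycenter_size x d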
def _precompute_cross_dist(self, X, other_X=None):
    if other_X is None:
        other_X = self._ts_fit

    self._ts_metric = self.metric
    self.metric = "precomputed"

    metric_params = self._get_metric_params()

    X = check_array(X, allow_nd=True, force_all_finite=False)
    X = to_time_series_dataset(X)

    if self._ts_metric == "dtw":
        X_ = cdist_dtw(X, other_X, n_jobs=self.n_jobs, **metric_params)
    elif self._ts_metric == "softdtw":
        X_ = cdist_soft_dtw(X, other_X, **metric_params)
    elif self._ts_metric == "sax":
        X = self._sax_preprocess(X, **metric_params)
        X_ = cdist_sax(X, self._sax.breakpoints_avg_,
                       self._sax.size_fitted_, other_X,
                       n_jobs=self.n_jobs)
    else:
        raise ValueError("Invalid metric recorded: %s" % self._ts_metric)

    return X_
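
# Hedged illustration of the "dtw" branch above: the precomputed
# cross-distance matrix is what tslearn.metrics.cdist_dtw returns, here
# called directly on toy data.
def _demo_precomputed_dtw_matrix():
    from tslearn.metrics import cdist_dtw
    X = [[1., 2., 3.], [1., 2., 3., 4.]]  # ragged input is allowed for DTW
    D = cdist_dtw(X, X)
    print(D)  # shape (2, 2), zeros on the diagonal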
def predict_proba(self, X):
    """Predict the class probabilities for the provided data.

    Parameters
    ----------
    X : array-like, shape (n_ts, sz, d)
        Test samples.

    Returns
    -------
    array, shape = (n_ts, n_classes)
        Array of predicted class probabilities
    """
    if self.metric in TSLEARN_VALID_METRICS:
        check_is_fitted(self, '_ts_fit')
        X = check_dims(X, X_fit_dims=self._ts_fit.shape, extend=True,
                       check_n_features_only=True)
        X_ = self._precompute_cross_dist(X)
        pred = super().predict_proba(X_)
        self.metric = self._ts_metric
        return pred
    else:
        check_is_fitted(self, '_X_fit')
        X = check_array(X, allow_nd=True)
        X = to_time_series_dataset(X)
        X_ = to_sklearn_dataset(X)
        X_ = check_dims(X_, X_fit_dims=self._X_fit.shape, extend=False)
        return super().predict_proba(X_)
def fit(self, X, y=None):
    """Fit the model using X as training data.

    Parameters
    ----------
    X : array-like, shape (n_ts, sz, d)
        Training data.
    """
    if self.metric in TSLEARN_VALID_METRICS:
        self._ts_metric = self.metric
        self.metric = "precomputed"

    X = check_array(X,
                    allow_nd=True,
                    force_all_finite=(self.metric != "precomputed"))
    X = to_time_series_dataset(X)
    X = check_dims(X)
    if self.metric == "precomputed" and hasattr(self, '_ts_metric'):
        self._ts_fit = X
        self._d = X.shape[2]
        self._X_fit = numpy.zeros(
            (self._ts_fit.shape[0], self._ts_fit.shape[0]))
    else:
        self._X_fit, self._d = to_sklearn_dataset(X, return_dim=True)
    super().fit(self._X_fit, y)
    if hasattr(self, '_ts_metric'):
        self.metric = self._ts_metric
    return self
def do_kmeans(days, km_size):
    """From a time series dataset (a DataFrame of days), create km_size
    clusters using the k-means algorithm.

    Parameters
    ----------
    days : pandas.DataFrame
        Time series to cluster, with one row per observation and columns
        `n_day_` (day identifier) and `val_` (value).
    km_size : int
        Number of clusters needed.

    Returns
    -------
    km : TimeSeriesKMeans
        Fitted k-means object; it contains info about the algorithm.
    y_pred : array
        Result of the clustering: one cluster index per day.
    """
    # Arrange data for our lib
    unq = days["n_day_"].unique()
    values = [days[days["n_day_"] == day]["val_"].values for day in unq]
    formatted_dataset = to_time_series_dataset(values)

    # Configure our k-means
    km = TimeSeriesKMeans(n_clusters=km_size, metric="euclidean",
                          random_state=42, verbose=False)
    y_pred = km.fit_predict(formatted_dataset)
    return km, y_pred
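
# Usage sketch (hedged) with fabricated data: builds a toy `days` frame with
# the n_day_/val_ columns the helper expects.
def _demo_do_kmeans():
    import numpy as np
    import pandas as pd
    rng = np.random.default_rng(0)
    days = pd.DataFrame({
        "n_day_": np.repeat(np.arange(6), 24),  # 6 days, 24 samples each
        "val_": rng.normal(size=6 * 24),
    })
    km, y_pred = do_kmeans(days, km_size=2)
    print(y_pred)  # one cluster index per day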
def cdist_gak(dataset1, dataset2=None, sigma=1.):
    """Compute cross-similarity matrix using Global Alignment kernel (GAK).

    GAK was originally presented in [1]_.

    Parameters
    ----------
    dataset1
        A dataset of time series
    dataset2
        Another dataset of time series
    sigma : float (default 1.)
        Bandwidth of the internal gaussian kernel used for GAK

    Returns
    -------
    numpy.ndarray
        Cross-similarity matrix

    Examples
    --------
    >>> cdist_gak([[1, 2, 2, 3], [1., 2., 3., 4.]],
    ...           sigma=2.)  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    array([[ 1. , 0.656...],
           [ 0.656..., 1. ]])
    >>> cdist_gak([[1, 2, 2], [1., 2., 3., 4.]],
    ...           [[1, 2, 2, 3], [1., 2., 3., 4.]],
    ...           sigma=2.)  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    array([[ 0.710..., 0.297...],
           [ 0.656..., 1. ]])

    See Also
    --------
    gak : Compute Global Alignment kernel

    References
    ----------
    .. [1] M. Cuturi, "Fast global alignment kernels," ICML 2011.
    """
    dataset1 = to_time_series_dataset(dataset1)
    self_similarity = False
    if dataset2 is None:
        dataset2 = dataset1
        self_similarity = True
    else:
        dataset2 = to_time_series_dataset(dataset2)
    return cycdist_normalized_gak(dataset1, dataset2, sigma,
                                  self_similarity=self_similarity)
def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
    """Finds the K-neighbors of a point.

    Returns indices of and distances to the neighbors of each point.

    Parameters
    ----------
    X : array-like, shape (n_ts, sz, d)
        The query time series. If not provided, neighbors of each indexed
        point are returned. In this case, the query point is not
        considered its own neighbor.
    n_neighbors : int
        Number of neighbors to get (default is the value passed to the
        constructor).
    return_distance : boolean, optional. Defaults to True.
        If False, distances will not be returned.

    Returns
    -------
    dist : array
        Array representing the distance to points, only present if
        return_distance=True
    ind : array
        Indices of the nearest points in the population matrix.
    """
    self_neighbors = False
    if n_neighbors is None:
        n_neighbors = self.n_neighbors
    if X is None:
        X = self._fit_X
        self_neighbors = True
    else:
        X = to_time_series_dataset(X)
    if self.metric == "dtw":
        cdist_fun = cdist_dtw
    elif self.metric in ["euclidean", "sqeuclidean", "cityblock"]:
        def cdist_fun(X, Xp):
            return scipy_cdist(X.reshape((X.shape[0], -1)),
                               Xp.reshape((Xp.shape[0], -1)),
                               metric=self.metric)
    else:
        raise ValueError("Unrecognized time series metric string: %s "
                         "(should be one of 'dtw', 'euclidean', "
                         "'sqeuclidean' or 'cityblock')" % self.metric)

    full_dist_matrix = cdist_fun(X, self._fit_X)
    ind = numpy.argsort(full_dist_matrix, axis=1)
    if self_neighbors:
        ind = ind[:, 1:]
    if n_neighbors > full_dist_matrix.shape[1]:
        n_neighbors = full_dist_matrix.shape[1]
    ind = ind[:, :n_neighbors]

    n_ts = X.shape[0]
    sample_range = numpy.arange(n_ts)[:, None]
    dist = full_dist_matrix[sample_range, ind]

    if return_distance:
        return dist, ind
    else:
        return ind
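
# Usage sketch (hedged): assumes this `kneighbors` belongs to tslearn's
# unsupervised KNeighborsTimeSeries estimator; data is random toy input.
def _demo_kneighbors():
    import numpy
    from tslearn.neighbors import KNeighborsTimeSeries
    X = numpy.random.randn(10, 32, 1)
    knn = KNeighborsTimeSeries(n_neighbors=2, metric="dtw").fit(X)
    dist, ind = knn.kneighbors(X[:3])  # 2 nearest training series each
    print(dist.shape, ind.shape)       # (3, 2) (3, 2)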
def run():
    parser = cli_parser()
    args = parser.parse_args()
    nii = image.index_img(args.input, slice(0, 30))
    masker = input_data.NiftiMasker()
    data = masker.fit_transform(nii)
    # Fit on a subsample of voxel time series to keep DTW k-means tractable.
    ds = to_time_series_dataset(data.T[::80, :])
    model = TimeSeriesKMeans(n_clusters=2, metric="dtw", max_iter=15)
    model.fit(ds)
    # Predict cluster labels for every voxel time series.
    full_ds = to_time_series_dataset(data.T)
    mask = model.predict(full_ds)
    mask_nii = masker.inverse_transform(mask)
    mask_nii.to_filename(args.output)
def fit(self, X, y, sample_weight=None):
    sklearn_X = _prepare_ts_datasets_sklearn(X)
    if self.kernel == "gak" and self.gamma == "auto":
        self.gamma = gamma_soft_dtw(to_time_series_dataset(X))
    # Fit with a GAK kernel computed over all samples, then restrict the
    # kernel to the support vectors for prediction-time efficiency.
    self.kernel = _sparse_kernel_func_gak(sz=self.sz, d=self.d,
                                          gamma=self.gamma)
    super(TimeSeriesSVC, self).fit(sklearn_X, y,
                                   sample_weight=sample_weight)
    self.kernel = _sparse_kernel_func_gak(
        sz=self.sz, d=self.d, gamma=self.gamma,
        slice_support_vectors=self.support_)
    return self
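
# Usage sketch (hedged): assumes the surrounding class is tslearn's
# TimeSeriesSVC with a GAK kernel, as the code above suggests. Toy data.
def _demo_svc_fit():
    import numpy
    from tslearn.svm import TimeSeriesSVC
    X = numpy.random.randn(20, 24, 1)
    y = numpy.array([0] * 10 + [1] * 10)
    clf = TimeSeriesSVC(kernel="gak", gamma="auto")
    clf.fit(X, y)
    print(clf.predict(X[:4]))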
def test_single_value_ts_no_nan():
    X = to_time_series_dataset([[1, 1, 1, 1]])

    standard_scaler = TimeSeriesScalerMeanVariance()
    assert np.sum(np.isnan(standard_scaler.fit_transform(X))) == 0

    minmax_scaler = TimeSeriesScalerMinMax()
    assert np.sum(np.isnan(minmax_scaler.fit_transform(X))) == 0
def support_vectors_time_series_(self, X):
    X_ = to_time_series_dataset(X)
    sv = []
    idx_start = 0
    # self.support_ stores support-vector indices grouped by class;
    # slice out each class's block using the per-class counts.
    for cl in range(len(self.n_support_)):
        indices = self.support_[idx_start:idx_start + self.n_support_[cl]]
        sv.append(X_[indices])
        idx_start += self.n_support_[cl]
    return sv
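
# Usage sketch (hedged): assumes the method above is attached to a
# tslearn-style TimeSeriesSVC; retrieves per-class support vectors after
# fitting on toy data.
def _demo_support_vectors():
    import numpy
    from tslearn.svm import TimeSeriesSVC
    X = numpy.random.randn(20, 24, 1)
    y = numpy.array([0] * 10 + [1] * 10)
    clf = TimeSeriesSVC(kernel="gak").fit(X, y)
    sv = clf.support_vectors_time_series_(X)  # one array of series per class
    print([s.shape for s in sv])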
def _sax_preprocess(self, X, n_segments=10, alphabet_size_avg=4):
    # Now SAX-transform the time series
    if not hasattr(self, '_sax') or self._sax is None:
        self._sax = SymbolicAggregateApproximation(
            n_segments=n_segments, alphabet_size_avg=alphabet_size_avg)
    X = to_time_series_dataset(X)
    X = self._sax.fit_transform(X)
    return X
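
# Hedged illustration of the SAX transform this helper wraps, calling
# tslearn.piecewise.SymbolicAggregateApproximation directly on toy data.
def _demo_sax_transform():
    import numpy
    from tslearn.piecewise import SymbolicAggregateApproximation
    X = numpy.random.randn(5, 100, 1)
    sax = SymbolicAggregateApproximation(n_segments=10, alphabet_size_avg=4)
    X_sax = sax.fit_transform(X)
    print(X_sax.shape)  # (5, 10, 1): one integer symbol per segment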
def predict_proba(self, X):
    """Predict the class probabilities for the provided data.

    Parameters
    ----------
    X : array-like, shape (n_ts, sz, d)
        Test samples.
    """
    if self.metric in VARIABLE_LENGTH_METRICS:
        self._ts_metric = self.metric
        self.metric = "precomputed"

        if self.metric_params is None:
            metric_params = {}
        else:
            metric_params = self.metric_params.copy()
            if "n_jobs" in metric_params.keys():
                del metric_params["n_jobs"]
        check_is_fitted(self, '_ts_fit')
        X = check_array(X, allow_nd=True, force_all_finite=False)
        X = to_time_series_dataset(X)
        if self._ts_metric == "dtw":
            X_ = cdist_dtw(X, self._ts_fit, n_jobs=self.n_jobs,
                           **metric_params)
        elif self._ts_metric == "softdtw":
            X_ = cdist_soft_dtw(X, self._ts_fit, **metric_params)
        else:
            raise ValueError("Invalid metric recorded: %s" %
                             self._ts_metric)
        pred = super(KNeighborsTimeSeriesClassifier,
                     self).predict_proba(X_)
        self.metric = self._ts_metric
        return pred
    else:
        check_is_fitted(self, '_X_fit')
        X = check_array(X, allow_nd=True)
        X = to_time_series_dataset(X)
        X_ = to_sklearn_dataset(X)
        X_ = check_dims(X_, self._X_fit, extend=False)
        return super(KNeighborsTimeSeriesClassifier,
                     self).predict_proba(X_)
def fit(self, X, y=None):
    """Fit the model using X as training data.

    Parameters
    ----------
    X : array-like, shape (n_ts, sz, d)
        Training data.
    """
    self._fit_X = to_time_series_dataset(X)
    return self
def dataImport(name):
    if not os.path.exists("../Classifier/TimeSeriesFiles/" + name):
        url = ("http://www.timeseriesclassification.com/Downloads/%s.zip"
               % name)
        extract_from_zip_url(url,
                             "../Classifier/TimeSeriesFiles/" + name + "/",
                             verbose=False)
    data_train = numpy.loadtxt("../Classifier/TimeSeriesFiles/" + name +
                               "/" + name + "_TRAIN.txt")
    data_test = numpy.loadtxt("../Classifier/TimeSeriesFiles/" + name +
                              "/" + name + "_TEST.txt")
    # First column holds the class label; the rest is the series itself.
    X_train = to_time_series_dataset(data_train[:, 1:])
    y_train = data_train[:, 0].astype(int)
    X_test = to_time_series_dataset(data_test[:, 1:])
    y_test = data_test[:, 0].astype(int)
    X_train = TimeSeriesScalerMinMax().fit_transform(X_train)
    X_test = TimeSeriesScalerMinMax().fit_transform(X_test)
    return X_train, y_train, X_test, y_test
def clustering(df, n_cluster: int = 2, metric: str = 'softdtw',
               init='k-means++', random_state=1234, verbose=False,
               n_init=1):
    tsk = TimeSeriesKMeans(n_clusters=n_cluster, metric=metric, init=init,
                           random_state=random_state, verbose=verbose,
                           n_init=n_init)
    # Shift the time axis back by 6 steps (values wrap around), then treat
    # each column of df as one series.
    df = np.roll(df, -6, axis=0)
    M = to_time_series_dataset(df.T)
    cluster_labels = tsk.fit_predict(M)
    return cluster_labels
def test_variable_length_knn():
    X = to_time_series_dataset([[1, 2, 3, 4],
                                [1, 2, 3],
                                [2, 5, 6, 7, 8, 9],
                                [3, 5, 6, 7, 8]])
    y = [0, 0, 1, 1]

    clf = KNeighborsTimeSeriesClassifier(metric="dtw", n_neighbors=1)
    clf.fit(X, y)
    assert_allclose(clf.predict(X), [0, 0, 1, 1])

    clf = KNeighborsTimeSeriesClassifier(metric="softdtw", n_neighbors=1)
    clf.fit(X, y)
    assert_allclose(clf.predict(X), [0, 0, 1, 1])
def fit(self, X, y=None):
    """Fit the model using X as training data.

    Parameters
    ----------
    X : array-like, shape (n_ts, sz, d)
        Training data.
    """
    X = check_array(X, allow_nd=True)
    self._X_fit = to_time_series_dataset(X)
    return self
def fit(self, X, y=None):
    """Compute k-Shape clustering.

    Parameters
    ----------
    X : array-like of shape=(n_ts, sz, d)
        Time series dataset.

    y
        Ignored
    """
    X = check_array(X, allow_nd=True)

    max_attempts = max(self.n_init, 10)

    self.labels_ = None
    self.inertia_ = numpy.inf
    self.cluster_centers_ = None
    self.norms_ = 0.
    self.norms_centroids_ = 0.
    self.n_iter_ = 0

    X_ = to_time_series_dataset(X)
    self._X_fit = X_
    self.norms_ = numpy.linalg.norm(X_, axis=(1, 2))

    _check_initial_guess(self.init, self.n_clusters)

    rs = check_random_state(self.random_state)

    best_correct_centroids = None
    min_inertia = numpy.inf
    n_successful = 0
    n_attempts = 0
    while n_successful < self.n_init and n_attempts < max_attempts:
        try:
            if self.verbose and self.n_init > 1:
                print("Init %d" % (n_successful + 1))
            n_attempts += 1
            self._fit_one_init(X_, rs)
            if self.inertia_ < min_inertia:
                best_correct_centroids = self.cluster_centers_.copy()
                min_inertia = self.inertia_
                self.n_iter_ = self._iter
            n_successful += 1
        except EmptyClusterError:
            if self.verbose:
                print("Resumed because of empty cluster")
    self.norms_centroids_ = numpy.linalg.norm(self.cluster_centers_,
                                              axis=(1, 2))
    self._post_fit(X_, best_correct_centroids, min_inertia)
    return self
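
# Usage sketch (hedged): assumes the surrounding class is tslearn's KShape.
# k-Shape relies on normalized cross-correlation, so series should be
# z-normalized and of equal length.
def _demo_kshape_fit():
    import numpy
    from tslearn.clustering import KShape
    from tslearn.preprocessing import TimeSeriesScalerMeanVariance
    X = numpy.random.randn(20, 32, 1)
    X = TimeSeriesScalerMeanVariance().fit_transform(X)
    ks = KShape(n_clusters=2, n_init=2, random_state=0).fit(X)
    print(ks.labels_[:5], ks.inertia_)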
def cluster_examples(
    train: pd.DataFrame,
    test: pd.DataFrame,
    dataset_name: str,
    nclusters: int = 5,
    n_examples: int = 3,
):
    """Explore: cluster series and look at examples from each cluster."""
    clusterer = TimeSeriesKMeans(n_clusters=nclusters)
    group_cols = [train.columns[c] for c in grouping_cols[dataset_name]]
    train_groups = train.groupby(group_cols)
    test_groups = test.groupby(group_cols)
    max_l = max(
        len(trg) + len(teg)
        for (_, trg), (_, teg) in zip(train_groups, test_groups)
    )
    timeseries = []
    keys = []
    for (group_name, train_group), (_, test_group) in zip(train_groups,
                                                          test_groups):
        t_values = train_group.iloc[:, target_cols[dataset_name]].astype(float)
        # pd.concat replaces the Series.append call removed in pandas 2.0.
        t_values = pd.concat(
            [t_values,
             test_group.iloc[:, target_cols[dataset_name]].astype(float)]
        )
        # Pad to a common length with NaN, then interpolate the padding.
        t_padded = pd.concat(
            [t_values, pd.Series([np.nan] * (max_l - t_values.shape[0]))],
            ignore_index=True,
        )
        t_padded = t_padded.interpolate()
        assert len(t_padded) == max_l
        timeseries.append(t_padded)
        keys.append(group_name)
    timeseries_dataset = ts_utils.to_time_series_dataset(timeseries)
    clusters = clusterer.fit_predict(timeseries_dataset)
    plot_hist(clusters, "Distribution of Clusters")
    for i in range(nclusters):
        print(f"Looking at examples from cluster {i}")
        idxs = np.where(clusters == i)[0]
        examples = np.random.choice(idxs, size=n_examples, replace=False)
        for j, ex in enumerate(examples):
            query_list = [
                f'{grp_col}=="{key}"'
                for grp_col, key in zip(group_cols, keys[ex])
            ]
            values = train.query(" & ".join(query_list)).iloc[
                :, target_cols[dataset_name]
            ]
            # values = pd.concat(
            #     [values,
            #      test.query(' & '.join(query_list)).iloc[
            #          :, target_cols[dataset_name]]]
            # )
            plot_ts(values, f"Example {j} of cluster {i}")
def apply_clustering(days, clust):
    """Apply a previously fitted clustering model to the given dataset."""
    # Arrange data for our lib
    unq = days["n_day_"].unique()
    values = [days[days["n_day_"] == day]["val_"].values for day in unq]
    formatted_dataset = to_time_series_dataset(values)

    y_pred = clust.predict(formatted_dataset)
    return y_pred
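
# Usage sketch (hedged): fit with do_kmeans on one toy dataset, then reuse
# the fitted model on new days via apply_clustering. Both helpers are
# assumed to be the ones defined above; data is fabricated.
def _demo_apply_clustering():
    import numpy as np
    import pandas as pd

    def make_days(n, seed):
        rng = np.random.default_rng(seed)
        return pd.DataFrame({
            "n_day_": np.repeat(np.arange(n), 24),  # n days, 24 samples each
            "val_": rng.normal(size=n * 24),
        })

    km, _ = do_kmeans(make_days(6, seed=0), km_size=2)
    y_new = apply_clustering(make_days(3, seed=1), km)  # labels for new days
    print(y_new)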
def fit(self, X, y):
    """Fit the model using X as training data and y as target values.

    Parameters
    ----------
    X : array-like, shape (n_ts, sz, d)
        Training data.

    y : array-like, shape (n_ts, )
        Target values.
    """
    self._fit_X = to_time_series_dataset(X)
    self._fit_y = numpy.array(y)
    return self
def _prepare_ts_datasets_sklearn(X):
    """Prepare time series datasets for sklearn.

    Examples
    --------
    >>> X = to_time_series_dataset([[1, 2, 3], [2, 2, 3]])
    >>> _prepare_ts_datasets_sklearn(X).shape
    (2, 3)
    """
    sklearn_X = to_time_series_dataset(X)
    n_ts, sz, d = sklearn_X.shape
    return sklearn_X.reshape((n_ts, -1))
def load_dataset(self, dataset_name):
    """Load a dataset from the UCR/UEA archive from its name.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset. Should be in the list returned by
        `list_datasets`

    Returns
    -------
    numpy.ndarray of shape (n_ts_train, sz, d) or None
        Training time series. None if unsuccessful.
    numpy.ndarray of integers with shape (n_ts_train, ) or None
        Training labels. None if unsuccessful.
    numpy.ndarray of shape (n_ts_test, sz, d) or None
        Test time series. None if unsuccessful.
    numpy.ndarray of integers with shape (n_ts_test, ) or None
        Test labels. None if unsuccessful.

    Examples
    --------
    >>> X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset(
    ...     "TwoPatterns")
    >>> print(X_train.shape)
    (1000, 128, 1)
    >>> print(y_train.shape)
    (1000,)
    """
    full_path = os.path.join(self._data_dir, dataset_name)
    if os.path.isdir(full_path) and dataset_name not in self._ignore_list:
        fname = self._filenames.get(dataset_name, dataset_name)
        if os.path.exists(os.path.join(full_path, fname + "_TRAIN.txt")):
            fname_train = fname + "_TRAIN.txt"
            fname_test = fname + "_TEST.txt"
            data_train = numpy.loadtxt(os.path.join(full_path, fname_train),
                                       delimiter=",")
            data_test = numpy.loadtxt(os.path.join(full_path, fname_test),
                                      delimiter=",")
            X_train = to_time_series_dataset(data_train[:, 1:])
            y_train = data_train[:, 0].astype(int)
            X_test = to_time_series_dataset(data_test[:, 1:])
            y_test = data_test[:, 0].astype(int)
            return X_train, y_train, X_test, y_test
    return None, None, None, None
def convert_mus_data_to_time_series(ts_dict):
    # Pad each muscle channel to equal length, then replace the NaN padding
    # with zeros.
    for mus in (MUS1, MUS2, MUS3, MUS4, MUS5, MUS6):
        ts_dict[mus] = np.nan_to_num(to_time_series_dataset(ts_dict[mus]))
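
# Hedged illustration of the two-step conversion above on toy ragged data:
# to_time_series_dataset pads to equal length with NaN, and nan_to_num
# replaces that padding with zeros.
def _demo_pad_and_zero_fill():
    import numpy as np
    ragged = [[1., 2., 3.], [4., 5.]]
    padded = to_time_series_dataset(ragged)  # shape (2, 3, 1), NaN-padded
    print(np.nan_to_num(padded)[1].ravel())  # [4. 5. 0.]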
def fit(self, X, y):
    """Learn time-series shapelets.

    Parameters
    ----------
    X : array-like of shape=(n_ts, sz, d)
        Time series dataset.
    y : array-like of shape=(n_ts, )
        Time series labels.
    """
    if self.verbose_level is not None:
        warnings.warn(
            "'verbose_level' is deprecated in version 0.2 and will be "
            "removed in 0.4. Use 'verbose' instead.",
            DeprecationWarning, stacklevel=2)
        self.verbose = self.verbose_level

    X, y = check_X_y(X, y, allow_nd=True)
    X = to_time_series_dataset(X)
    X = check_dims(X)

    set_random_seed(seed=self.random_state)
    numpy.random.seed(seed=self.random_state)

    n_ts, sz, d = X.shape
    self._X_fit_dims = X.shape

    self.model_ = None
    self.transformer_model_ = None
    self.locator_model_ = None

    self.d_ = d

    y_ = self._preprocess_labels(y)
    n_labels = len(self.classes_)

    if self.n_shapelets_per_size is None:
        sizes = grabocka_params_to_shapelet_size_dict(n_ts, sz, n_labels,
                                                      self.shapelet_length,
                                                      self.total_lengths)
        self.n_shapelets_per_size_ = sizes
    else:
        self.n_shapelets_per_size_ = self.n_shapelets_per_size

    self._set_model_layers(X=X, ts_sz=sz, d=d, n_classes=n_labels)
    self._set_weights_false_conv(d=d)
    self.model_.fit(
        [X[:, :, di].reshape((n_ts, sz, 1)) for di in range(d)],
        y_,
        batch_size=self.batch_size,
        epochs=self.max_iter,
        verbose=self.verbose
    )
    self.n_iter_ = len(self.model_.history.history)
    return self
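
# Usage sketch (hedged): assumes the surrounding class is tslearn's shapelet
# learner (ShapeletModel in the 0.2-era API this code resembles); the
# constructor arguments below are assumptions, and a keras/tensorflow
# backend is required.
def _demo_shapelet_fit():
    import numpy
    from tslearn.shapelets import ShapeletModel
    X = numpy.random.randn(30, 64, 1)
    y = numpy.array([0] * 15 + [1] * 15)
    clf = ShapeletModel(n_shapelets_per_size={8: 4}, max_iter=50,
                        random_state=0)
    clf.fit(X, y)  # learns 4 shapelets of length 8
    print(clf.predict(X[:3]))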
def sigma_gak(dataset, n_samples=100, random_state=None):
    r"""Compute sigma value to be used for GAK.

    This method was originally presented in [1]_.

    Parameters
    ----------
    dataset
        A dataset of time series
    n_samples : int (default: 100)
        Number of samples on which median distance should be estimated
    random_state : integer or numpy.RandomState or None (default: None)
        The generator used to draw the samples. If an integer is given, it
        fixes the seed. Defaults to the global numpy random number
        generator.

    Returns
    -------
    float
        Suggested bandwidth (:math:`\sigma`) for the Global Alignment
        kernel

    Examples
    --------
    >>> dataset = [[1, 2, 2, 3], [1., 2., 3., 4.]]
    >>> sigma_gak(dataset=dataset,
    ...           n_samples=200,
    ...           random_state=0)  # doctest: +ELLIPSIS
    2.0...

    See Also
    --------
    gak : Compute Global Alignment kernel
    cdist_gak : Compute cross-similarity matrix using Global Alignment
        kernel

    References
    ----------
    .. [1] M. Cuturi, "Fast global alignment kernels," ICML 2011.
    """
    random_state = check_random_state(random_state)
    dataset = to_time_series_dataset(dataset)
    n_ts, sz, d = dataset.shape
    if not check_equal_size(dataset):
        sz = numpy.min([ts_size(ts) for ts in dataset])
    if n_ts * sz < n_samples:
        replace = True
    else:
        replace = False
    sample_indices = random_state.choice(n_ts * sz,
                                         size=n_samples,
                                         replace=replace)
    dists = pdist(dataset[:, :sz, :].reshape((-1, d))[sample_indices],
                  metric="euclidean")
    return numpy.median(dists) * numpy.sqrt(sz)