def test_kmeans_univariate(self):
    """Fit KMeans from explicit initial centroids on four univariate curves.

    Verifies the predicted labels, the distances returned by ``transform``,
    the fitted cluster centers, the score and the number of iterations.
    """
    grid = [0, 2, 4, 6, 8, 10]
    curves = [
        [1, 1, 2, 3, 2.5, 2],
        [0.5, 0.5, 1, 2, 1.5, 1],
        [-1, -1, -0.5, 1, 1, 0.5],
        [-0.5, -0.5, -0.5, -1, -1, -1],
    ]
    fd = FDataGrid(curves, grid)

    # Two starting centroids sampled on the same grid as the data.
    start = np.array([
        [0, 0, 0, 0, 0, 0],
        [2, 1, -1, 0.5, 0, -0.5],
    ])
    kmeans = KMeans(init=FDataGrid(start, grid))
    kmeans.fit(fd)

    expected_labels = np.array([0, 0, 0, 1])
    np.testing.assert_array_equal(kmeans.predict(fd), expected_labels)

    expected_distances = np.array([
        [2.98142397, 9.23534876],
        [0.68718427, 6.50960828],
        [3.31243449, 4.39222798],
        [6.49679408, 0.],
    ])
    np.testing.assert_allclose(kmeans.transform(fd), expected_distances)

    # Wrapped in an FDataGrid so the expected matrix gets the same layout as
    # ``cluster_centers_.data_matrix``.
    expected_centers = FDataGrid(
        data_matrix=np.array([
            [0.16666667, 0.16666667, 0.83333333, 2., 1.66666667, 1.16666667],
            [-0.5, -0.5, -0.5, -1., -1., -1.],
        ]),
        sample_points=grid,
    )
    np.testing.assert_array_almost_equal(
        kmeans.cluster_centers_.data_matrix,
        expected_centers.data_matrix,
    )

    np.testing.assert_allclose(kmeans.score(fd), np.array([-20.33333333]))
    np.testing.assert_array_equal(kmeans.n_iter_, np.array([3.]))
def test_fuzzy_kmeans_univariate(self):
    """Fit a default FuzzyKMeans on four univariate curves.

    Verifies the membership degrees returned by ``predict``, the distances
    returned by ``transform``, the fitted centers, the score and the number
    of iterations.
    """
    grid = [0, 2, 4, 6, 8, 10]
    curves = [
        [1, 1, 2, 3, 2.5, 2],
        [0.5, 0.5, 1, 2, 1.5, 1],
        [-1, -1, -0.5, 1, 1, 0.5],
        [-0.5, -0.5, -0.5, -1, -1, -1],
    ]
    fd = FDataGrid(curves, grid)

    model = FuzzyKMeans()
    model.fit(fd)

    expected_membership = np.array([
        [0.965, 0.035],
        [0.94, 0.06],
        [0.227, 0.773],
        [0.049, 0.951],
    ])
    np.testing.assert_array_equal(model.predict(fd), expected_membership)

    expected_distances = np.array([
        [1.49228858, 7.87898791],
        [1.29380155, 5.12696975],
        [4.85542339, 2.63309793],
        [7.77455633, 1.75920889],
    ])
    np.testing.assert_allclose(model.transform(fd), expected_distances)

    # Wrapped in an FDataGrid so the expected matrix gets the same layout as
    # ``cluster_centers_.data_matrix``.
    expected_centers = FDataGrid(
        data_matrix=np.array([
            [0.7065078, 0.7065078, 1.45508111,
             2.46698825, 1.98143302, 1.48206743],
            [-0.69456401, -0.69456401, -0.49444239,
             -0.19713489, -0.19872214, -0.39844583],
        ]),
        sample_points=grid,
    )
    np.testing.assert_allclose(
        model.cluster_centers_.data_matrix,
        expected_centers.data_matrix,
    )

    np.testing.assert_allclose(
        model.score(fd), np.array([-13.928868250627902]))
    np.testing.assert_array_equal(model.n_iter_, np.array([18.]))
def test_concatenate(self):
    """Concatenate two single-sample Fourier expansions.

    The result must hold both samples, stay scalar-valued on a
    one-dimensional domain, and stack the coefficients in input order.
    """
    first_values = np.arange(0, 10)
    second_values = np.arange(10, 20)

    first = FDataGrid([first_values]).to_basis(Fourier(n_basis=5))
    second = FDataGrid([second_values]).to_basis(Fourier(n_basis=5))

    joined = concatenate([first, second])

    np.testing.assert_equal(joined.n_samples, 2)
    np.testing.assert_equal(joined.dim_codomain, 1)
    np.testing.assert_equal(joined.dim_domain, 1)

    stacked = np.concatenate([first.coefficients, second.coefficients])
    np.testing.assert_array_equal(joined.coefficients, stacked)
def test_fuzzy_kmeans_univariate(self):
    """Fit a default FuzzyCMeans on four univariate curves.

    Memberships, distances and centers are compared after rounding to three
    decimals; the score and the iteration count are checked as returned.
    """
    grid = [0, 2, 4, 6, 8, 10]
    curves = [
        [1, 1, 2, 3, 2.5, 2],
        [0.5, 0.5, 1, 2, 1.5, 1],
        [-1, -1, -0.5, 1, 1, 0.5],
        [-0.5, -0.5, -0.5, -1, -1, -1],
    ]
    fd = FDataGrid(curves, grid)

    model = FuzzyCMeans()
    model.fit(fd)

    expected_membership = np.array([
        [0.965, 0.035],
        [0.94, 0.06],
        [0.227, 0.773],
        [0.049, 0.951],
    ])
    np.testing.assert_array_equal(
        model.predict(fd).round(3), expected_membership)

    expected_distances = np.array([
        [1.492, 7.879],
        [1.294, 5.127],
        [4.856, 2.633],
        [7.775, 1.759],
    ])
    np.testing.assert_allclose(
        model.transform(fd).round(3), expected_distances)

    expected_centers = np.array([
        [0.707, 0.707, 1.455, 2.467, 1.981, 1.482],
        [-0.695, -0.695, -0.494, -0.197, -0.199, -0.398],
    ])
    # Drop the trailing codomain axis before comparing.
    fitted_centers = model.cluster_centers_.data_matrix[..., 0].round(3)
    np.testing.assert_allclose(fitted_centers, expected_centers)

    np.testing.assert_allclose(model.score(fd), np.array([-12.025179]))
    self.assertEqual(model.n_iter_, 19)
def __init__(self,
             target: pd.DataFrame,
             ref: pd.DataFrame,
             var: str,
             mpt: float = 0.001,
             **kwargs):
    """Estimate both PDFs, detect their peaks and pair them as landmarks.

    Parameters
    ----------
    target: Pandas.DataFrame
        Data to be aligned; passed to ``estimate_pdfs``.
    ref: Pandas.DataFrame
        Reference data; passed to ``estimate_pdfs``.
    var: str
        Variable name forwarded to ``estimate_pdfs``.
    mpt: float (default=0.001)
        Fraction of each curve's maximum used as the ``mph`` argument of
        ``peaks``.
    kwargs:
        Extra keyword arguments forwarded to ``peaks``.
    """
    target_pdf, ref_pdf, grid = estimate_pdfs(target, ref, var)

    target_peaks = peaks(
        target_pdf, grid, mph=mpt * target_pdf.max(), **kwargs)
    ref_peaks = peaks(
        ref_pdf, grid, mph=mpt * ref_pdf.max(), **kwargs)

    # Label every peak with its origin: 0 for target, 1 for reference.
    origin_labels = np.concatenate(
        [[0] * len(target_peaks), [1] * len(ref_peaks)])
    pooled_peaks = np.array([*target_peaks, *ref_peaks])

    self.landmarks = match_landmarks(pooled_peaks, origin_labels)
    self.original_functions = FDataGrid([target_pdf, ref_pdf],
                                        grid_points=grid)
    # Populated when the object is called.
    self.warping_function = None
    self.adjusted_functions = None
    self.landmark_shift_deltas = None
def __init__(self, grid: FDataGrid, smoothed=False):
    """Wrap a copy of ``grid`` and cache its dimensions.

    Args:
        grid: Functional data whose ``data_matrix`` is indexed as
            (series, observation, variable). A copy is stored, so the
            caller's object is not modified by later operations.
        smoothed: Truthy when ``grid`` is already smooth. In that case the
            first and second derivatives of every coordinate grid are
            computed right away.
    """
    self.init_grid = grid.copy()
    self.sample_points = self.init_grid.sample_points[0]

    shape = self.init_grid.data_matrix.shape
    self._nSeries = shape[0]
    self._nObs = shape[1]
    self._nVar = shape[2]

    self.coordinates_grids = list(self.init_grid.coordinates)
    self.coordinate_names = self.init_grid.coordinate_names

    self._smoothed = smoothed
    # Plain truthiness (not ``== True``) so this agrees with the
    # ``if not self._smoothed`` checks made elsewhere on the same flag.
    if self._smoothed:
        self.coordinates_grids_dx1 = [
            coordinate.derivative(order=1)
            for coordinate in self.coordinates_grids
        ]
        self.coordinates_grids_dx2 = [
            coordinate.derivative(order=2)
            for coordinate in self.coordinates_grids
        ]

    self._scaled = False
def compute_arc_length(self):
    """Return the arc length of every series as an ``FDataGrid``.

    Smooths the grids first when that has not been done yet, then feeds
    the first derivatives of all variables of each series to
    ``_calculate_arc_length``.

    Returns:
        FDataGrid labelled ``"arc_length"`` with one row per series,
        sampled at ``self.sample_points``.
    """
    if not self._smoothed:
        self.smooth_grids()

    arc_lengths = np.empty([self._nSeries, self._nObs])
    # Scratch buffer, overwritten for each series: one row per variable.
    first_derivatives = np.empty([self._nVar, self._nObs])

    for series in range(self._nSeries):
        for variable in range(self._nVar):
            derivative_grid = self.coordinates_grids_dx1[variable]
            first_derivatives[variable, :] = (
                derivative_grid.data_matrix[series, :, 0])
        arc_lengths[series, :] = _calculate_arc_length(
            DX1=first_derivatives, t=self.sample_points)

    return FDataGrid(
        data_matrix=arc_lengths,
        sample_points=self.sample_points,
        dataset_label="arc_length",
    )
def test_qr(self):
    """Smooth a sine-plus-cosine curve onto B-splines with the QR solver."""
    points = np.linspace(0, 1, 5)
    values = np.sin(2 * np.pi * points) + np.cos(2 * np.pi * points)
    fd = FDataGrid(data_matrix=values, sample_points=points)

    smoother = smoothing.BasisSmoother(
        basis=BSpline((0, 1), n_basis=5),
        smoothing_parameter=10,
        penalty=2,
        method='qr',
        return_basis=True,
    )
    smoothed = smoother.fit_transform(fd)

    expected_coefficients = np.array([[0.60, 0.47, 0.20, -0.07, -0.20]])
    np.testing.assert_array_almost_equal(
        smoothed.coefficients.round(2), expected_coefficients)
def data_to_basis(self, X, fit_fPCA=True):
    """Project the data to basis functions.

    Parameters
    ----------
    X: array, shape (n,n_points,d)
        Array of paths. It is a 3-dimensional array, containing the
        coordinates in R^d of n piecewise linear paths, each composed of
        n_points.

    fit_fPCA: boolean, default=True
        If self.basis_type='fPCA' and fit_fPCA=True, self.fpca_basis is
        fitted on the basis expansion of X before being used to transform
        it. Ignored for the other basis types.

    Returns
    -------
    fd_basis: object
        For self.basis_type 'bspline' or 'fourier', the result of
        ``FDataGrid.to_basis`` on a vector-valued basis with self.nbasis
        functions per coordinate. For 'fPCA', the output of
        ``self.fpca_basis.transform`` applied to an intermediate B-spline
        expansion.

    Raises
    ------
    ValueError
        If self.basis_type is not 'bspline', 'fourier' or 'fPCA'.
    """
    basis_type = self.basis_type
    # Fail early with a clear message instead of building an empty
    # vector-valued basis for an unknown type.
    if basis_type not in ('bspline', 'fourier', 'fPCA'):
        raise ValueError(
            "Unsupported basis_type {!r}; expected 'bspline', 'fourier' "
            "or 'fPCA'.".format(basis_type))

    # Paths are assumed to be sampled uniformly on [0, 1].
    grid_points = np.linspace(0, 1, X.shape[1])
    fd = FDataGrid(X, grid_points)

    # One independent scalar basis per coordinate of the paths.
    n_coordinates = X.shape[2]
    if basis_type == 'bspline':
        basis_vec = [BSpline(n_basis=self.nbasis)
                     for _ in range(n_coordinates)]
    elif basis_type == 'fourier':
        basis_vec = [Fourier(n_basis=self.nbasis)
                     for _ in range(n_coordinates)]
    else:
        # fPCA works on an intermediate B-spline expansion whose size (7)
        # is fixed here and independent of self.nbasis.
        basis_vec = [BSpline(n_basis=7) for _ in range(n_coordinates)]

    fd_basis = fd.to_basis(VectorValued(basis_vec))

    if basis_type == 'fPCA':
        if fit_fPCA:
            self.fpca_basis = self.fpca_basis.fit(fd_basis)
        fd_basis = self.fpca_basis.transform(fd_basis)

    return fd_basis
def test_cholesky(self):
    """Smooth a sine-plus-cosine curve onto B-splines with Cholesky.

    The penalty is a Tikhonov regularization built from a second-order
    linear differential operator.
    """
    points = np.linspace(0, 1, 5)
    values = np.sin(2 * np.pi * points) + np.cos(2 * np.pi * points)
    fd = FDataGrid(data_matrix=values, sample_points=points)

    second_order_penalty = TikhonovRegularization(
        LinearDifferentialOperator(2))
    smoother = smoothing.BasisSmoother(
        basis=BSpline((0, 1), n_basis=5),
        smoothing_parameter=10,
        regularization=second_order_penalty,
        method='cholesky',
        return_basis=True,
    )
    smoothed = smoother.fit_transform(fd)

    expected_coefficients = np.array([[0.60, 0.47, 0.20, -0.07, -0.20]])
    np.testing.assert_array_almost_equal(
        smoothed.coefficients.round(2), expected_coefficients)
def test_monomial_smoothing(self):
    """Smooth a sine-plus-cosine curve onto a monomial basis."""
    # Smoothing is of little practical use in this simple case, where the
    # fit is already very good; it is applied for testing purposes only.
    points = np.linspace(0, 1, 5)
    values = np.sin(2 * np.pi * points) + np.cos(2 * np.pi * points)
    fd = FDataGrid(data_matrix=values, sample_points=points)

    smoother = smoothing.BasisSmoother(
        basis=Monomial(n_basis=4),
        smoothing_parameter=1,
        penalty=2,
        return_basis=True,
    )
    smoothed = smoother.fit_transform(fd)

    # Reference values were extracted from the R package fda.
    expected_coefficients = np.array([[0.61, -0.88, 0.06, 0.02]])
    np.testing.assert_array_almost_equal(
        smoothed.coefficients.round(2), expected_coefficients)
def scale_grids(self, axis=0, with_std=False):
    """Center (and optionally standardize) every coordinate grid in place.

    Args:
        axis: ``0`` subtracts from each series its own mean over the
            observations. ``1`` subtracts, at every observation index, the
            mean taken across all series at that index.
        with_std: If true, the centered values are also divided by the
            matching standard deviation (``np.std``, i.e. the population
            standard deviation).

    Returns:
        None. The ``data_matrix`` of each grid in
        ``self.coordinates_grids`` is modified in place and
        ``self._scaled`` is set. If the data was already scaled, a message
        is printed and nothing is changed.

    Raises:
        ValueError: If ``axis`` is neither 0 nor 1.
    """
    if self._scaled:
        print("Data was already scaled, no additionnal scale done")
        return None

    # Reject negative axes too: they would silently index the wrong
    # dimension of the data matrix.
    if axis not in (0, 1):
        raise ValueError("axis should be either 0 or 1")

    # Statistics for axis=0 run along the observations (array axis 1) and
    # for axis=1 along the series (array axis 0).
    reduce_axis = 1 - axis

    for grid in self.coordinates_grids:
        values = grid.data_matrix[:, :, 0]
        scaled = values - np.mean(values, axis=reduce_axis, keepdims=True)
        if with_std:
            scaled = scaled / np.std(values, axis=reduce_axis, keepdims=True)
        # Write back into the existing array so the grid object itself is
        # updated; no new grid needs to be built.
        grid.data_matrix[:, :, 0] = scaled

    self._scaled = True
    return None
from skfda import datasets
from skfda.representation.grid import FDataGrid
from skfda.ml.clustering.base_kmeans import KMeans
from skfda.exploratory.visualization.clustering_plots import *

##################################################################################
# First, the Canadian Weather dataset is downloaded from the package 'fda' in CRAN.
# It contains an FDataGrid with daily temperatures and precipitations, that is, it
# has a 2-dimensional image. We are interested only in the daily average temperatures,
# so another FDataGrid is constructed with the desired values.

dataset = datasets.fetch_weather()
fd = dataset["data"]
# Index 0 of the last axis keeps only the first of the two measured quantities.
fd_temperatures = FDataGrid(data_matrix=fd.data_matrix[:, :, 0],
                            sample_points=fd.sample_points,
                            dataset_label=fd.dataset_label,
                            axes_labels=fd.axes_labels[0:2])

# The desired FDataGrid only contains a fixed subset of 10 samples, so that the
# example provides clearer plots.
# NOTE(review): `np` is not imported explicitly in the visible lines; presumably it
# comes in through the star import above -- confirm.

indices_samples = np.array([1, 3, 5, 10, 14, 17, 21, 25, 27, 30])
fd = fd_temperatures[indices_samples]

############################################################################################
# The data is plotted to show the curves we are working with. They are divided according to the
# target. In this case, it includes the different climates to which the weather stations belong.

climate_by_sample = [dataset["target"][i] for i in indices_samples]

# Note that the samples chosen belong to three of the four possible target groups. By
# coincidence, these three groups correspond to indices 1, 2, 3, that is why the indices
# (`climate_by_sample`) are decremented in 1. In case of reproducing the example with other
class LandmarkReg:
    """
    One technique for handling technical variation in cytometry data is local normalisation by
    aligning the probability density function of some data to a reference sample. This should be
    applied to a population immediately prior to applying a gate.

    The alignment algorithm is inspired by previous work [1, 2] and is performed as follows:
    1. The probability density function of some target data and a reference sample are estimated
       using a convolution based fast kernel density estimation algorithm (KDEpy.FFTKDE)
    2. Landmarks are identified in both samples as peaks of local maximal density.
    3. The peaks from both target and reference are combined and clustered using K means clustering;
       the number of clusters is chosen as the number of peaks identified in the target
    4. Unique pairings of peaks between samples, closest to the centroid of a cluster, are generated
       and used as landmarks.
    5. Landmark registration is performed using the Scikit-FDA package to generate a warping function,
       with the target location being the mean between paired peaks
    6. The warping function is applied to the target data, generating a new adjusted vector with high
       density regions matched to the reference sample

    [1] Hahne F, Khodabakhshi AH, Bashashati A, Wong CJ, Gascoyne RD, Weng AP, Seyfert-Margolis V,
    Bourcier K, Asare A, Lumley T, Gentleman R, Brinkman RR. Per-channel basis normalization methods
    for flow cytometry data. Cytometry A. 2010 Feb;77(2):121-31. doi: 10.1002/cyto.a.20823.
    PMID: 19899135; PMCID: PMC3648208.
    [2] Finak G, Jiang W, Krouse K, et al. High-throughput flow cytometry data normalization for
    clinical trials. Cytometry A. 2014;85(3):277-286. doi:10.1002/cyto.a.22433

    Parameters
    ----------
    target: Pandas.DataFrame
        Target data to be transformed; must contain column corresponding to 'var'
    ref: Pandas.DataFrame
        Reference data for computing alignment; must contain column corresponding to 'var'
    var: str
        Name of the target variable to align
    mpt: float (default=0.001)
        Minimum peak threshold; peaks that are less than the given percentage of the 'highest' peak
        (max density) will be ignored. Use this to remove small perturbations.
    kwargs:
        Additional keyword arguments passed to cytopy.flow.fda_norm.peaks call

    Attributes
    ----------
    landmarks: numpy.ndarray
        (2, n) array, where n is the number of clusters. Order conserved between samples; first row
        is peaks from target, second row is peaks from reference.
    original_functions: skfda.representation.grid.FDataGrid
        Original PDFs for target and reference
    warping_function: skfda.representation.grid.FDataGrid
        Warping function
    adjusted_functions: skfda.representation.grid.FDataGrid
        Registered curves following function composition of original PDFs and warping function
    landmark_shift_deltas: numpy.ndarray
        Corresponding shifts to align the landmarks of the PDFs described in original_functions
    """

    def __init__(self,
                 target: pd.DataFrame,
                 ref: pd.DataFrame,
                 var: str,
                 mpt: float = 0.001,
                 **kwargs):
        y1, y2, x = estimate_pdfs(target, ref, var)
        # The peak-height threshold is relative to each curve's own maximum.
        target_peaks, ref_peaks = (
            peaks(y, x, mph=mpt * y.max(), **kwargs) for y in (y1, y2)
        )
        # Label every peak with its origin: 0 for target, 1 for reference.
        plabels = np.concatenate(
            [[0] * len(target_peaks), [1] * len(ref_peaks)])
        pooled_peaks = np.array([*target_peaks, *ref_peaks])
        self.landmarks = match_landmarks(pooled_peaks, plabels)
        self.original_functions = FDataGrid([y1, y2], grid_points=x)
        # The attributes below are populated by __call__.
        self.warping_function = None
        self.adjusted_functions = None
        self.landmark_shift_deltas = None

    def __call__(self):
        """
        Calculate the warping function, registered curves and landmark shift deltas

        Returns
        -------
        self
        """
        # Each landmark pair is moved to the mean of its two peaks.
        self.warping_function = landmark_registration_warping(
            self.original_functions,
            self.landmarks,
            location=np.mean(self.landmarks, axis=0))
        self.adjusted_functions = self.original_functions.compose(
            self.warping_function)
        self.landmark_shift_deltas = landmark_shift_deltas(
            self.original_functions, self.landmarks)
        return self

    def plot_warping(self, ax: "list | np.ndarray | None" = None):
        """
        Generate a figure that plots the PDFs prior to landmark registration, the warping function,
        and the registered curves.

        Parameters
        ----------
        ax: sequence of three Matplotlib.Axes, optional
            A list or a numpy array of axes (such as the one returned by
            ``plt.subplots(1, 3)``). A new 1x3 figure is created when omitted.

        Returns
        -------
        The sequence of three Matplotlib.Axes that was drawn on

        Raises
        ------
        AssertionError
            If the object has not been called yet, or if ``ax`` does not hold exactly three axes
        """
        # Raised explicitly (instead of ``assert``) so the checks are kept under ``python -O``.
        if self.warping_function is None:
            raise AssertionError("Call object prior to plot")
        # ``ax or ...`` would fail on a numpy array of axes, whose truth value is ambiguous.
        if ax is None:
            ax = plt.subplots(1, 3, figsize=(15, 4))[1]
        if len(ax) != 3:
            raise AssertionError("Must provide exactly 3 axis objects")
        self.original_functions.plot(axes=ax[0])
        ax[0].set_title("Before")
        self.warping_function.plot(axes=ax[1])
        ax[1].set_title("Warping function")
        self.adjusted_functions.plot(axes=ax[2])
        ax[2].set_title("After")
        ax[0].legend(labels=["Target", "Reference"])
        return ax

    def shift_data(self, x: np.ndarray):
        """
        Provided the original vector of data to transform, use the warping function to
        normalise the data and align the reference.

        Parameters
        ----------
        x: numpy.ndarray

        Returns
        -------
        numpy.ndarray
            Flattened evaluation of the warping function at ``x``

        Raises
        ------
        AssertionError
            If the class has not been called and therefore a warping function has not been defined
        """
        # Raised explicitly (instead of ``assert``) so the check is kept under ``python -O``.
        if self.warping_function is None:
            raise AssertionError("No warping function defined")
        # Index 1 selects the second sample of the evaluated warping function.
        return self.warping_function.evaluate(x)[1].reshape(-1)

    def plot_shift(self, x: np.ndarray, ax: "plt.Axes | None" = None):
        """
        Plot the original PDFs and overlay the density of the target data after landmark
        registration.

        Parameters
        ----------
        x: numpy.ndarray
            Target data
        ax: Matplotlib.Axes, optional
            A new 5x5 figure is created when omitted

        Returns
        -------
        Matplotlib.Axes

        Raises
        ------
        AssertionError
            If the class has not been called and therefore a warping function has not been defined
        """
        # Shift first so that a missing warping function fails before any figure is created.
        shifted = self.shift_data(x)
        if ax is None:
            ax = plt.subplots(figsize=(5, 5))[1]
        # The evaluation grid spans the range of the data before shifting, padded by 0.1 on each side.
        grid = np.linspace(np.min(x) - 0.1, np.max(x) + 0.1, 10000)
        shifted_pdf = (FFTKDE(kernel="gaussian", bw="silverman")
                       .fit(shifted)
                       .evaluate(grid))
        self.original_functions.plot(axes=ax)
        ax.plot(grid, shifted_pdf)
        ax.legend(labels=["Before", "Ref", "After"])
        return ax
# var_angles = [np.var(x, axis=1) for x in angles] # min_angles = [mean_angles[i] - var_angles[i] for i in range(len(mean_angles))] # max_angles = [mean_angles[i] + var_angles[i] for i in range(len(mean_angles))] # torques = [np.array(x[101:202,:]) for x in dfs] # mean_torques = [np.mean(x, axis=1) for x in torques] # var_torques = [np.var(x, axis=1) for x in torques] # min_torques = [mean_torques[i] - var_torques[i] for i in range(len(mean_torques))] # max_torques = [mean_torques[i] + var_torques[i] for i in range(len(mean_torques))] # plt.plot(angles[0]) # plt.show() # a0 = angles[0].T a0 = ka.T df = FDataGrid(a0) dataset = skfda.datasets.fetch_growth() # y = dataset['target'] # fd = dataset['data'] # df = dataset['data'] fd = copy.deepcopy(df) fd.plot() plt.show() ############################################################################## # FPCA can be done in two ways. The first way is to operate directly with the # raw data. We call it discretized FPCA as the functional data in this case # consists in finite values dispersed over points in a domain range. # We initialize and setup the FPCADiscretized object and run the fit method to # obtain the first two components. By default, if we do not specify the number
def _fit_grid(self, X: FDataGrid, y=None):
    r"""Computes the n_components first principal components and saves them.

    The eigenvalues associated with these principal components are also
    saved. For more details about how it is implemented please view the
    referenced book, chapter 8.

    In summary, we are performing standard multivariate PCA over
    :math:`\frac{1}{\sqrt{N}} \mathbf{X} \mathbf{W}^{1/2}` where :math:`N`
    is the number of samples in the dataset, :math:`\mathbf{X}` is the data
    matrix and :math:`\mathbf{W}` is the weight matrix (this matrix
    defines the numerical integration). By default the weight matrix is
    obtained using the trapezoidal rule.

    Args:
        X (FDataGrid): the functional data object to be analysed, in
            discretized (grid) representation.
        y (None, not used): only present for convention of a fit function

    Returns:
        self (object)

    Raises:
        AttributeError: if ``n_components`` is larger than the number of
            samples or than the number of discretization points.

    Note:
        ``self.weights`` is overwritten with the weight vector actually
        used, so a later call reuses that stored vector.

    References:
        .. [RS05-8-4-1] Ramsay, J., Silverman, B. W. (2005). Discretizing
            the functions. In *Functional Data Analysis* (p. 161).
            Springer.
    """
    # check that the number of components is smaller than the sample size
    if self.n_components > X.n_samples:
        raise AttributeError("The sample size must be bigger than the "
                             "number of components")

    # check that we do not exceed limits for n_components as it should
    # be smaller than the number of attributes of the functional data object
    if self.n_components > X.data_matrix.shape[1]:
        raise AttributeError("The number of components should be "
                             "smaller than the number of discretization "
                             "points of the functional data object.")

    # if centering is True then subtract the mean function from each
    # function in the FDataGrid
    X = self._center_if_necessary(X)

    # data matrix initialization; taken after centering so that the
    # centered curves are the ones analysed
    fd_data = X.data_matrix.reshape(X.data_matrix.shape[:-1])

    # get the number of samples and the number of points of discretization
    n_samples, n_points_discretization = fd_data.shape

    # establish weights for each point of discretization.
    # ``is None`` rather than truthiness: an array of weights (user
    # supplied, or stored by a previous fit) has no unambiguous truth value.
    if self.weights is None:
        # sample_points is a list with one array in the 1D case
        # in trapezoidal rule, suppose \deltax_k = x_k - x_{k-1}, the weight
        # vector is as follows: [\deltax_1/2, \deltax_1/2 + \deltax_2/2,
        # \deltax_2/2 + \deltax_3/2, ... , \deltax_n/2]
        differences = np.diff(X.sample_points[0])
        differences = np.concatenate(((0, ), differences, (0, )))
        self.weights = (differences[:-1] + differences[1:]) / 2
    elif callable(self.weights):
        self.weights = self.weights(X.sample_points[0])

    # if it is a FDataGrid then we need to reduce the dimension to a 1-D
    # array
    if isinstance(self.weights, FDataGrid):
        self.weights = np.squeeze(self.weights.data_matrix)

    weights_matrix = np.diag(self.weights)
    sqrt_weights_matrix = np.sqrt(weights_matrix)

    # the identity on the grid acts as the basis for the penalty matrix
    basis = FDataGrid(data_matrix=np.identity(n_points_discretization),
                      sample_points=X.sample_points)

    regularization_matrix = compute_penalty_matrix(
        basis_iterable=(basis, ),
        regularization_parameter=1,
        regularization=self.regularization)

    fd_data = np.transpose(
        np.linalg.solve(
            np.transpose(basis.data_matrix[..., 0] + regularization_matrix),
            np.transpose(fd_data)))

    # see docstring for more information
    final_matrix = fd_data @ sqrt_weights_matrix / np.sqrt(n_samples)

    pca = PCA(n_components=self.n_components)
    pca.fit(final_matrix)

    # undo the W^{1/2} scaling to express the components on the grid
    self.components_ = X.copy(data_matrix=np.transpose(
        np.linalg.solve(sqrt_weights_matrix,
                        np.transpose(pca.components_))))
    self.explained_variance_ratio_ = pca.explained_variance_ratio_
    self.explained_variance_ = pca.explained_variance_

    return self