def test_XX_weighted_meanconst(self):
    est = OnlineCovariance(compute_c0t=False, bessel=False)
    cc = est.fit(self.data - self.mean_const, weights=self.data_weights).fetch_model()
    np.testing.assert_allclose(cc.mean_0, self.mx_c_wobj_lag0)
    np.testing.assert_allclose(cc.cov_00, self.Mxx_c_wobj_lag0)
    cc = est.fit(self.data - self.mean_const, weights=self.data_weights,
                 column_selection=self.cols_2).fetch_model()
    np.testing.assert_allclose(cc.cov_00, self.Mxx_c_wobj_lag0[:, self.cols_2])

def test_XX_weightobj_meanfree(self):
    # many passes
    est = OnlineCovariance(compute_c0t=False, remove_data_mean=True, bessel=False)
    cc = est.fit(self.data, weights=self.data_weights, n_splits=10).fetch_model()
    np.testing.assert_allclose(cc.mean_0, self.mx_wobj_lag0)
    np.testing.assert_allclose(cc.cov_00, self.Mxx0_wobj_lag0)
    cc = est.fit(self.data, column_selection=self.cols_2, weights=self.data_weights).fetch_model()
    np.testing.assert_allclose(cc.cov_00, self.Mxx0_wobj_lag0[:, self.cols_2])

def test_XX_with_mean(self):
    # many passes
    est = OnlineCovariance(compute_c0t=False, remove_data_mean=False, bessel=False)
    cc = est.fit(self.data).fetch_model()
    np.testing.assert_allclose(cc.mean_0, self.mx_lag0)
    np.testing.assert_allclose(cc.cov_00, self.Mxx_lag0)
    cc = est.fit(self.data, column_selection=self.cols_2).fetch_model()
    np.testing.assert_allclose(cc.cov_00, self.Mxx_lag0[:, self.cols_2])

def test_XY_sym_meanconst(self):
    est = OnlineCovariance(compute_c0t=True, reversible=True, bessel=False)
    cc = est.fit((self.X - self.mean_const, self.Y - self.mean_const)).fetch_model()
    np.testing.assert_allclose(cc.mean_0, self.m_c_sym)
    np.testing.assert_allclose(cc.cov_00, self.Mxx_c_sym)
    np.testing.assert_allclose(cc.cov_0t, self.Mxy_c_sym)
    cc = est.fit((self.X - self.mean_const, self.Y - self.mean_const),
                 column_selection=self.cols_2).fetch_model()
    np.testing.assert_allclose(cc.cov_00, self.Mxx_c_sym[:, self.cols_2])
    np.testing.assert_allclose(cc.cov_0t, self.Mxy_c_sym[:, self.cols_2])

def test_XXXY_weightobj_sym_meanfree(self):
    # many passes
    est = OnlineCovariance(remove_data_mean=True, compute_c0t=True, reversible=True, bessel=False)
    cc = est.fit((self.X, self.Y), weights=self.data_weights_lagged).fetch_model()
    np.testing.assert_allclose(cc.mean_0, self.m_sym_wobj)
    np.testing.assert_allclose(cc.cov_00, self.Mxx0_sym_wobj)
    np.testing.assert_allclose(cc.cov_0t, self.Mxy0_sym_wobj)
    cc = est.fit((self.X, self.Y), weights=self.data_weights_lagged,
                 column_selection=self.cols_2).fetch_model()
    np.testing.assert_allclose(cc.cov_00, self.Mxx0_sym_wobj[:, self.cols_2])
    np.testing.assert_allclose(cc.cov_0t, self.Mxy0_sym_wobj[:, self.cols_2])

def test_XXXY_sym_withmean(self):
    # many passes
    est = OnlineCovariance(remove_data_mean=False, compute_c0t=True, reversible=True, bessel=False)
    cc = est.fit((self.X, self.Y)).fetch_model()
    np.testing.assert_allclose(cc.mean_0, self.m_sym)
    np.testing.assert_allclose(cc.cov_00, self.Mxx_sym)
    np.testing.assert_allclose(cc.cov_0t, self.Mxy_sym)
    cc = est.fit((self.X, self.Y), column_selection=self.cols_2).fetch_model()
    np.testing.assert_allclose(cc.cov_00, self.Mxx_sym[:, self.cols_2])
    np.testing.assert_allclose(cc.cov_0t, self.Mxy_sym[:, self.cols_2])

def test_XX_meanconst(self):
    est = OnlineCovariance(lagtime=self.lag, compute_c0t=False, bessels_correction=False)
    cc = est.fit(self.data - self.mean_const).fetch_model()
    np.testing.assert_allclose(cc.mean_0, self.mx_c_lag0)
    np.testing.assert_allclose(cc.cov_00, self.Mxx_c_lag0)
    cc = est.fit(self.data - self.mean_const, column_selection=self.cols_2).fetch_model()
    np.testing.assert_allclose(cc.cov_00, self.Mxx_c_lag0[:, self.cols_2])

def test_XXXY_meanfree(self):
    # many passes
    est = OnlineCovariance(remove_data_mean=True, compute_c0t=True, bessel=False)
    cc = est.fit((self.X, self.Y)).fetch_model()
    np.testing.assert_allclose(cc.mean_0, self.mx)
    np.testing.assert_allclose(cc.mean_t, self.my)
    np.testing.assert_allclose(cc.cov_00, self.Mxx0)
    np.testing.assert_allclose(cc.cov_0t, self.Mxy0)
    cc = est.fit((self.X, self.Y), column_selection=self.cols_2).fetch_model()
    np.testing.assert_allclose(cc.cov_00, self.Mxx0[:, self.cols_2])
    np.testing.assert_allclose(cc.cov_0t, self.Mxy0[:, self.cols_2])

def test_XY_meanconst(self):
    est = OnlineCovariance(lagtime=self.lag, compute_c0t=True, bessels_correction=False)
    cc = est.fit(self.Xc_lag0).fetch_model()
    np.testing.assert_allclose(cc.mean_0, self.mx_c)
    np.testing.assert_allclose(cc.mean_t, self.my_c)
    np.testing.assert_allclose(cc.cov_00, self.Mxx_c)
    np.testing.assert_allclose(cc.cov_0t, self.Mxy_c)
    cc = est.fit(self.Xc_lag0, column_selection=self.cols_2).fetch_model()
    np.testing.assert_allclose(cc.cov_00, self.Mxx_c[:, self.cols_2])
    np.testing.assert_allclose(cc.cov_0t, self.Mxy_c[:, self.cols_2])

def test_XY_weighted_meanconst(self):
    est = OnlineCovariance(compute_c0t=True, bessel=False)
    cc = est.fit((self.X - self.mean_const, self.Y - self.mean_const),
                 weights=self.data_weights_lagged).fetch_model()
    np.testing.assert_allclose(cc.mean_0, self.mx_c_wobj)
    np.testing.assert_allclose(cc.mean_t, self.my_c_wobj)
    np.testing.assert_allclose(cc.cov_00, self.Mxx_c_wobj)
    np.testing.assert_allclose(cc.cov_0t, self.Mxy_c_wobj)
    cc = est.fit((self.X - self.mean_const, self.Y - self.mean_const),
                 weights=self.data_weights_lagged, column_selection=self.cols_2).fetch_model()
    np.testing.assert_allclose(cc.cov_00, self.Mxx_c_wobj[:, self.cols_2])
    np.testing.assert_allclose(cc.cov_0t, self.Mxy_c_wobj[:, self.cols_2])

def test_XXXY_sym_meanfree(self):
    # many passes
    est = OnlineCovariance(lagtime=self.lag, remove_data_mean=True, compute_c0t=True,
                           reversible=True, bessels_correction=False)
    cc = est.fit(self.data, lagtime=self.lag).fetch_model()
    np.testing.assert_allclose(cc.mean_0, self.m_sym)
    np.testing.assert_allclose(cc.cov_00, self.Mxx0_sym)
    np.testing.assert_allclose(cc.cov_0t, self.Mxy0_sym)
    cc = est.fit(self.data, column_selection=self.cols_2).fetch_model()
    np.testing.assert_allclose(cc.cov_00, self.Mxx0_sym[:, self.cols_2])
    np.testing.assert_allclose(cc.cov_0t, self.Mxy0_sym[:, self.cols_2])

def test_XXXY_withmean(self):
    # many passes
    est = OnlineCovariance(lagtime=self.lag, remove_data_mean=False, compute_c0t=True,
                           bessels_correction=False)
    cc = est.fit(self.data, n_splits=1).fetch_model()
    assert not cc.bessels_correction
    np.testing.assert_allclose(cc.mean_0, self.mx)
    np.testing.assert_allclose(cc.mean_t, self.my)
    np.testing.assert_allclose(cc.cov_00, self.Mxx)
    np.testing.assert_allclose(cc.cov_0t, self.Mxy)
    cc = est.fit(self.data, n_splits=1, column_selection=self.cols_2).fetch_model()
    np.testing.assert_allclose(cc.cov_00, self.Mxx[:, self.cols_2])
    np.testing.assert_allclose(cc.cov_0t, self.Mxy[:, self.cols_2])

def test_XXXY_weightobj_withmean(self):
    # many passes
    est = OnlineCovariance(lagtime=self.lag, remove_data_mean=False, compute_c0t=True,
                           bessels_correction=False)
    cc = est.fit(self.data, weights=self.data_weights).fetch_model()
    np.testing.assert_allclose(cc.mean_0, self.mx_wobj)
    np.testing.assert_allclose(cc.mean_t, self.my_wobj)
    np.testing.assert_allclose(cc.cov_00, self.Mxx_wobj)
    np.testing.assert_allclose(cc.cov_0t, self.Mxy_wobj)
    cc = est.fit(self.data, weights=self.data_weights, column_selection=self.cols_2).fetch_model()
    np.testing.assert_allclose(cc.cov_00, self.Mxx_wobj[:, self.cols_2])
    np.testing.assert_allclose(cc.cov_0t, self.Mxy_wobj[:, self.cols_2])

def test_XY_sym_weighted_meanconst(self):
    est = OnlineCovariance(lagtime=self.lag, compute_c0t=True, reversible=True,
                           bessels_correction=False)
    cc = est.fit(self.Xc_lag0, n_splits=1, weights=self.data_weights).fetch_model()
    np.testing.assert_allclose(cc.mean_0, self.m_c_sym_wobj)
    np.testing.assert_allclose(cc.cov_00, self.Mxx_c_sym_wobj)
    np.testing.assert_allclose(cc.cov_0t, self.Mxy_c_sym_wobj)
    cc = est.fit(self.Xc_lag0, weights=self.data_weights, n_splits=1,
                 column_selection=self.cols_2).fetch_model()
    np.testing.assert_allclose(cc.cov_00, self.Mxx_c_sym_wobj[:, self.cols_2])
    np.testing.assert_allclose(cc.cov_0t, self.Mxy_c_sym_wobj[:, self.cols_2])

def test_XXXY_weightobj_meanfree(self):
    # TODO: tests do not pass for n_splits > 1!
    # many passes
    est = OnlineCovariance(lagtime=self.lag, remove_data_mean=True, compute_c0t=True,
                           bessels_correction=False)
    cc = est.fit(self.data, weights=self.data_weights, n_splits=1).fetch_model()
    np.testing.assert_allclose(cc.mean_0, self.mx_wobj)
    np.testing.assert_allclose(cc.mean_t, self.my_wobj)
    np.testing.assert_allclose(cc.cov_00, self.Mxx0_wobj)
    np.testing.assert_allclose(cc.cov_0t, self.Mxy0_wobj)
    cc = est.fit(self.data, weights=self.data_weights, column_selection=self.cols_2,
                 n_splits=1).fetch_model()
    np.testing.assert_allclose(cc.cov_00, self.Mxx0_wobj[:, self.cols_2])
    np.testing.assert_allclose(cc.cov_0t, self.Mxy0_wobj[:, self.cols_2])

def test_re_estimate_weight_types(self):
    # check that different weight types are accepted and that re-estimation works
    x = np.random.random((100, 2))
    c = OnlineCovariance(lagtime=1, compute_c0t=True)
    c.fit(x, weights=np.ones((len(x),))).fetch_model()
    c.fit(x, weights=np.ones((len(x),))).fetch_model()
    c.fit(x, weights=None).fetch_model()
    c.fit(x, weights=x[:, 0]).fetch_model()
    assert np.all(model_copy.mean_0 == model.mean_0) and model_copy is not model


################################################################################################
# compute covariance matrix C0t
################################################################################################

tau = 10
# configure the estimator with estimator-global parameters
estimator = OnlineCovariance(compute_c00=False, compute_c0t=True, remove_data_mean=True)
# do one-shot estimation by giving a tuple of instantaneous and time-shifted data
# (also possible with partial_fit)
estimator.fit((data[:-tau], data[tau:]))
# finalize and retrieve the model
model = estimator.fetch_model()
print(model.cov_0t)

################################################################################################
# outlook for transformers
################################################################################################
# calls the transform method on the model
# estimator.transform(data)
# ... and is equivalent to
# model.transform(data)
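################################################################################################
# streaming variant of the estimation above (sketch)
################################################################################################
# A minimal sketch of the "(also possible with partial_fit)" remark above, assuming that
# OnlineCovariance.partial_fit accepts the same (instantaneous, time-shifted) tuple as fit
# and that OnlineCovariance is importable as in the script above. The placeholder trajectory
# and the chunk size are illustrative, not part of the original code.
import numpy as np

traj = np.random.default_rng(42).normal(size=(5000, 3))  # placeholder trajectory
tau = 10
chunk = 1000

streaming_est = OnlineCovariance(compute_c00=False, compute_c0t=True, remove_data_mean=True)
x_inst, x_shifted = traj[:-tau], traj[tau:]
for start in range(0, len(x_inst), chunk):
    stop = start + chunk
    streaming_est.partial_fit((x_inst[start:stop], x_shifted[start:stop]))

streaming_model = streaming_est.fetch_model()
print(streaming_model.cov_0t)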
class TICA(Estimator, Transformer):
    r""" Time-lagged independent component analysis (TICA) [1]_, [2]_, [3]_.

    Parameters
    ----------
    lagtime : int
        the lag time
    dim : int or float, optional, default 0.95
        Number of dimensions (independent components) to project onto.

        * if dim is not set (None) all available ranks are kept:
          `n_components == min(n_samples, n_uncorrelated_features)`
        * if dim is an integer >= 1, this number specifies the number of dimensions to keep.
        * if dim is a float with ``0 < dim <= 1``, select the number of dimensions such that
          the amount of kinetic variance that needs to be explained is greater than the
          percentage specified by dim.
    epsilon : float
        eigenvalue norm cutoff. Eigenvalues of :math:`C_0` with norms <= epsilon will be cut
        off. The remaining number of eigenvalues defines the size of the output.
    reversible : bool, default=True
        symmetrize the correlation matrices :math:`C_0` and :math:`C_{\tau}`.
    scaling : str or None, default='kinetic_map'
        * None: unscaled.
        * 'kinetic_map': Eigenvectors will be scaled by eigenvalues. As a result, Euclidean
          distances in the transformed data approximate kinetic distances [4]_. This is a
          good choice when the data is further processed by clustering.
        * 'commute_map': Eigenvector i will be scaled by sqrt(timescale_i / 2). As a result,
          Euclidean distances in the transformed data will approximate commute distances [5]_.

    Notes
    -----
    Given a sequence of multivariate data :math:`X_t`, computes the mean-free covariance and
    time-lagged covariance matrix:

    .. math::

        C_0 &= (X_t - \mu)^T (X_t - \mu) \\
        C_{\tau} &= (X_t - \mu)^T (X_{t + \tau} - \mu)

    and solves the eigenvalue problem

    .. math:: C_{\tau} r_i = C_0 \lambda_i(\tau) r_i,

    where :math:`r_i` are the independent components and :math:`\lambda_i(\tau)` are their
    respective normalized time-autocorrelations. The eigenvalues are related to the relaxation
    timescales by

    .. math:: t_i(\tau) = -\tau / \ln |\lambda_i|.

    When used as a dimension reduction method, the input data is projected onto the dominant
    independent components.

    References
    ----------
    .. [1] Perez-Hernandez G, F Paul, T Giorgino, G De Fabritiis and F Noe. 2013.
       Identification of slow molecular order parameters for Markov model construction.
       J. Chem. Phys. 139, 015102. doi:10.1063/1.4811489
    .. [2] Schwantes C, V S Pande. 2013. Improvements in Markov State Model Construction
       Reveal Many Non-Native Interactions in the Folding of NTL9.
       J. Chem. Theory. Comput. 9, 2000-2009. doi:10.1021/ct300878a
    .. [3] L. Molgedey and H. G. Schuster. 1994. Separation of a mixture of independent
       signals using time delayed correlations. Phys. Rev. Lett. 72, 3634.
    .. [4] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular
       dynamics simulation. J. Chem. Theory. Comput. doi:10.1021/acs.jctc.5b00553
    .. [5] Noe, F., Banisch, R., Clementi, C. 2016. Commute maps: separating slowly-mixing
       molecular configurations for kinetic modeling. J. Chem. Theory. Comput.
       doi:10.1021/acs.jctc.6b00762
    """

    def __init__(self, lagtime: int, epsilon=1e-6, reversible=True, dim=0.95,
                 scaling='kinetic_map', ncov=5):
        super(TICA, self).__init__()
        # tica parameters
        self._model.epsilon = epsilon
        self._model.dim = dim
        self._model.scaling = scaling
        self._model.lagtime = lagtime
        # online covariance parameters
        self.reversible = reversible
        self._covar = OnlineCovariance(compute_c00=True, compute_c0t=True, compute_ctt=False,
                                       remove_data_mean=True, reversible=self.reversible,
                                       bessel=False, ncov=ncov)

    def _create_model(self) -> TICAModel:
        return TICAModel()

    def transform(self, data):
        r"""Projects the data onto the dominant independent components.

        Parameters
        ----------
        data : ndarray(n, m)
            the input data

        Returns
        -------
        Y : ndarray(n,)
            the projected data
        """
        return self.fetch_model().transform(data)

    def partial_fit(self, X):
        """ Incrementally update the covariances and mean.

        Parameters
        ----------
        X : array, list of arrays
            input data.
        """
        self._covar.partial_fit(X)
        return self

    def fit(self, X, **kw):
        self._covar.fit(X, **kw)
        return self

    def fetch_model(self) -> TICAModel:
        covar_model = self._covar.fetch_model()
        self._model.cov_00 = covar_model.cov_00
        self._model.cov_0t = covar_model.cov_0t
        self._model.mean_0 = covar_model.mean_0
        return self._model
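################################################################################################
# TICA eigenvalue problem spelled out with numpy/scipy (sketch)
################################################################################################
# A minimal, self-contained illustration of the Notes in the TICA docstring above, kept
# independent of the estimator classes: estimate the mean-free covariances C_0 and C_tau,
# solve C_tau r_i = C_0 lambda_i r_i, and convert eigenvalues to implied timescales
# t_i = -tau / ln|lambda_i|. The placeholder trajectory and all variable names are illustrative.
import numpy as np
from scipy.linalg import eigh

rng = np.random.default_rng(0)
traj = rng.normal(size=(10000, 3)).cumsum(axis=0)  # placeholder trajectory with slow drift
tau = 10

x0, xt = traj[:-tau], traj[tau:]
mu = 0.5 * (x0.mean(axis=0) + xt.mean(axis=0))     # combined mean (reversible estimate)
C0 = (x0 - mu).T @ (x0 - mu) / len(x0)
Ctau = (x0 - mu).T @ (xt - mu) / len(x0)
Ctau = 0.5 * (Ctau + Ctau.T)                       # symmetrize, cf. reversible=True

eigvals, eigvecs = eigh(Ctau, C0)                  # generalized eigenvalue problem
order = np.argsort(np.abs(eigvals))[::-1]          # sort by autocorrelation magnitude
eigvals, eigvecs = eigvals[order], eigvecs[:, order]
timescales = -tau / np.log(np.abs(eigvals))        # implied relaxation timescales
ics = (traj - mu) @ eigvecs                        # projection onto independent components
print(timescales)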
class VAMP(Estimator):
    r"""Variational approach for Markov processes (VAMP)"""

    def __init__(self, lagtime=1, dim=None, scaling=None, right=False, epsilon=1e-6,
                 ncov=float('inf')):
        r""" Variational approach for Markov processes (VAMP) [1]_.

        Parameters
        ----------
        dim : float or int, default=None
            Number of dimensions to keep:

            * if dim is not set (None) all available ranks are kept:
              `n_components == min(n_samples, n_uncorrelated_features)`
            * if dim is an integer >= 1, this number specifies the number of dimensions
              to keep.
            * if dim is a float with ``0 < dim < 1``, select the number of dimensions such
              that the amount of kinetic variance that needs to be explained is greater than
              the percentage specified by dim.
        scaling : None or string
            Scaling to be applied to the VAMP order parameters upon transformation:

            * None: no scaling will be applied, variance of the order parameters is 1
            * 'kinetic map' or 'km': order parameters are scaled by the singular values.
              Only the left singular functions induce a kinetic map wrt the conventional
              forward propagator. The right singular functions induce a kinetic map wrt the
              backward propagator.
        right : boolean
            Whether to compute the right singular functions. If `right==True`, `get_output()`
            will return the right singular functions; otherwise it will return the left
            singular functions. Beware that only `frames[tau:, :]` of each trajectory returned
            by `get_output()` contain valid values of the right singular functions.
            Conversely, only `frames[0:-tau, :]` of each trajectory returned by `get_output()`
            contain valid values of the left singular functions. The remaining frames might
            possibly be interpreted as some extrapolation.
        epsilon : float
            eigenvalue cutoff. Eigenvalues of :math:`C_{00}` and :math:`C_{11}` with norms
            <= epsilon will be cut off. The remaining number of eigenvalues together with the
            value of `dim` define the size of the output.
        ncov : int, default=infinity
            limit the memory usage of the algorithm from [3]_ to an amount that corresponds
            to ncov additional copies of each correlation matrix

        Notes
        -----
        VAMP is a method for dimensionality reduction of Markov processes.

        The Koopman operator :math:`\mathcal{K}` is an integral operator that describes
        conditional future expectation values. Let :math:`p(\mathbf{x},\,\mathbf{y})` be the
        conditional probability density of visiting an infinitesimal phase space volume around
        point :math:`\mathbf{y}` at time :math:`t+\tau`, given that the phase space point
        :math:`\mathbf{x}` was visited at the earlier time :math:`t`. Then the action of the
        Koopman operator on a function :math:`f` can be written as follows:

        .. math::

            \mathcal{K}f = \int p(\mathbf{x},\,\mathbf{y}) f(\mathbf{y}) \,\mathrm{d}\mathbf{y}
                         = \mathbb{E}\left[f(\mathbf{x}_{t+\tau}) \mid \mathbf{x}_{t} = \mathbf{x}\right]

        The Koopman operator is defined without any reference to an equilibrium distribution.
        Therefore it is well-defined in situations where the dynamics is irreversible and/or
        non-stationary, such that no equilibrium distribution exists.

        If we approximate :math:`f` by a linear superposition of ansatz functions
        :math:`\boldsymbol{\chi}` of the conformational degrees of freedom (features), the
        operator :math:`\mathcal{K}` can be approximated by a (finite-dimensional) matrix
        :math:`\mathbf{K}`.

        The approximation is computed as follows: from the time-dependent input features
        :math:`\boldsymbol{\chi}(t)`, we compute the mean :math:`\boldsymbol{\mu}_{0}`
        (:math:`\boldsymbol{\mu}_{1}`) from all data excluding the last (first) :math:`\tau`
        steps of every trajectory:

        .. math::

            \boldsymbol{\mu}_{0} &:= \frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\boldsymbol{\chi}(t) \\
            \boldsymbol{\mu}_{1} &:= \frac{1}{T-\tau}\sum_{t=\tau}^{T}\boldsymbol{\chi}(t)

        Next, we compute the instantaneous covariance matrices :math:`\mathbf{C}_{00}` and
        :math:`\mathbf{C}_{11}` and the time-lagged covariance matrix :math:`\mathbf{C}_{01}`:

        .. math::

            \mathbf{C}_{00} &:= \frac{1}{T-\tau}\sum_{t=0}^{T-\tau}
                \left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]
                \left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]^{\top} \\
            \mathbf{C}_{11} &:= \frac{1}{T-\tau}\sum_{t=\tau}^{T}
                \left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]
                \left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]^{\top} \\
            \mathbf{C}_{01} &:= \frac{1}{T-\tau}\sum_{t=0}^{T-\tau}
                \left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]
                \left[\boldsymbol{\chi}(t+\tau)-\boldsymbol{\mu}_{1}\right]^{\top}

        The Koopman matrix is then computed as follows:

        .. math:: \mathbf{K} = \mathbf{C}_{00}^{-1}\mathbf{C}_{01}

        It can be shown [1]_ that the leading singular functions of the half-weighted Koopman
        matrix

        .. math:: \bar{\mathbf{K}} := \mathbf{C}_{00}^{-\frac{1}{2}}\mathbf{C}_{01}\mathbf{C}_{11}^{-\frac{1}{2}}

        encode the best reduced dynamical model for the time series. The singular functions
        can be computed by first performing the singular value decomposition

        .. math:: \bar{\mathbf{K}} = \mathbf{U}^{\prime}\mathbf{S}\mathbf{V}^{\prime\top}

        and then mapping the input conformation to the left singular functions
        :math:`\boldsymbol{\psi}` and right singular functions :math:`\boldsymbol{\phi}`
        as follows:

        .. math::

            \boldsymbol{\psi}(t) &:= \mathbf{U}^{\prime\top}\mathbf{C}_{00}^{-\frac{1}{2}}
                \left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right] \\
            \boldsymbol{\phi}(t) &:= \mathbf{V}^{\prime\top}\mathbf{C}_{11}^{-\frac{1}{2}}
                \left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]

        References
        ----------
        .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes
           from time series data. arXiv:1707.04659v1
        .. [2] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular
           dynamics simulation. J. Chem. Theory. Comput. doi:10.1021/acs.jctc.5b00553
        .. [3] Chan, T. F., Golub G. H., LeVeque R. J. 1979. Updating formulae and pairwise
           algorithms for computing sample variances. Technical Report STAN-CS-79-773,
           Department of Computer Science, Stanford University.
        """
        self.dim = dim
        self.scaling = scaling
        self.right = right
        self.epsilon = epsilon
        self.ncov = ncov
        self._covar = OnlineCovariance(lagtime=lagtime, compute_c00=True, compute_c0t=True,
                                       compute_ctt=True, remove_data_mean=True, reversible=False,
                                       bessels_correction=False, ncov=self.ncov)
        self.lagtime = lagtime
        super(VAMP, self).__init__()

    def fit(self, data, **kw):
        self._model = VAMPModel(dim=self.dim, epsilon=self.epsilon, scaling=self.scaling,
                                right=self.right)
        self._covar.fit(data, **kw)
        self.fetch_model()
        return self

    def partial_fit(self, X):
        """ Incrementally update the covariances and mean.

        Parameters
        ----------
        X : array, list of arrays, PyEMMA reader
            input data.

        Notes
        -----
        The projection matrix is only calculated upon its first access.
        """
        if self._model is None:
            self._model = VAMPModel(dim=self.dim, epsilon=self.epsilon, scaling=self.scaling,
                                    right=self.right)
        self._covar.partial_fit(X)
        return self

    def fetch_model(self) -> VAMPModel:
        covar_model = self._covar.fetch_model()
        self._model.cov_00 = covar_model.cov_00
        self._model.cov_0t = covar_model.cov_0t
        self._model.cov_tt = covar_model.cov_tt
        self._model.mean_0 = covar_model.mean_0
        self._model.mean_t = covar_model.mean_t
        self._model._diagonalize()
        return self._model

    @property
    def lagtime(self):
        return self._covar.lagtime

    @lagtime.setter
    def lagtime(self, value):
        self._covar.lagtime = value