class TICA(StreamingEstimationTransformer):
    r""" Time-lagged independent component analysis (TICA)"""

    def __init__(self, lag, dim=-1, var_cutoff=0.95, kinetic_map=True, commute_map=False, epsilon=1e-6,
                 stride=1, skip=0, reversible=True, weights=None, ncov_max=float('inf')):
        r""" Time-lagged independent component analysis (TICA) [1]_, [2]_, [3]_.

        Parameters
        ----------
        lag : int
            lag time
        dim : int, optional, default -1
            Maximum number of significant independent components to use to reduce dimension of input data. -1 means
            all numerically available dimensions (see epsilon) will be used unless reduced by var_cutoff.
            Setting dim to a positive value is exclusive with var_cutoff.
        var_cutoff : float in the range [0, 1], optional, default 0.95
            Determines the number of output dimensions by including dimensions until their cumulative kinetic
            variance exceeds the fraction subspace_variance. var_cutoff=1.0 means all numerically available
            dimensions (see epsilon) will be used, unless set by dim. Setting var_cutoff smaller than 1.0 is
            exclusive with dim.
        kinetic_map : bool, optional, default True
            Eigenvectors will be scaled by eigenvalues. As a result, Euclidean distances in the transformed data
            approximate kinetic distances [4]_. This is a good choice when the data is further processed by
            clustering.
        commute_map : bool, optional, default False
            Eigenvector_i will be scaled by sqrt(timescale_i / 2). As a result, Euclidean distances in the
            transformed data will approximate commute distances [5]_.
        epsilon : float
            eigenvalue norm cutoff. Eigenvalues of C_0 with norms <= epsilon will be cut off. The remaining number
            of eigenvalues define the size of the output.
        stride : int, optional, default 1
            Use only every stride-th time step. By default, every time step is used.
        skip : int, optional, default 0
            skip the first initial n frames per trajectory.
        reversible : bool, optional, default True
            symmetrize correlation matrices C_0, C_{tau}. At the moment, setting reversible=False is not
            implemented.
        weights : object, optional, default None
            An object that allows to compute re-weighting factors to estimate equilibrium means and correlations
            from off-equilibrium data. The only requirement is that weights possesses a method weights(X), that
            accepts a trajectory X (np.ndarray(T, n)) and returns a vector of re-weighting factors (np.ndarray(T,)).

        Notes
        -----
        Given a sequence of multivariate data :math:`X_t`, computes the mean-free covariance and time-lagged
        covariance matrix:

        .. math::

            C_0 &=      (X_t - \mu)^T (X_t - \mu) \\
            C_{\tau} &= (X_t - \mu)^T (X_{t + \tau} - \mu)

        and solves the eigenvalue problem

        .. math:: C_{\tau} r_i = C_0 \lambda_i(\tau) r_i,

        where :math:`r_i` are the independent components and :math:`\lambda_i(\tau)` are their respective
        normalized time-autocorrelations. The eigenvalues are related to the relaxation timescale by

        .. math:: t_i(\tau) = -\tau / \ln |\lambda_i|.

        When used as a dimension reduction method, the input data is projected onto the dominant independent
        components.

        References
        ----------
        .. [1] Perez-Hernandez G, F Paul, T Giorgino, G De Fabritiis and F Noe. 2013.
           Identification of slow molecular order parameters for Markov model construction.
           J. Chem. Phys. 139, 015102. doi:10.1063/1.4811489
        .. [2] Schwantes C, V S Pande. 2013.
           Improvements in Markov State Model Construction Reveal Many Non-Native Interactions in the Folding
           of NTL9. J. Chem. Theory. Comput. 9, 2000-2009. doi:10.1021/ct300878a
        .. [3] L. Molgedey and H. G. Schuster. 1994.
           Separation of a mixture of independent signals using time delayed correlations.
           Phys. Rev. Lett. 72, 3634.
        .. [4] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular dynamics simulation.
           J. Chem. Theory. Comput. doi:10.1021/acs.jctc.5b00553
        .. [5] Noe, F., Banisch, R., Clementi, C. 2016. Commute maps: separating slowly-mixing molecular
           configurations for kinetic modeling. J. Chem. Theory. Comput. doi:10.1021/acs.jctc.6b00762

        """
        default_var_cutoff = get_default_args(self.__init__)['var_cutoff']
        if dim != -1 and var_cutoff != default_var_cutoff:
            raise ValueError('Trying to set both the number of dimensions and the subspace variance. '
                             'Use one or the other.')
        if kinetic_map and commute_map:
            raise ValueError('Trying to use both kinetic_map and commute_map. Use one or the other.')
        if not reversible:
            raise NotImplementedError("reversible=False is currently not implemented.")
        # if (kinetic_map or commute_map) and not reversible:
        #     raise NotImplementedError('kinetic_map and commute_map are not yet implemented '
        #                               'for irreversible processes.')
        super(TICA, self).__init__()

        if dim > -1:
            var_cutoff = 1.0

        self._covar = LaggedCovariance(c00=True, c0t=True, ctt=False, remove_data_mean=True, reversible=reversible,
                                       lag=lag, bessel=False, stride=stride, skip=skip, weights=weights,
                                       ncov_max=ncov_max)

        # empty dummy model instance
        self._model = TICAModel()
        self.set_params(lag=lag, dim=dim, var_cutoff=var_cutoff, kinetic_map=kinetic_map, commute_map=commute_map,
                        epsilon=epsilon, reversible=reversible, stride=stride, skip=skip, weights=weights,
                        ncov_max=ncov_max)

    @property
    def lag(self):
        r""" lag time of correlation matrix :math:`C_{\tau}` """
        return self._lag

    @lag.setter
    def lag(self, new_tau):
        self._lag = new_tau

    def describe(self):
        try:
            dim = self.dimension()
        except AttributeError:
            dim = self.dim
        return "[TICA, lag = %i; max. output dim. = %i]" % (self._lag, dim)

    def dimension(self):
        """ output dimension """
        if self.dim > -1:
            return self.dim
        d = None
        if self.dim != -1 and not self._estimated:  # fixed parametrization
            d = self.dim
        elif self._estimated:  # parametrization finished. Dimension is known
            dim = len(self.eigenvalues)
            if self.var_cutoff < 1.0:  # if subspace_variance, reduce the output dimension if needed
                dim = min(dim, np.searchsorted(self.cumvar, self.var_cutoff) + 1)
            d = dim
        elif self.var_cutoff == 1.0:  # We only know that all dimensions are wanted, so return input dim
            d = self.data_producer.dimension()
        else:  # We know nothing. Give up
            raise RuntimeError('Requested dimension, but the dimension depends on the cumulative variance and the '
                               'transformer has not yet been estimated. Call estimate() before.')
        return d

    @property
    def mean(self):
        """ mean of input features """
        return self._model.mean

    @property
    @deprecated('please use the "mean" property')
    def mu(self):
        """DEPRECATED: please use the "mean" property"""
        return self.mean

    @mean.setter
    def mean(self, value):
        self._model.mean = value

    def estimate(self, X, **kwargs):
        r""" Chunk-based parameterization of TICA.

        Iterates over all data and estimates the mean, covariance and time-lagged covariance. Finally, the
        generalized eigenvalue problem is solved to determine the independent components.
        """
        return super(TICA, self).estimate(X, **kwargs)

    def partial_fit(self, X):
        """ Incrementally update the covariances and mean.

        Parameters
        ----------
        X : array, list of arrays, PyEMMA reader
            input data.

        Notes
        -----
        The projection matrix is first being calculated upon its first access.
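
        Examples
        --------
        Illustrative sketch of incremental estimation; `chunks` is a hypothetical
        iterable of np.ndarray(T, n) pieces of the data, not part of this API::

            tica = TICA(lag=10)
            for X in chunks:
                tica.partial_fit(X)
            eigenvalues = tica.eigenvalues  # first access triggers the diagonalization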
""" from pyemma.coordinates import source iterable = source(X) indim = iterable.dimension() if not self.dim <= indim: raise RuntimeError( "requested more output dimensions (%i) than dimension" " of input data (%i)" % (self.dim, indim)) self._covar.partial_fit(iterable) self._model.update_model_params( mean=self._covar.mean, # TODO: inefficient, fixme cov=self._covar.cov, cov_tau=self._covar.cov_tau) self._used_data = self._covar._used_data self._estimated = False return self def _estimate(self, iterable, **kw): indim = iterable.dimension() if not self.dim <= indim: raise RuntimeError( "requested more output dimensions (%i) than dimension" " of input data (%i)" % (self.dim, indim)) if self._logger_is_active(self._loglevel_DEBUG): self._logger.debug( "Running TICA with tau=%i; Estimating two covariance matrices" " with dimension (%i, %i)" % (self._lag, indim, indim)) self._covar.estimate(iterable, **kw) self._model.update_model_params(mean=self._covar.mean, cov=self._covar.cov, cov_tau=self._covar.cov_tau) self._diagonalize() return self._model def _transform_array(self, X): r"""Projects the data onto the dominant independent components. Parameters ---------- X : ndarray(n, m) the input data Returns ------- Y : ndarray(n,) the projected data """ X_meanfree = X - self.mean Y = np.dot(X_meanfree, self.eigenvectors[:, 0:self.dimension()]) return Y.astype(self.output_type()) def _diagonalize(self): # diagonalize with low rank approximation self._logger.debug("diagonalize Cov and Cov_tau.") try: eigenvalues, eigenvectors = eig_corr(self._covar.cov, self._covar.cov_tau, self.epsilon, sign_maxelement=True) except ZeroRankError: raise ZeroRankError( 'All input features are constant in all time steps. No dimension would be left after dimension reduction.' ) if self.kinetic_map and self.commute_map: raise ValueError( 'Trying to use both kinetic_map and commute_map. Use either or.' ) if self.kinetic_map: # scale by eigenvalues eigenvectors *= eigenvalues[None, :] if self.commute_map: # scale by (regularized) timescales timescales = 1 - self.lag / np.log(np.abs(eigenvalues)) # dampen timescales smaller than the lag time, as in section 2.5 of ref. [5] regularized_timescales = 0.5 * timescales * np.maximum( np.tanh(np.pi * ((timescales - self.lag) / self.lag) + 1), 0) eigenvectors *= np.sqrt(regularized_timescales / 2) self._logger.debug("finished diagonalisation.") # compute cumulative variance cumvar = np.cumsum(np.abs(eigenvalues)**2) cumvar /= cumvar[-1] self._model.update_model_params(cumvar=cumvar, eigenvalues=eigenvalues, eigenvectors=eigenvectors) self._estimated = True @property @_lazy_estimation def timescales(self): r"""Implied timescales of the TICA transformation For each :math:`i`-th eigenvalue, this returns .. math:: t_i = -\frac{\tau}{\log(|\lambda_i|)} where :math:`\tau` is the :py:obj:`lag` of the TICA object and :math:`\lambda_i` is the `i`-th :py:obj:`eigenvalue <eigenvalues>` of the TICA object. Returns ------- timescales: 1D np.array numpy array with the implied timescales. In principle, one should expect as many timescales as input coordinates were available. 
            However, fewer eigenvalues will be returned if the TICA matrices were not full rank or
            :py:obj:`var_cutoff` was set.
        """
        return -self.lag / np.log(np.abs(self.eigenvalues))

    @property
    @_lazy_estimation
    def feature_TIC_correlation(self):
        r"""Instantaneous correlation matrix between mean-free input features and TICs

        Denoting the input features as :math:`X_i` and the TICs as :math:`\theta_j`, the instantaneous, linear
        correlation between them can be written as

        .. math::

            \mathbf{Corr}(X_i - \mu_i, \mathbf{\theta}_j) =
            \frac{1}{\sigma_{X_i - \mu_i}} \sum_l \sigma_{(X_i - \mu_i)(X_l - \mu_l)} \mathbf{U}_{li}

        The matrix :math:`\mathbf{U}` is the matrix containing, as column vectors, the eigenvectors of the TICA
        generalized eigenvalue problem.

        Returns
        -------
        feature_TIC_correlation : ndarray(n, m)
            correlation matrix between input features and TICs. There is a row for each feature and a column
            for each TIC.
        """
        feature_sigma = np.sqrt(np.diag(self.cov))
        return np.dot(self.cov, self.eigenvectors[:, :self.dimension()]) / feature_sigma[:, np.newaxis]

    @property
    def cov(self):
        """ covariance matrix of input data. """
        return self._model.cov

    @cov.setter
    def cov(self, value):
        self._model.cov = value

    @property
    def cov_tau(self):
        """ covariance matrix of time-lagged input data. """
        return self._model.cov_tau

    @cov_tau.setter
    def cov_tau(self, value):
        self._model.cov_tau = value

    @property
    @_lazy_estimation
    def eigenvalues(self):
        r"""Eigenvalues of the TICA problem (usually denoted :math:`\lambda`)

        Returns
        -------
        eigenvalues : 1D np.array
        """
        return self._model.eigenvalues

    @property
    @_lazy_estimation
    def eigenvectors(self):
        r"""Eigenvectors of the TICA problem, columnwise

        Returns
        -------
        eigenvectors : (N, M) ndarray
        """
        return self._model.eigenvectors

    @property
    @_lazy_estimation
    def cumvar(self):
        r"""Cumulative sum of the squared and normalized TICA eigenvalues

        Returns
        -------
        cumvar : 1D np.array
        """
        return self._model.cumvar

    def output_type(self):
        # TODO: handle the case of conjugate pairs
        if np.all(np.isreal(self.eigenvectors[:, 0:self.dimension()])) or \
                np.allclose(np.imag(self.eigenvectors[:, 0:self.dimension()]), 0):
            return super(TICA, self).output_type()
        else:
            return np.complex64
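

# Illustrative usage sketch, not part of the pyemma API: estimate TICA on
# synthetic data and project it. `_example_tica_usage` and `trajs` are
# hypothetical names introduced here for demonstration only.
def _example_tica_usage():
    import numpy as np
    trajs = [np.random.randn(1000, 10) for _ in range(3)]  # three (T, n) trajectories
    tica = TICA(lag=10, var_cutoff=0.95, kinetic_map=True)
    tica.estimate(trajs)                    # chunk-based parameterization
    Y = [tica.transform(X) for X in trajs]  # projection onto the dominant ICs
    return Y, tica.timescales               # t_i = -lag / ln|lambda_i|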
class _KoopmanEstimator(StreamingEstimator):
    '''only for computing u
       The user-accessible way for computing K is TICA()
    '''

    def __init__(self, lag, epsilon=1e-6, stride=1, skip=0, chunksize=None, ncov_max=float('inf')):
        super(_KoopmanEstimator, self).__init__(chunksize=chunksize)

        self._covar = LaggedCovariance(c00=True, c0t=True, remove_data_mean=True, reversible=False,
                                       lag=lag, bessel=False, stride=stride, skip=skip, ncov_max=ncov_max)

        self.set_params(lag=lag, epsilon=epsilon, stride=stride, skip=skip, ncov_max=ncov_max)

        self._estimation_finished = False

    def partial_fit(self, X):
        from pyemma.coordinates import source
        self._covar.partial_fit(source(X))
        self._estimation_finished = False
        self._estimated = True
        return self

    def _finish_estimation(self):
        R = spd_inv_split(self._covar.cov, epsilon=self.epsilon, canonical_signs=True)
        # set the new correlation matrix
        M = R.shape[1]
        K = np.dot(R.T, np.dot(self._covar.cov_tau, R))
        K = np.vstack((K, np.dot((self._covar.mean_tau - self._covar.mean), R)))
        ex1 = np.zeros((M + 1, 1))
        ex1[M, 0] = 1.0
        self._K = np.hstack((K, ex1))
        self._R = R

        self._estimation_finished = True
        self._estimated = True

    def _estimate(self, iterable, **kwargs):
        self._covar.estimate(iterable, **kwargs)
        self._finish_estimation()
        return self

    @property
    def K_pc_1(self):
        'Koopman operator on the modified basis (PC|1)'
        self._check_estimated()
        if not self._estimation_finished:
            self._finish_estimation()
        return self._K

    @property
    def u_pc_1(self):
        'weights in the modified basis'
        self._check_estimated()
        return _compute_u(self.K_pc_1)

    @property
    def u(self):
        'weights in the input basis'
        self._check_estimated()
        u_mod = self.u_pc_1
        N = self._R.shape[0]
        u_input = np.zeros(N + 1)
        u_input[0:N] = self._R.dot(u_mod[0:-1])  # in input basis
        u_input[N] = u_mod[-1] - self.mean.dot(self._R.dot(u_mod[0:-1]))
        return u_input

    @property
    def weights(self):
        'weights in the input basis (encapsulated in an object)'
        self._check_estimated()
        u_input = self.u
        return _KoopmanWeights(u_input[0:-1], u_input[-1])

    @property
    def R(self):
        'weighting transformation'
        self._check_estimated()
        if not self._estimation_finished:
            self._finish_estimation()
        return self._R

    @property
    def mean(self):
        self._check_estimated()
        return self._covar.mean
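

# Illustrative sketch, not part of the pyemma API: use _KoopmanEstimator to
# obtain Koopman re-weighting factors for TICA's `weights` parameter (the
# returned object exposes the required `weights(X)` method). The helper name
# and data below are hypothetical.
def _example_koopman_reweighted_tica():
    import numpy as np
    trajs = [np.random.randn(2000, 5) for _ in range(2)]  # synthetic (T, n) data
    koop = _KoopmanEstimator(lag=10)
    koop.estimate(trajs)
    tica = TICA(lag=10, weights=koop.weights)  # equilibrium-reweighted TICA
    tica.estimate(trajs)
    return tica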
class NystroemTICA(TICABase, SerializableMixIn):
    r""" Sparse sampling implementation of time-lagged independent component analysis (TICA)"""
    __serialize_version = 0
    __serialize_fields = ()

    def __init__(self, lag, max_columns,
                 dim=-1, var_cutoff=TICABase._DEFAULT_VARIANCE_CUTOFF, epsilon=1e-6,
                 stride=1, skip=0, reversible=True, ncov_max=float('inf'),
                 initial_columns=None, nsel=1, selection_strategy='spectral-oasis', neig=None):
        r""" Sparse sampling implementation [1]_ of time-lagged independent component analysis (TICA)
        [2]_, [3]_, [4]_.

        Parameters
        ----------
        lag : int
            lag time
        max_columns : int
            Maximum number of columns (features) to use in the approximation.
        dim : int, optional, default -1
            Maximum number of significant independent components to use to reduce dimension of input data. -1 means
            all numerically available dimensions (see epsilon) will be used unless reduced by var_cutoff.
            Setting dim to a positive value is exclusive with var_cutoff.
        var_cutoff : float in the range [0, 1], optional, default 0.95
            Determines the number of output dimensions by including dimensions until their cumulative kinetic
            variance exceeds the fraction subspace_variance. var_cutoff=1.0 means all numerically available
            dimensions (see epsilon) will be used, unless set by dim. Setting var_cutoff smaller than 1.0 is
            exclusive with dim.
        epsilon : float, optional, default 1e-6
            Eigenvalue norm cutoff. Eigenvalues of :math:`C_0` with norms <= epsilon will be cut off.
            The remaining number of eigenvalues define the size of the output.
        stride : int, optional, default 1
            Use only every stride-th time step. By default, every time step is used.
        skip : int, optional, default 0
            Skip the first initial n frames per trajectory.
        reversible : bool, optional, default True
            Symmetrize correlation matrices :math:`C_0`, :math:`C_{\tau}`.
        initial_columns : list, ndarray(k, dtype=int), int, or None, optional, default None
            Columns used for an initial approximation. If a list or a 1-d ndarray of integers is given,
            use these column indices. If an integer is given, use that number of randomly selected indices.
            If None is given, use one randomly selected column.
        nsel : int, optional, default 1
            Number of columns to select and add per iteration and pass through the data.
            Larger values provide for better pass-efficiency.
        selection_strategy : str, optional, default 'spectral-oasis'
            Strategy to use for selecting new columns for the approximation.
            Can be 'random', 'oasis' or 'spectral-oasis'.
        neig : int or None, optional, default None
            Number of eigenvalues to be optimized by the selection process.
            If None, use the whole available eigenspace.

        Notes
        -----
        Performs a sparse approximation of time-lagged independent component analysis (TICA)
        :class:`TICA <pyemma.coordinates.transform.TICA>`. The starting point is the generalized
        eigenvalue problem

        .. math:: C_{\tau} r_i = C_0 \lambda_i(\tau) r_i.

        Instead of computing the full matrices involved in this problem, we conduct a Nyström approximation [5]_
        of the matrix :math:`C_0` by means of the accelerated sequential incoherence selection (oASIS)
        algorithm [6]_ and, in particular, its extension called spectral oASIS [1]_.

        Iteratively, we select a small number of columns such that the resulting Nyström approximation is
        sufficiently accurate. This selection represents in turn a subset of important features, for which we
        obtain a generalized eigenvalue problem similar to the one above, but much smaller in size.
        Its generalized eigenvalues and eigenvectors provide an approximation to those of the full TICA
        solution [1]_.

        References
        ----------
        .. [1] F. Litzinger, L. Boninsegna, H. Wu, F. Nüske, R. Patel, R. Baraniuk, F. Noé, and C. Clementi.
           Rapid calculation of molecular kinetics using compressed sensing (2018). (submitted)
        .. [2] Perez-Hernandez G, F Paul, T Giorgino, G De Fabritiis and F Noe. 2013.
           Identification of slow molecular order parameters for Markov model construction.
           J. Chem. Phys. 139, 015102. doi:10.1063/1.4811489
        .. [3] Schwantes C, V S Pande. 2013.
           Improvements in Markov State Model Construction Reveal Many Non-Native Interactions in the Folding
           of NTL9. J. Chem. Theory. Comput. 9, 2000-2009. doi:10.1021/ct300878a
        .. [4] L. Molgedey and H. G. Schuster. 1994.
           Separation of a mixture of independent signals using time delayed correlations.
           Phys. Rev. Lett. 72, 3634.
        .. [5] P. Drineas and M. W. Mahoney. On the Nystrom method for approximating a Gram matrix for improved
           kernel-based learning. Journal of Machine Learning Research, 6:2153-2175 (2005).
        .. [6] Raajen Patel, Thomas A. Goldstein, Eva L. Dyer, Azalia Mirhoseini, Richard G. Baraniuk.
           oASIS: Adaptive Column Sampling for Kernel Matrix Approximation. arXiv:1505.05208 [stat.ML].

        """
        super(NystroemTICA, self).__init__()

        self._covar = LaggedCovariance(c00=True, c0t=True, ctt=False, remove_data_mean=True, reversible=reversible,
                                       lag=lag, bessel=False, stride=stride, skip=skip, ncov_max=ncov_max)
        self._diag = LaggedCovariance(c00=True, c0t=True, ctt=False, remove_data_mean=True, reversible=reversible,
                                      lag=lag, bessel=False, stride=stride, skip=skip, ncov_max=ncov_max,
                                      diag_only=True)
        self._oasis = None

        self.dim = dim
        self.var_cutoff = var_cutoff

        self.set_params(lag=lag, max_columns=max_columns, epsilon=epsilon, reversible=reversible,
                        stride=stride, skip=skip, ncov_max=ncov_max, initial_columns=initial_columns,
                        nsel=nsel, selection_strategy=selection_strategy, neig=neig)

    @property
    def model(self):
        if not hasattr(self, '_model') or self._model is None:
            self._model = NystroemTICAModel()
        return self._model

    @property
    def initial_columns(self):
        return self._initial_columns

    @initial_columns.setter
    def initial_columns(self, initial_columns):
        if not (initial_columns is None
                or isinstance(initial_columns, (int, FunctionType, np.ndarray))):
            raise ValueError('initial_columns has to be one of these types (None, int, function, ndarray), '
                             'but was {}'.format(type(initial_columns)))
        if initial_columns is None:
            initial_columns = 1
        if isinstance(initial_columns, int):
            i = initial_columns
            initial_columns = lambda N: np.random.choice(N, i, replace=False)
        if isinstance(initial_columns, np.ndarray):
            initial_columns = ensure_int_vector(initial_columns)
        self._initial_columns = initial_columns

    def describe(self):
        try:
            dim = self.dimension()
        except RuntimeError:
            dim = self.dim
        return "[NystroemTICA, lag = %i; max. columns = %i; max. output dim. = %i]" \
               % (self.lag, self.max_columns, dim)

    def estimate(self, X, **kwargs):
        r""" Chunk-based parameterization of NystroemTICA.

        Iterates over all data several times to select important columns and estimate the mean, covariance and
        time-lagged covariance. Finally, the small-scale generalized eigenvalue problem is solved to determine
        the approximate independent components.
""" return super(NystroemTICA, self).estimate(X, **kwargs) def _estimate(self, iterable, **kw): from pyemma.coordinates.data import DataInMemory if not isinstance(iterable, DataInMemory): self.logger.warning('Every iteration of the selection process involves streaming of all data and featurization. ' 'Depending on your setup, this might be inefficient.') indim = iterable.dimension() if not self.dim <= indim: raise RuntimeError("requested more output dimensions (%i) than dimension" " of input data (%i)" % (self.dim, indim)) if callable(self.initial_columns): self.initial_columns = self.initial_columns(indim) if not len(np.array(self.initial_columns).shape) == 1: raise ValueError('initial_columns must be either None, an integer, a list, or a 1-d numpy array.') self._diag.estimate(iterable, **kw) self._covar.column_selection = self.initial_columns self._covar.estimate(iterable, **kw) self.model.update_model_params(cov_tau=self._covar.C0t_) self._oasis = oASIS_Nystroem(self._diag.C00_, self._covar.C00_, self.initial_columns) self._oasis.set_selection_strategy(strategy=self.selection_strategy, nsel=self.nsel, neig=self.neig) while self._oasis.k < np.min((self.max_columns, self._oasis.n)): cols = self._oasis.select_columns() if cols is None or len(cols) == 0 or np.all(np.in1d(cols, self._oasis.column_indices)): self.logger.warning("Iteration ended prematurely: No more columns to select.") break self._covar.column_selection = cols self._covar.estimate(iterable, **kw) ix = self._oasis.add_columns(self._covar.C00_, cols) ix = np.in1d(cols, ix) if np.any(ix): added_columns = self._covar.C0t_[:, ix] self.model.update_model_params(cov_tau=np.concatenate((self._model.cov_tau, added_columns), axis=1)) self.model.update_model_params(mean=self._covar.mean, diag=self._diag.C00_, cov=self._oasis.Ck, column_indices=self._oasis.column_indices) self._diagonalize() return self.model def _diagonalize(self): # diagonalize with low rank approximation self.logger.debug("Diagonalize Cov and Cov_tau.") Wktau = self._model.cov_tau[self._model.column_indices, :] try: eigenvalues, eigenvectors = eig_corr(self._oasis.Wk, Wktau, self.epsilon, sign_maxelement=True) except ZeroRankError: raise ZeroRankError('All input features are constant in all time steps. ' 'No dimension would be left after dimension reduction.') self.logger.debug("Finished diagonalization.") # compute cumulative variance cumvar = np.cumsum(np.abs(eigenvalues) ** 2) cumvar /= cumvar[-1] self._model.update_model_params(cumvar=cumvar, eigenvalues=eigenvalues, eigenvectors=eigenvectors) self._estimated = True @property def column_indices(self): """ Indices of columns used in the approximation. """ return self.model.column_indices
class VAMP(StreamingEstimationTransformer, SerializableMixIn):
    r"""Variational approach for Markov processes (VAMP)"""

    __serialize_version = 0
    __serialize_fields = []

    def describe(self):
        return "[VAMP, lag = %i; max. output dim. = %s]" % (self._lag, str(self.dim))

    def __init__(self, lag, dim=None, scaling=None, right=True, epsilon=1e-6,
                 stride=1, skip=0, ncov_max=float('inf')):
        r""" Variational approach for Markov processes (VAMP) [1]_.

        Parameters
        ----------
        lag : int
            lag time
        dim : float or int, default=None
            Number of dimensions to keep:

            * if dim is not set (None) all available ranks are kept:
              `n_components == min(n_samples, n_features)`
            * if dim is an integer >= 1, this number specifies the number of dimensions to keep.
            * if dim is a float with ``0 < dim < 1``, select the number of dimensions such that the amount of
              kinetic variance that needs to be explained is greater than the percentage specified by dim.
        scaling : None or string
            Scaling to be applied to the VAMP order parameters upon transformation

            * None: no scaling will be applied, variance of the order parameters is 1
            * 'kinetic map' or 'km': order parameters are scaled by singular values.
              Only the left singular functions induce a kinetic map.
              Therefore scaling='km' is only effective if `right` is False.
        right : boolean
            Whether to compute the right singular functions. If `right==True`, `get_output()` will return the
            right singular functions. Otherwise, `get_output()` will return the left singular functions.
            Beware that only `frames[tau:, :]` of each trajectory returned by `get_output()` contain valid
            values of the right singular functions. Conversely, only `frames[0:-tau, :]` of each trajectory
            returned by `get_output()` contain valid values of the left singular functions. The remaining
            frames might possibly be interpreted as some extrapolation.
        epsilon : float
            singular value cutoff. Singular values of :math:`C_{00}` with norms <= epsilon will be cut off.
            The remaining number of singular values define the size of the output.
        stride : int, optional, default = 1
            Use only every stride-th time step. By default, every time step is used.
        skip : int, default=0
            skip the first initial n frames per trajectory.
        ncov_max : int, default=infinity
            limit the memory usage of the algorithm from [3]_ to an amount that corresponds to ncov_max
            additional copies of each correlation matrix.

        Notes
        -----
        VAMP is a method for dimensionality reduction of Markov processes.

        The Koopman operator :math:`\mathcal{K}` is an integral operator that describes conditional future
        expectation values. Let :math:`p(\mathbf{x},\,\mathbf{y})` be the conditional probability density of
        visiting an infinitesimal phase space volume around point :math:`\mathbf{y}` at time :math:`t+\tau`,
        given that the phase space point :math:`\mathbf{x}` was visited at the earlier time :math:`t`. Then the
        action of the Koopman operator on a function :math:`f` can be written as follows:

        .. math::

            \mathcal{K}f = \int p(\mathbf{x},\,\mathbf{y}) f(\mathbf{y})\,\mathrm{dy}
                         = \mathbb{E}\left[f(\mathbf{x}_{t+\tau}) \mid \mathbf{x}_{t}=\mathbf{x}\right]

        The Koopman operator is defined without any reference to an equilibrium distribution. Therefore it is
        well-defined in situations where the dynamics is irreversible and/or non-stationary, such that no
        equilibrium distribution exists.
        If we approximate :math:`f` by a linear superposition of ansatz functions :math:`\boldsymbol{\chi}` of
        the conformational degrees of freedom (features), the operator :math:`\mathcal{K}` can be approximated
        by a (finite-dimensional) matrix :math:`\mathbf{K}`.

        The approximation is computed as follows: From the time-dependent input features
        :math:`\boldsymbol{\chi}(t)`, we compute the mean :math:`\boldsymbol{\mu}_{0}`
        (:math:`\boldsymbol{\mu}_{1}`) from all data excluding the last (first) :math:`\tau` steps of every
        trajectory as follows:

        .. math::

            \boldsymbol{\mu}_{0} &:= \frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\boldsymbol{\chi}(t) \\
            \boldsymbol{\mu}_{1} &:= \frac{1}{T-\tau}\sum_{t=\tau}^{T}\boldsymbol{\chi}(t)

        Next, we compute the instantaneous covariance matrices :math:`\mathbf{C}_{00}` and
        :math:`\mathbf{C}_{11}` and the time-lagged covariance matrix :math:`\mathbf{C}_{01}` as follows:

        .. math::

            \mathbf{C}_{00} &:= \frac{1}{T-\tau}\sum_{t=0}^{T-\tau}
                \left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]
                \left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]^{\top} \\
            \mathbf{C}_{11} &:= \frac{1}{T-\tau}\sum_{t=\tau}^{T}
                \left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]
                \left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]^{\top} \\
            \mathbf{C}_{01} &:= \frac{1}{T-\tau}\sum_{t=0}^{T-\tau}
                \left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]
                \left[\boldsymbol{\chi}(t+\tau)-\boldsymbol{\mu}_{1}\right]^{\top}

        The Koopman matrix is then computed as follows:

        .. math:: \mathbf{K}=\mathbf{C}_{00}^{-1}\mathbf{C}_{01}

        It can be shown [1]_ that the leading singular functions of the half-weighted Koopman matrix

        .. math:: \bar{\mathbf{K}}:=\mathbf{C}_{00}^{-\frac{1}{2}}\mathbf{C}_{01}\mathbf{C}_{11}^{-\frac{1}{2}}

        encode the best reduced dynamical model for the time series.

        The singular functions can be computed by first performing the singular value decomposition

        .. math:: \bar{\mathbf{K}}=\mathbf{U}^{\prime}\mathbf{S}\mathbf{V}^{\prime\top}

        and then mapping the input conformation to the left singular functions :math:`\boldsymbol{\psi}` and
        right singular functions :math:`\boldsymbol{\phi}` as follows:

        .. math::

            \boldsymbol{\psi}(t) &:= \mathbf{U}^{\prime\top}\mathbf{C}_{00}^{-\frac{1}{2}}
                \left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right] \\
            \boldsymbol{\phi}(t) &:= \mathbf{V}^{\prime\top}\mathbf{C}_{11}^{-\frac{1}{2}}
                \left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]

        References
        ----------
        .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series
           data. arXiv:1707.04659v1
        .. [2] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular dynamics
           simulation. J. Chem. Theory. Comput. doi:10.1021/acs.jctc.5b00553
        .. [3] Chan, T. F., Golub G. H., LeVeque R. J. 1979. Updating formulae and pairwise algorithms for
           computing sample variances. Technical Report STAN-CS-79-773, Department of Computer Science,
           Stanford University.
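
        Examples
        --------
        Minimal, illustrative sketch; the synthetic data below is a placeholder::

            import numpy as np
            trajs = [np.random.randn(1000, 10) for _ in range(3)]
            vamp = VAMP(lag=10, dim=0.95)
            vamp.estimate(trajs)
            psi = vamp.get_output()  # right singular functions (right=True by default)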
""" StreamingEstimationTransformer.__init__(self) # empty dummy model instance self._model = VAMPModel() self.set_params(lag=lag, dim=dim, scaling=scaling, right=right, epsilon=epsilon, stride=stride, skip=skip, ncov_max=ncov_max) self._covar = None self._model.update_model_params(dim=dim, epsilon=epsilon, scaling=scaling) def _estimate(self, iterable, **kw): self._covar = LaggedCovariance(c00=True, c0t=True, ctt=True, remove_data_mean=True, reversible=False, lag=self.lag, bessel=False, stride=self.stride, skip=self.skip, weights=None, ncov_max=self.ncov_max) indim = iterable.dimension() if isinstance(self.dim, int) and not self.dim <= indim: raise RuntimeError( "requested more output dimensions (%i) than dimension" " of input data (%i)" % (self.dim, indim)) if self._logger_is_active(self._loglevel_DEBUG): self.logger.debug( "Running VAMP with tau=%i; Estimating two covariance matrices" " with dimension (%i, %i)" % (self._lag, indim, indim)) self._covar.estimate(iterable, **kw) self._model.update_model_params(mean_0=self._covar.mean, mean_t=self._covar.mean_tau, C00=self._covar.C00_, C0t=self._covar.C0t_, Ctt=self._covar.Ctt_) self.model._diagonalize() # if the previous estimation was a partial_fit, we might have a running covar object, which we can safely omit now. if '_covar' in self.__serialize_fields: self.__serialize_fields.remove('_covar') return self._model def partial_fit(self, X): """ incrementally update the covariances and mean. Parameters ---------- X: array, list of arrays, PyEMMA reader input data. Notes ----- The projection matrix is first being calculated upon its first access. """ from pyemma.coordinates import source iterable = source(X) if isinstance(self.dim, int): indim = iterable.dimension() if not self.dim <= indim: raise RuntimeError( "requested more output dimensions (%i) than dimension" " of input data (%i)" % (self.dim, indim)) if self._covar is None: self._covar = LaggedCovariance(c00=True, c0t=True, ctt=True, remove_data_mean=True, reversible=False, lag=self.lag, bessel=False, stride=self.stride, skip=self.skip, weights=None, ncov_max=self.ncov_max) self._covar.partial_fit(iterable) self._model.update_model_params( mean_0=self._covar.mean, # TODO: inefficient, fixme mean_t=self._covar.mean_tau, C00=self._covar.C00_, C0t=self._covar.C0t_, Ctt=self._covar.Ctt_) self._estimated = False if '_covar' not in self.__serialize_fields: self.__serialize_fields.append('_covar') return self def dimension(self): return self._model.dimension() def _transform_array(self, X): r"""Projects the data onto the dominant singular functions. Parameters ---------- X : ndarray(n, m) the input data Returns ------- Y : ndarray(n,) the projected data If `self.right` is True, projection will be on the right singular functions. Otherwise, projection will be on the left singular functions. """ # TODO: in principle get_output should not return data for *all* frames! # TODO: implement our own iterators? This would also include random access to be complete... 
        if self.right:
            X_meanfree = X - self._model.mean_t
            Y = np.dot(X_meanfree, self._model.V[:, 0:self.dimension()])
        else:
            X_meanfree = X - self._model.mean_0
            Y = np.dot(X_meanfree, self._model.U[:, 0:self.dimension()])
        return Y.astype(self.output_type())

    @property
    def singular_values(self):
        r"""Singular values of the half-weighted Koopman matrix (usually denoted :math:`\sigma`)

        Returns
        -------
        singular_values : 1-D np.array
        """
        return self._model.singular_values

    @property
    def singular_vectors_right(self):
        r"""Transformation matrix that represents the linear map from feature space to the space of
        right singular functions.

        Notes
        -----
        Right "singular vectors" V of the VAMP problem (equation 13 in [1]_), columnwise

        Returns
        -------
        vectors : 2-D ndarray
            Coefficients that express the right singular functions in the basis of mean-free input features.

        References
        ----------
        .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series
           data. arXiv:1707.04659v1
        """
        return self._model.V

    @property
    def singular_vectors_left(self):
        r"""Transformation matrix that represents the linear map from feature space to the space of
        left singular functions.

        Notes
        -----
        Left "singular vectors" U of the VAMP problem (equation 13 in [1]_), columnwise

        Returns
        -------
        vectors : 2-D ndarray
            Coefficients that express the left singular functions in the basis of mean-free input features.

        References
        ----------
        .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series
           data. arXiv:1707.04659v1
        """
        return self._model.U

    @property
    def cumvar(self):
        r"""Cumulative sum of the squared and normalized singular values

        Returns
        -------
        cumvar : 1D np.array
        """
        return self._model.cumvar

    @property
    def show_progress(self):
        if self._covar is None:
            return False
        else:
            return self._covar.show_progress

    @show_progress.setter
    def show_progress(self, value):
        if self._covar is not None:
            self._covar.show_progress = value

    def expectation(self, observables, statistics, lag_multiple=1, observables_mean_free=False,
                    statistics_mean_free=False):
        r"""Compute future expectation of observable or covariance using the approximated Koopman operator.

        Parameters
        ----------
        observables : np.ndarray((input_dimension, n_observables))
            Coefficients that express one or multiple observables in the basis of the input features.
        statistics : np.ndarray((input_dimension, n_statistics)), optional
            Coefficients that express one or multiple statistics in the basis of the input features.
            This parameter can be None. In that case, this method returns the future expectation value
            of the observable(s).
        lag_multiple : int
            If > 1, extrapolate to a multiple of the estimator's lag time by assuming Markovianity of the
            approximated Koopman operator.
        observables_mean_free : bool, default=False
            If true, coefficients in `observables` refer to the input features with feature means removed.
            If false, coefficients in `observables` refer to the unmodified input features.
        statistics_mean_free : bool, default=False
            If true, coefficients in `statistics` refer to the input features with feature means removed.
            If false, coefficients in `statistics` refer to the unmodified input features.

        Notes
        -----
        A "future expectation" of an observable g is the average of g computed over a time window that has the
        same total length as the input data from which the Koopman operator was estimated but is shifted by
        lag_multiple*tau time steps into the future (where tau is the lag time). It is computed with the
        equation:
        .. math::

            \mathbb{E}[g]_{\rho_{n}}=\mathbf{q}^{T}\mathbf{P}^{n-1}\mathbf{e}_{1}

        where

        .. math::

            P_{ij}=\sigma_{i}\langle\psi_{i},\phi_{j}\rangle_{\rho_{1}}

        and

        .. math::

            q_{i}=\langle g,\phi_{i}\rangle_{\rho_{1}}

        and :math:`\mathbf{e}_{1}` is the first canonical unit vector.

        A model prediction of time-lagged covariances between the observable f and the statistic g at a
        lag-time of lag_multiple*tau is computed with the equation:

        .. math::

            \mathrm{cov}[g,\,f;n\tau]=\mathbf{q}^{T}\mathbf{P}^{n-1}\boldsymbol{\Sigma}\mathbf{r}

        where :math:`r_{i}=\langle\psi_{i},f\rangle_{\rho_{0}}` and
        :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\boldsymbol{\sigma})`.
        """
        return self._model.expectation(observables, statistics, lag_multiple=lag_multiple,
                                       statistics_mean_free=statistics_mean_free,
                                       observables_mean_free=observables_mean_free)

    def cktest(self, n_observables=None, observables='phi', statistics='psi', mlags=10, n_jobs=1,
               show_progress=True, iterable=None):
        r"""Do the Chapman-Kolmogorov test by computing predictions for higher lag times and by performing
        estimations at higher lag times.

        Notes
        -----
        This method computes two sets of time-lagged covariance matrices:

        * estimates at higher lag times:

          .. math::

              \left\langle \mathbf{K}(n\tau)g_{i},f_{j}\right\rangle_{\rho_{0}}

          where :math:`\rho_{0}` is the empirical distribution implicitly defined by all data points from time
          steps 0 to T-tau in all trajectories, :math:`\mathbf{K}(n\tau)` is a rank-reduced Koopman matrix
          estimated at the lag-time n*tau and g and f are some functions of the data. Rank-reduction of the
          Koopman matrix is controlled by the `dim` parameter of :func:`vamp <pyemma.coordinates.vamp>`.

        * predictions at higher lag times:

          .. math::

              \left\langle \mathbf{K}^{n}(\tau)g_{i},f_{j}\right\rangle_{\rho_{0}}

          where :math:`\mathbf{K}^{n}` is the n'th power of the rank-reduced Koopman matrix contained in self.

        The Chapman-Kolmogorov test is to compare the predictions to the estimates.

        Parameters
        ----------
        n_observables : int, optional, default=None
            Limit the number of default observables (and of default statistics) to this number.
            Only used if `observables` are None or `statistics` are None.
        observables : np.ndarray((input_dimension, n_observables)) or 'phi'
            Coefficients that express one or multiple observables :math:`g` in the basis of the input features.
            This parameter can be 'phi'. In that case, the dominant right singular functions of the Koopman
            operator estimated at the smallest lag time are used as default observables.
        statistics : np.ndarray((input_dimension, n_statistics)) or 'psi'
            Coefficients that express one or multiple statistics :math:`f` in the basis of the input features.
            This parameter can be 'psi'. In that case, the dominant left singular functions of the Koopman
            operator estimated at the smallest lag time are used as default statistics.
        mlags : int or int-array, default=10
            multiples of lag times for testing the Model, e.g. range(10).
            A single int will trigger a range, i.e. mlags=10 maps to mlags=range(10).
            Note that you need to be able to do a model prediction for each of these lag time multiples, e.g.
            the value 0 only makes sense if model.expectation(lag_multiple=0) will work.
        n_jobs : int, default=1
            how many jobs to use during calculation
        show_progress : bool, default=True
            Show progressbars for calculation?
        iterable : any data format that `pyemma.coordinates.vamp()` accepts as input, optional
            If `iterable` is None, the same data source with which VAMP was initialized will be used for all
            estimation.
            Otherwise, all estimates (not predictions) will be computed from the data contained in `iterable`.

        Returns
        -------
        vckv : :class:`VAMPChapmanKolmogorovValidator <pyemma.coordinates.transform.VAMPChapmanKolmogorovValidator>`
            Contains the estimated and the predicted covariance matrices. The object can be plotted with
            :func:`plot_cktest <pyemma.plots.plot_cktest>` with the option `y01=False`.
        """
        if n_observables is not None:
            if n_observables > self.dimension():
                warnings.warn('Selected singular functions as observables but dimension '
                              'is lower than requested number of observables.')
                n_observables = self.dimension()
        else:
            n_observables = self.dimension()

        if isinstance(observables, str) and observables == 'phi':
            observables = self.singular_vectors_right[:, 0:n_observables]
            observables_mean_free = True
        else:
            ensure_ndarray(observables, ndim=2)
            observables_mean_free = False

        if isinstance(statistics, str) and statistics == 'psi':
            statistics = self.singular_vectors_left[:, 0:n_observables]
            statistics_mean_free = True
        else:
            ensure_ndarray_or_None(statistics, ndim=2)
            statistics_mean_free = False

        ck = VAMPChapmanKolmogorovValidator(self.model, self, observables, statistics, observables_mean_free,
                                            statistics_mean_free, mlags=mlags, n_jobs=n_jobs,
                                            show_progress=show_progress)

        if iterable is None:
            iterable = self.data_producer

        ck.estimate(iterable)
        return ck

    def score(self, test_data=None, score_method='VAMP2'):
        """Compute the VAMP score for this model, or the cross-validation score between self and a second
        model estimated from different data.

        Parameters
        ----------
        test_data : any data format that `pyemma.coordinates.vamp()` accepts as input
            If `test_data` is not None, this method computes the cross-validation score between self and a
            VAMP model estimated from `test_data`. It is assumed that self was estimated from the "training"
            data and `test_data` is the test data. The score is computed for one realization of self and
            `test_data`. Estimation of the average cross-validation score and partitioning of data into test
            and training part is not performed by this method.

            If `test_data` is None, this method computes the VAMP score for the model contained in self.

            The model that is estimated from `test_data` will inherit all hyperparameters from self.
        score_method : str, optional, default='VAMP2'
            Available scores are based on the variational approach for Markov processes [1]_:

            * 'VAMP1'  Sum of singular values of the half-weighted Koopman matrix [1]_.
              If the model is reversible, this is equal to the sum of Koopman matrix eigenvalues,
              also called Rayleigh quotient [1]_.
            * 'VAMP2'  Sum of squared singular values of the half-weighted Koopman matrix [1]_.
              If the model is reversible, this is equal to the kinetic variance [2]_.
            * 'VAMPE'  Approximation error of the estimated Koopman operator with respect to the true Koopman
              operator up to an additive constant [1]_.

        Returns
        -------
        score : float
            If `test_data` is not None, returns the cross-validation VAMP score between self and the model
            estimated from `test_data`. Otherwise return the selected VAMP-score of self.

        References
        ----------
        .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series
           data. arXiv:1707.04659v1
        .. [2] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular dynamics
           simulation. J. Chem. Theory. Comput.
           doi:10.1021/acs.jctc.5b00553
        """
        from pyemma._ext.sklearn.base import clone as clone_estimator
        est = clone_estimator(self)
        if test_data is None:
            return self.model.score(None, score_method=score_method)
        else:
            est.estimate(test_data)
            return self.model.score(est.model, score_method=score_method)
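

# Illustrative usage sketch (hypothetical helper, not part of the pyemma API):
# estimate VAMP on training data and cross-validate on held-out data.
def _example_vamp_scoring():
    import numpy as np
    train = [np.random.randn(1000, 10) for _ in range(2)]  # synthetic (T, n) data
    test = [np.random.randn(1000, 10) for _ in range(2)]
    vamp = VAMP(lag=10, dim=0.95)
    vamp.estimate(train)
    # cross-validation VAMP-2 score: a second model with the same hyperparameters
    # is fitted to `test` and scored against this one
    return vamp.score(test_data=test, score_method='VAMP2')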