def bootstrap_counts(dtrajs, lagtime, corrlength=None):
    """Generate a randomly resampled count matrix from discrete trajectories.

    See API function for full documentation.

    Parameters
    ----------
    dtrajs : array-like or list of array-like
        One discrete trajectory or a list of them. Any argument that is not a
        ``list`` is treated as a single trajectory.
    lagtime : int
        Lag time used when sampling transition counts.
    corrlength : int, optional
        Correlation length; the number of sampled counts is
        ``int(total_length / corrlength)``. Defaults to ``lagtime``.

    Returns
    -------
    scipy.sparse.csr_matrix
        Resampled ``(n, n)`` count matrix, where ``n`` is obtained from
        ``number_of_states(dtrajs)``.

    Raises
    ------
    ValueError
        If ``lagtime`` is not shorter than the longest trajectory, or if the
        effective ``corrlength`` is not positive.
    """
    from scipy.stats import rv_discrete

    # if we have just one trajectory, put it into a one-element list
    if not isinstance(dtrajs, list):
        dtrajs = [dtrajs]
    ntraj = len(dtrajs)

    # can we do the estimate?
    # NOTE(review): determine_lengths is assumed to return a numpy array of
    # per-trajectory lengths (it is used in array arithmetic below) - confirm.
    lengths = determine_lengths(dtrajs)
    Lmax = np.max(lengths)
    Ltot = np.sum(lengths)
    if lagtime >= Lmax:
        raise ValueError('Cannot estimate count matrix: lag time ' + str(lagtime)
                         + ' is longer than the longest trajectory length ' + str(Lmax))

    # how many counts can we sample?
    if corrlength is None:
        corrlength = lagtime
    if corrlength <= 0:
        # a zero or negative correlation length would otherwise surface as an
        # opaque division / overflow error when computing the sample count
        raise ValueError('Cannot estimate count matrix: correlation length '
                         + str(corrlength) + ' must be positive')
    nsample = int(Ltot / corrlength)

    # determine number of states n
    from deeptime.markov import number_of_states
    n = number_of_states(dtrajs)

    # trajectory sampling weights: proportional to (length - lagtime), floored at 0
    w_trajs = np.maximum(0.0, lengths - lagtime)
    w_trajs /= np.sum(w_trajs)  # normalize to sum 1.0
    distrib_trajs = rv_discrete(values=(list(range(ntraj)), w_trajs))

    # sample number of counts from each trajectory
    n_from_traj = np.bincount(distrib_trajs.rvs(size=nsample), minlength=ntraj)

    # for each trajectory, sample counts and stack them;
    # index arrays are integer-typed so no float -> int cast is needed downstream
    rows = np.zeros((nsample,), dtype=int)
    cols = np.zeros((nsample,), dtype=int)
    ones = np.ones((nsample,))
    ncur = 0
    for i, n_i in enumerate(n_from_traj):
        if n_i > 0:
            (r, c) = bootstrap_counts_singletraj(dtrajs[i], lagtime, n_i)
            rows[ncur:ncur + n_i] = r
            cols[ncur:ncur + n_i] = c
            ncur += n_i

    # sum over counts: duplicate (row, col) entries are added up on conversion
    Csparse = scipy.sparse.coo_matrix((ones, (rows, cols)), shape=(n, n))

    return Csparse.tocsr()
def _split_sequences_multitraj(dtrajs, lag):
    """Group the conditional sequences of several trajectories by starting state.

    Each trajectory is split with ``_split_sequences_singletraj`` and the
    resulting sequences are collected per starting state.

    Parameters
    ----------
    dtrajs : list of int-iterables
        discrete trajectories
    lag : int
        lag time

    Returns
    -------
    list of list
        One inner list per state (``number_of_states(dtrajs)`` in total);
        entry ``s`` holds every sequence reported for starting state ``s``.
    """
    from deeptime.markov import number_of_states

    n_states = number_of_states(dtrajs)
    # one independent bucket per state
    by_start_state = [[] for _ in range(n_states)]
    for traj in dtrajs:
        start_states, sequences = _split_sequences_singletraj(traj, n_states, lag)
        for pos, state in enumerate(start_states):
            by_start_state[state].append(sequences[pos])
    return by_start_state
def __init__(self, dtrajs):
    """Store the discrete trajectories and their basic count statistics.

    Parameters
    ----------
    dtrajs : discrete trajectory or list of discrete trajectories
        Input that is normalized with ``ensure_dtraj_list``.

    Raises
    ------
    ValueError
        If any trajectory contains an element smaller than -1.
    """
    from pyemma.util.types import ensure_dtraj_list

    # discrete trajectories, normalized to list form
    self._dtrajs = ensure_dtraj_list(dtrajs)

    # TODO: extensive input checking!
    for traj in self._dtrajs:
        if np.any(traj < -1):
            raise ValueError('Discrete trajectory contains elements < -1.')

    ## basic count statistics
    # histogram (count_states is called with ignore_negative=True)
    histogram = count_states(self._dtrajs, ignore_negative=True)
    self._hist = histogram
    # total counts
    self._total_count = np.sum(histogram)
    # number of states
    self._nstates = number_of_states(dtrajs)

    # no lag-dependent counting has been done yet
    self._counted_at_lag = False
def fit(self, data, n_burn_in: int = 0, n_thin: int = 1, progress=None, **kwargs):
    r""" Sample from the posterior.

    Starting from a copy of :attr:`initial_hmm`, runs ``n_burn_in`` discarded sampler updates followed by
    ``n_samples`` recorded samples (each after ``n_thin`` updates) and stores the resulting
    ``BayesianHMMPosterior`` on ``self._model``.

    Parameters
    ----------
    data : array_like or list of array_like
        Input time series data.
    n_burn_in : int, optional, default=0
        The number of samples to discard to burn-in, following which :attr:`n_samples` samples will be generated.
    n_thin : int, optional, default=1
        The number of Gibbs sampling updates used to generate each returned sample.
    progress : iterable, optional, default=None
        Optional progressbar. Tested for tqdm.
    **kwargs
        Ignored kwargs for scikit-learn compatibility.

    Returns
    -------
    self : BayesianHMM
        Reference to self.

    Raises
    ------
    NotImplementedError
        If ``reversible`` is set and the initial transition matrix plus the transition matrix prior is
        not connected.
    ValueError
        If the stride differs from the one of the initial HMM and the lagged/strided trajectories contain
        symbols that are not among the initial HMM's observation symbols.
    """
    # NOTE(review): handle_progress_bar presumably wraps `progress` into a callable usable as
    # progress(iterable, desc=..., leave=...) - that is how it is used below; confirm.
    progress = handle_progress_bar(progress)
    dtrajs = ensure_dtraj_list(data)

    # fetch priors
    tmat = self.initial_hmm.transition_model.transition_matrix
    transition_matrix_prior = self._transition_matrix_prior_np
    initial_distribution_prior = self._initial_distribution_prior_np

    model = BayesianHMMPosterior()
    # update HMM Model
    model.prior = self.initial_hmm.copy()

    # local alias; `prior` and `model.prior` refer to the same object
    prior = model.prior

    # check if we are strongly connected in the reversible case (plus prior)
    if self.reversible and not is_connected(tmat + transition_matrix_prior, directed=True):
        raise NotImplementedError(
            'Trying to sample disconnected HMM with option reversible:\n '
            f'{tmat}\n Use prior to connect, select connected subset, '
            f'or use reversible=False.')

    # EVALUATE STRIDE
    dtrajs_lagged_strided = compute_dtrajs_effective(
        dtrajs, lagtime=prior.lagtime, n_states=prior.n_hidden_states, stride=self.stride)
    # if stride is different to init_hmm, check if microstates in lagged-strided trajs are compatible
    if self.stride != self.initial_hmm.stride:
        symbols = np.unique(np.concatenate(dtrajs_lagged_strided))
        # every symbol seen in the strided data must also be an observation symbol of the initial HMM
        if not len(np.intersect1d(self.initial_hmm.observation_symbols, symbols)) == len(symbols):
            raise ValueError(
                'Choice of stride has excluded a different set of microstates than in '
                'init_hmm. Set of observed microstates in time-lagged strided trajectories '
                'must match to the one used for init_hmm estimation.')

    # here we blow up the output matrix (if needed) to the FULL state space because we want to use dtrajs in the
    # Bayesian HMM sampler. This is just an initialization.
    n_states_full = number_of_states(dtrajs_lagged_strided)

    if prior.n_observation_states < n_states_full:
        eps = 0.01 / n_states_full  # default output probability, in order to avoid zero columns
        # full state space output matrix. make sure there are no zero columns
        full_obs_probabilities = eps * np.ones((prior.n_hidden_states, n_states_full), dtype=np.float64)
        # fill active states
        full_obs_probabilities[:, prior.observation_symbols] = np.maximum(eps, prior.output_probabilities)
        # renormalize B to make it row-stochastic
        full_obs_probabilities /= full_obs_probabilities.sum(axis=1)[:, None]
    else:
        full_obs_probabilities = prior.output_probabilities

    # length of the longest lagged/strided trajectory, used to size the work buffer
    maxT = max(len(o) for o in dtrajs_lagged_strided)

    # pre-construct hidden variables
    temp_alpha = np.zeros((maxT, prior.n_hidden_states))

    # whether the prior already covers its full observation symbol set; if not, samples are
    # restricted to the prior's observation symbols after sampling
    has_all_obs_symbols = model.prior.n_observation_states == len(model.prior.observation_symbols_full)

    try:
        # sample model is basically copy of prior
        sample_model = BayesianHMM._SampleStorage(
            transition_matrix=prior.transition_model.transition_matrix.copy(),
            output_model=DiscreteOutputModel(full_obs_probabilities.copy()),
            initial_distribution=prior.initial_distribution.copy(),
            stationary_distribution=prior.transition_model.stationary_distribution.copy(),
            counts=prior.count_model.count_matrix.copy(),
            hidden_trajs=[])

        # Run burn-in.
        for _ in range(n_burn_in):
            self._update(sample_model, dtrajs_lagged_strided, temp_alpha, transition_matrix_prior,
                         initial_distribution_prior)

        # Collect data.
        models = []
        for _ in progress(range(self.n_samples), desc="Drawing samples", leave=False):
            # Run a number of Gibbs sampling updates to generate each sample.
            for _ in range(n_thin):
                self._update(sample_model, dtrajs_lagged_strided, temp_alpha, transition_matrix_prior,
                             initial_distribution_prior)
                sample_model.output_model.normalize()
            self._append_sample(models, prior, sample_model)

        if not has_all_obs_symbols:
            models = [m.submodel(states=None, obs=model.prior.observation_symbols) for m in models]

        model.samples = models
    finally:
        # drop the work buffer regardless of whether sampling succeeded
        del temp_alpha

    # set new model
    self._model = model

    return self
def _estimate(self, dtrajs):
    """Estimate a maximum-likelihood HMM from discrete trajectories and store the result on ``self``.

    The method validates the lag time, resolves ``stride == 'effective'`` to an integer stride,
    builds an initial HMM according to ``self.msm_init``, runs ``MaximumLikelihoodHMM`` and copies the
    fitted quantities onto this estimator.

    Parameters
    ----------
    dtrajs : discrete trajectory or list of discrete trajectories
        Input that is normalized with ``_types.ensure_dtraj_list``.

    Returns
    -------
    The result of ``self.submodel(..., inplace=True)`` for the configured connectivity / observation subset.

    Raises
    ------
    ValueError
        If ``self.lag`` is not smaller than the longest trajectory length, or if ``self.msm_init`` is not a
        recognized option.
    """
    # ensure right format
    dtrajs = _types.ensure_dtraj_list(dtrajs)

    # CHECK LAG
    trajlengths = [_np.size(dtraj) for dtraj in dtrajs]
    if self.lag >= _np.max(trajlengths):
        raise ValueError('Illegal lag time ' + str(self.lag) + ' exceeds longest trajectory length')
    if self.lag > _np.mean(trajlengths):
        self.logger.warning('Lag time ' + str(self.lag) + ' is on the order of mean trajectory length '
                            + str(_np.mean(trajlengths)) + '. It is recommended to fit four lag times in each '
                            + 'trajectory. HMM might be inaccurate.')

    # EVALUATE STRIDE
    # NOTE: this overwrites self.stride, so 'effective' is replaced by the resolved integer on the estimator
    if self.stride == 'effective':
        # by default use lag as stride (=lag sampling), because we currently have no better theory for deciding
        # how many uncorrelated counts we can make
        self.stride = self.lag
        # get a quick estimate from the spectral radius of the non-reversible
        from pyemma.msm import estimate_markov_model
        msm_nr = estimate_markov_model(dtrajs, lag=self.lag, reversible=False, sparse=False,
                                       connectivity='largest', dt_traj=self.timestep_traj)
        # if we have more than nstates timescales in our MSM, we use the next (neglected) timescale as an
        # estimate of the decorrelation time
        if msm_nr.nstates > self.nstates:
            # because we use non-reversible msm, we want to silence the ImaginaryEigenvalueWarning
            import warnings
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore', category=ImaginaryEigenValueWarning,
                                        module='deeptime.markov.tools.analysis.dense.decomposition')
                corrtime = max(1, msm_nr.timescales()[self.nstates - 1])
            # use the smaller of these two pessimistic estimates
            self.stride = int(min(self.lag, 2 * corrtime))

    # LAG AND STRIDE DATA
    from deeptime.markov import compute_dtrajs_effective
    dtrajs_lagged_strided = compute_dtrajs_effective(dtrajs, self.lag, n_states=-1, stride=self.stride)

    # OBSERVATION SET
    if self.observe_nonempty:
        observe_subset = 'nonempty'
    else:
        observe_subset = None

    # INIT HMM
    from deeptime.markov.hmm import init
    from pyemma.msm.estimators import MaximumLikelihoodMSM
    from pyemma.msm.estimators import OOMReweightedMSM
    if self.msm_init == 'largest-strong':
        hmm_init = init.discrete.metastable_from_data(dtrajs, n_hidden_states=self.nstates, lagtime=self.lag,
                                                      stride=self.stride, mode='largest-regularized',
                                                      reversible=self.reversible, stationary=True,
                                                      separate_symbols=self.separate)
    elif self.msm_init == 'all':
        hmm_init = init.discrete.metastable_from_data(dtrajs, n_hidden_states=self.nstates, lagtime=self.lag,
                                                      stride=self.stride, reversible=self.reversible,
                                                      stationary=True, separate_symbols=self.separate,
                                                      mode='all-regularized')
    elif isinstance(self.msm_init, (MaximumLikelihoodMSM, OOMReweightedMSM)):  # initial MSM given.
        msm = MarkovStateModel(transition_matrix=self.msm_init.P,
                               count_model=TransitionCountModel(self.msm_init.count_matrix_active))
        hmm_init = init.discrete.metastable_from_msm(msm, n_hidden_states=self.nstates,
                                                     reversible=self.reversible, stationary=True,
                                                     separate_symbols=self.separate)
        observe_subset = self.msm_init.active_set  # override observe_subset.
    else:
        raise ValueError('Unknown MSM initialization option: ' + str(self.msm_init))

    # ---------------------------------------------------------------------------------------
    # Estimate discrete HMM
    # ---------------------------------------------------------------------------------------

    # run EM
    from deeptime.markov.hmm import MaximumLikelihoodHMM
    hmm_est = MaximumLikelihoodHMM(hmm_init, lagtime=self.lag, stride=self.stride, reversible=self.reversible,
                                   stationary=self.stationary, accuracy=self.accuracy, maxit=self.maxit)
    # run
    hmm_est.fit(dtrajs)
    # package in discrete HMM
    self.hmm = hmm_est.fetch_model()

    # get model parameters
    self.initial_distribution = self.hmm.initial_distribution
    transition_matrix = self.hmm.transition_model.transition_matrix
    observation_probabilities = self.hmm.output_probabilities

    # get estimation parameters
    self.likelihoods = self.hmm.likelihoods  # Likelihood history
    self.likelihood = self.likelihoods[-1]  # last entry of the likelihood history
    self.hidden_state_probabilities = self.hmm.state_probabilities  # gamma variables
    self.hidden_state_trajectories = self.hmm.hidden_state_trajectories  # Viterbi path
    self.count_matrix = self.hmm.count_model.count_matrix  # hidden count matrix
    self.initial_count = self.hmm.initial_count  # hidden init count
    self._active_set = _np.arange(self.nstates)

    # TODO: it can happen that we lose states due to striding. Should we lift the output probabilities afterwards?
    # parametrize self
    self._dtrajs_full = dtrajs
    self._dtrajs_lagged = dtrajs_lagged_strided
    self._nstates_obs_full = number_of_states(dtrajs)
    self._nstates_obs = number_of_states(dtrajs_lagged_strided)
    self._observable_set = _np.arange(self._nstates_obs)
    self._dtrajs_obs = dtrajs
    self.set_model_params(P=transition_matrix, pobs=observation_probabilities,
                          reversible=self.reversible, dt_model=self.timestep_traj.get_scaled(self.lag))

    # TODO: perhaps remove connectivity and just rely on .submodel()?
    # deal with connectivity
    states_subset = None
    if self.connectivity == 'largest':
        states_subset = 'largest-strong'
    elif self.connectivity == 'populous':
        states_subset = 'populous-strong'

    # return submodel (will return self if all None)
    return self.submodel(states=states_subset, obs=observe_subset,
                         mincount_connectivity=self.mincount_connectivity,
                         inplace=True)
def _estimate(self, dtrajs):
    """Sample a Bayesian HMM posterior for the given discrete trajectories and store it on ``self``.

    If ``self.init_hmsm`` is ``None`` an initial model is first estimated with the maximum-likelihood
    superclass (using temporarily relaxed estimator settings, which are restored afterwards, also on
    failure). Otherwise the attributes of ``self.init_hmsm`` are copied after checking that it is
    compatible with this estimator and was built from the same data. Afterwards ``BayesianHMM`` is run
    and the sampled models are stored via ``update_model_params(samples=...)``.

    Parameters
    ----------
    dtrajs : discrete trajectory or list of discrete trajectories
        Input that is normalized with ``ensure_dtraj_list``.

    Returns
    -------
    The result of ``self.submodel(..., inplace=True)`` for the configured connectivity / observation subset.

    Raises
    ------
    UserWarning
        If ``init_hmsm`` has a different lag or number of states, or if the chosen stride leads to a
        different set of observed microstates than the one in ``init_hmsm``.
    NotImplementedError
        If ``init_hmsm`` was given but ``dtrajs`` is not the same data, or if ``reversible`` is set and the
        count matrix is not connected.
    """
    # ensure right format
    dtrajs = ensure_dtraj_list(dtrajs)

    if self.init_hmsm is None:  # estimate using maximum-likelihood superclass
        # memorize the observation state for bhmm and reset
        # TODO: more elegant solution is to set Estimator params only temporarily in estimate(X, **kwargs)
        default_connectivity = self.connectivity
        default_mincount_connectivity = self.mincount_connectivity
        default_observe_nonempty = self.observe_nonempty
        default_accuracy = self.accuracy
        self.connectivity = None
        self.observe_nonempty = False
        self.mincount_connectivity = 0
        self.accuracy = 1e-2  # this is sufficient for an initial guess
        try:
            super(BayesianHMSM, self)._estimate(dtrajs)
        finally:
            # restore every temporarily overridden setting, even if the initial estimate failed;
            # accuracy is restored as well so the user's choice is not silently replaced
            self.connectivity = default_connectivity
            self.mincount_connectivity = default_mincount_connectivity
            self.observe_nonempty = default_observe_nonempty
            self.accuracy = default_accuracy
    else:
        # if given another initialization, must copy its attributes
        copy_attributes = ['_nstates', '_reversible', '_pi', '_observable_set', 'likelihoods', 'likelihood',
                           'hidden_state_probabilities', 'hidden_state_trajectories', 'count_matrix',
                           'initial_count', 'initial_distribution', '_active_set']
        check_user_choices = ['lag', '_nstates']

        # check if nstates and lag are compatible
        for attr in check_user_choices:
            if not getattr(self, attr) == getattr(self.init_hmsm, attr):
                raise UserWarning('BayesianHMSM cannot be initialized with init_hmsm with '
                                  'incompatible lag or nstates.')

        if (len(dtrajs) != len(self.init_hmsm.dtrajs_full) or
                not all((_np.array_equal(d1, d2) for d1, d2 in zip(dtrajs, self.init_hmsm.dtrajs_full)))):
            raise NotImplementedError('Bayesian HMM estimation with init_hmsm is currently only implemented ' +
                                      'if applied to the same data.')

        # TODO: implement more elegant solution to copy-pasting effective stride evaluation from ML HMM.
        # EVALUATE STRIDE
        if self.stride == 'effective':
            # by default use lag as stride (=lag sampling), because we currently have no better theory for deciding
            # how many uncorrelated counts we can make
            self.stride = self.lag
            # get a quick estimate from the spectral radius of the nonreversible
            from pyemma.msm import estimate_markov_model
            msm_nr = estimate_markov_model(dtrajs, lag=self.lag, reversible=False, sparse=False,
                                           connectivity='largest', dt_traj=self.timestep_traj)
            # if we have more than nstates timescales in our MSM, we use the next (neglected) timescale as an
            # estimate of the decorrelation time
            if msm_nr.nstates > self.nstates:
                corrtime = max(1, msm_nr.timescales()[self.nstates - 1])
                # use the smaller of these two pessimistic estimates
                self.stride = int(min(self.lag, 2 * corrtime))

        # if stride is different to init_hmsm, check if microstates in lagged-strided trajs are compatible
        if self.stride != self.init_hmsm.stride:
            from deeptime.markov import compute_dtrajs_effective
            dtrajs_lagged_strided = compute_dtrajs_effective(dtrajs, lagtime=self.lag, n_states=-1,
                                                             stride=self.stride)
            _nstates_obs = number_of_states(dtrajs_lagged_strided, only_used=True)
            _nstates_obs_full = number_of_states(dtrajs)

            # symmetric difference must be empty: both sets of observed symbols have to coincide
            if _np.setxor1d(_np.concatenate(dtrajs_lagged_strided),
                            _np.concatenate(self.init_hmsm._dtrajs_lagged)).size != 0:
                raise UserWarning('Choice of stride has excluded a different set of microstates than in '
                                  'init_hmsm. Set of observed microstates in time-lagged strided trajectories '
                                  'must match to the one used for init_hmsm estimation.')

            self._dtrajs_full = dtrajs
            self._dtrajs_lagged = dtrajs_lagged_strided
            self._nstates_obs_full = _nstates_obs_full
            self._nstates_obs = _nstates_obs
            self._observable_set = _np.arange(self._nstates_obs)
            self._dtrajs_obs = dtrajs
        else:
            # same stride: the trajectory bookkeeping of init_hmsm can be reused as-is
            copy_attributes += ['_dtrajs_full', '_dtrajs_lagged', '_nstates_obs_full',
                                '_nstates_obs', '_observable_set', '_dtrajs_obs']

        # update self with estimates from init_hmsm
        self.__dict__.update({k: i for k, i in self.init_hmsm.__dict__.items() if k in copy_attributes})

        # as mentioned in the docstring, take init_hmsm observed set observation probabilities
        self.observe_nonempty = False

        # update HMM Model
        self.update_model_params(P=self.init_hmsm.transition_matrix,
                                 pobs=self.init_hmsm.observation_probabilities,
                                 dt_model=TimeUnit(self.dt_traj).get_scaled(self.lag))

    # check if we have a valid initial model
    if self.reversible and not is_connected(self.count_matrix):
        raise NotImplementedError('Encountered disconnected count matrix:\n{count_matrix} '
                                  'with reversible Bayesian HMM sampler using lag={lag}'
                                  ' and stride={stride}. Consider using shorter lag, '
                                  'or shorter stride (to use more of the data), '
                                  'or using a lower value for mincount_connectivity.'
                                  .format(count_matrix=self.count_matrix, lag=self.lag, stride=self.stride))

    # here we blow up the output matrix (if needed) to the FULL state space because we want to use dtrajs in the
    # Bayesian HMM sampler. This is just an initialization.
    # NOTE(review): B_init is not referenced again further down in this method - confirm whether it
    # is still needed or is a leftover from an earlier sampler interface.
    nstates_full = number_of_states(dtrajs)
    if self.nstates_obs < nstates_full:
        eps = 0.01 / nstates_full  # default output probability, in order to avoid zero columns
        # full state space output matrix. make sure there are no zero columns
        B_init = eps * _np.ones((self.nstates, nstates_full), dtype=_np.float64)
        # fill active states
        B_init[:, self.observable_set] = _np.maximum(eps, self.observation_probabilities)
        # renormalize B to make it row-stochastic
        B_init /= B_init.sum(axis=1)[:, None]
    else:
        B_init = self.observation_probabilities

    # HMM sampler
    if self.show_progress:
        self._progress_register(self.nsamples, description='Sampling HMSMs', stage=0)

        from deeptime.util.callbacks import ProgressCallback
        # the callback's own `self` shadows the estimator, so keep an explicit reference
        outer_self = self

        class BHMMCallback(ProgressCallback):
            def __call__(self, inc=1, *args, **kw):
                super().__call__(inc, *args, **kw)
                outer_self._progress_update(1, stage=0)

        progress = BHMMCallback
    else:
        progress = None

    from deeptime.markov.hmm import BayesianHMM

    if self.init_hmsm is not None:
        hmm_mle = self.init_hmsm.hmm
        estimator = BayesianHMM(hmm_mle, n_samples=self.nsamples, stride=self.stride,
                                initial_distribution_prior=self.p0_prior,
                                transition_matrix_prior=self.transition_matrix_prior,
                                store_hidden=self.store_hidden, reversible=self.reversible,
                                stationary=self.stationary)
    else:
        estimator = BayesianHMM.default(dtrajs, n_hidden_states=self.nstates, lagtime=self.lag,
                                        n_samples=self.nsamples, stride=self.stride,
                                        initial_distribution_prior=self.p0_prior,
                                        transition_matrix_prior=self.transition_matrix_prior,
                                        store_hidden=self.store_hidden, reversible=self.reversible,
                                        stationary=self.stationary,
                                        prior_submodel=True, separate=self.separate)

    estimator.fit(dtrajs, n_burn_in=0, n_thin=1, progress=progress)
    model = estimator.fetch_model()
    if self.show_progress:
        self._progress_force_finish(stage=0)

    # Samples
    sample_inp = [(m.transition_model.transition_matrix, m.transition_model.stationary_distribution,
                   m.output_probabilities)
                  for m in model.samples]

    samples = []
    for P, pi, pobs in sample_inp:  # restrict to observable set if necessary
        Bobs = pobs[:, self.observable_set]
        pobs = Bobs / Bobs.sum(axis=1)[:, None]  # renormalize
        samples.append(_HMSM(P, pobs, pi=pi, dt_model=self.dt_model))

    # store results
    self.sampled_trajs = [model.samples[i].hidden_state_trajectories for i in range(self.nsamples)]
    self.update_model_params(samples=samples)

    # deal with connectivity
    states_subset = None
    if self.connectivity == 'largest':
        states_subset = 'largest-strong'
    elif self.connectivity == 'populous':
        states_subset = 'populous-strong'
    # OBSERVATION SET
    if self.observe_nonempty:
        observe_subset = 'nonempty'
    else:
        observe_subset = None

    # return submodel (will return self if all None)
    return self.submodel(states=states_subset, obs=observe_subset,
                         mincount_connectivity=self.mincount_connectivity,
                         inplace=True)
def count_matrix_coo2_mult(dtrajs, lag, sliding=True, sparse=True, nstates=None):
    r"""Generate a count matrix from a given list of discrete trajectories.

    The generated count matrix is a sparse matrix in compressed sparse row
    (CSR) or numpy ndarray format.

    Parameters
    ----------
    dtrajs : list of ndarrays
        discrete trajectories (anything accepted by ``numpy.asarray``)
    lag : int
        Lagtime in trajectory steps; must be non-negative, and positive when
        ``sliding`` is False
    sliding : bool, optional
        If true the sliding window approach is used for transition counting
    sparse : bool (optional)
        Whether to return a dense or a sparse matrix
    nstates : int, optional
        Enforce a count-matrix with shape=(nstates, nstates). If there are more states in the data, this will lead to
        an exception.

    Returns
    -------
    C : scipy.sparse.csr_matrix or numpy.ndarray
        The countmatrix at given lag in scipy compressed sparse row
        or numpy ndarray format.

    Raises
    ------
    ValueError
        If ``lag`` is negative, if ``lag`` is zero with ``sliding=False``, or
        if no trajectory is longer than ``lag``.
    """
    if lag < 0:
        # negative lags would silently pair unrelated frames through negative slicing
        raise ValueError('lag must be non-negative, but was ' + str(lag))
    if lag == 0 and not sliding:
        # a zero step is not a valid slice step
        raise ValueError('lag must be positive when sliding=False, but was ' + str(lag))

    # Determine number of states
    if nstates is None:
        from deeptime.markov import number_of_states
        nstates = number_of_states(dtrajs)

    rows = []
    cols = []
    # collect transition index pairs
    for dtraj in dtrajs:
        dtraj = np.asarray(dtraj)
        if dtraj.size > lag:
            # explicit stop index: dtraj[0:-lag] would be empty for lag == 0
            stop = dtraj.size - lag
            if sliding:
                rows.append(dtraj[:stop])
                cols.append(dtraj[lag:])
            else:
                rows.append(dtraj[:stop:lag])
                cols.append(dtraj[lag::lag])

    # is there anything?
    if not rows:
        raise ValueError('No counts found - lag ' + str(lag) + ' may exceed all trajectory lengths.')

    # feed into one COO matrix; duplicate (row, col) pairs are summed on conversion
    row = np.concatenate(rows)
    col = np.concatenate(cols)
    data = np.ones(row.size)
    C = scipy.sparse.coo_matrix((data, (row, col)), shape=(nstates, nstates))

    # export to output format
    if sparse:
        return C.tocsr()
    return C.toarray()