예제 #1
0
 def test_evaluate_msm(self):
     """Scan three lag times with ``MaximumLikelihoodMSM`` and evaluate ``timescales``.

     Only checks that the parameter scan hands back a list.
     """
     from pyemma.msm.estimators import MaximumLikelihoodMSM

     # mini-trajectory
     discrete_traj = [0, 0, 1, 2, 1, 0, 1, 0, 1, 2,
                      2, 0, 0, 0, 1, 1, 2, 1, 0, 0,
                      1, 2, 1, 0, 0, 0, 1, 1, 0, 1,
                      2]
     lag_grid = param_grid({'lag': [1, 2, 3]})
     scan_result = estimate_param_scan(
         MaximumLikelihoodMSM,
         discrete_traj,
         lag_grid,
         evaluate='timescales',
         n_jobs=1,
     )
     self.assertIsInstance(scan_result, list)
예제 #2
0
    def _estimate(self, dtrajs):
        """Run the wrapped estimator for every lag time that has not been estimated yet.

        The input is normalised with ``_types.ensure_dtraj_list`` and hashed; if a
        previous estimation exists but the hash differs, the ``_estimated`` flag is
        cleared so that all lag times are estimated again. Lag times that are not
        smaller than the longest trajectory are dropped with a warning.

        Parameters
        ----------
        dtrajs
            Trajectory data accepted by ``_types.ensure_dtraj_list``.

        Returns
        -------
        self
        """
        ### PREPARE AND CHECK DATA
        # TODO: Currently only discrete trajectories are implemented. For a general class this needs to be changed.
        dtrajs = _types.ensure_dtraj_list(dtrajs)

        # Track a hash of the input so a repeated call can tell whether the data changed.
        if not self._estimated:
            self._last_dtrajs_input_hash = _hash_dtrajs(dtrajs)
        else:
            assert hasattr(self, '_last_dtrajs_input_hash')
            new_hash = _hash_dtrajs(dtrajs)
            if self._last_dtrajs_input_hash != new_hash:
                # New data: clear the flag so every lag time is estimated again.
                self.logger.warning("estimating from new data, discard all previously computed models.")
                self._estimated = False
                self._last_dtrajs_input_hash = new_hash

        # trajectory lengths
        self._trajlengths = np.fromiter((len(traj) for traj in dtrajs), dtype=int, count=len(dtrajs))
        longest = np.max(self._trajlengths)

        # derive lag times from the data when none were given
        if self._lags is None:
            total_frames = np.sum(self._trajlengths)
            n_trajs = float(len(self._trajlengths))
            self._lags = _generate_lags(0.5 * total_frames / n_trajs, 1.5)

        # drop lag times that do not fit into the longest trajectory
        if np.max(self._lags) >= longest:
            usable_idx = np.where(self._lags < longest)[0]
            too_long_idx = np.where(self._lags >= longest)[0]
            self.logger.warning('Ignoring lag times that exceed the longest trajectory: %s', self._lags[too_long_idx])
            self._lags = self._lags[usable_idx]

        ### RUN ESTIMATION
        if not self._estimated:
            pending_lags = self._lags
        else:
            # an estimation ran before: only the lag times missing from _last_lags are needed
            # TODO: this will re-evaluate problematic lag times, wont it?
            pending_lags = sorted(set(self._lags).difference(self._last_lags))
            if not pending_lags:
                self.logger.info("All lag times already estimated.")
                return self
            self.logger.info("Running estimating for not yet estimated lags times: %s", pending_lags)

        # one parameter set per lag time
        scan_grid = tuple(param_grid({'lag': pending_lags}))

        # estimate at all pending lag times
        reporter = ProgressReporter()
        with reporter.context():
            models, estimators = estimate_param_scan(
                self.estimator, dtrajs, scan_grid,
                failfast=False,
                return_estimators=True,
                n_jobs=self.n_jobs,
                progress_reporter=reporter,
                return_exceptions=True,
            )
        self._estimators = estimators

        self._postprocess_results(models)
        return self
예제 #3
0
 def test_evaluate_bmsm_single_arg(self):
     """Scan three lag times with ``BayesianMSM`` and evaluate ``sample_f('timescales')``.

     Expects one result per lag time, the first of which has ``n_samples`` entries.
     """
     from pyemma.msm.estimators import BayesianMSM

     # mini-trajectory
     discrete_traj = [0, 0, 1, 2, 1, 0, 1, 0, 1, 2,
                      2, 0, 0, 0, 1, 1, 2, 1, 0, 0,
                      1, 2, 1, 0, 0, 0, 1, 1, 0, 1,
                      2]
     n_samples = 52
     grid_spec = {
         'lag': [1, 2, 3],
         'show_progress': (False, ),
         'nsamples': (n_samples, ),
     }
     scan_grid = param_grid(grid_spec)
     scan_result = estimate_param_scan(
         BayesianMSM,
         discrete_traj,
         scan_grid,
         evaluate='sample_f',
         evaluate_args='timescales',
         n_jobs=1,
     )
     self.assertIsInstance(scan_result, list)
     # three lag times
     self.assertEqual(len(scan_result), 3)
     self.assertEqual(len(scan_result[0]), n_samples)
예제 #4
0
    def test_evaluate_msm_multi_arg(self):
        """Scan three lag times and evaluate ``generate_traj`` with several positional arguments.

        Expects one result per lag time, each of length ``traj_len``.
        """
        from pyemma.msm.estimators import MaximumLikelihoodMSM

        # mini-trajectory
        discrete_traj = [0, 0, 1, 2, 1, 0, 1, 0, 1, 2,
                         2, 0, 0, 0, 1, 1, 2, 1, 0, 0,
                         1, 2, 1, 0, 0, 0, 1, 1, 0, 1,
                         2]
        traj_len = 10
        lag_grid = param_grid({'lag': [1, 2, 3]})

        # Reference signature of the evaluated method:
        #     generate_traj(self, N, start=None, stop=None, stride=1)
        generate_traj_args = (traj_len, 2, None, 2)
        scan_result = estimate_param_scan(
            MaximumLikelihoodMSM,
            discrete_traj,
            lag_grid,
            evaluate='generate_traj',
            evaluate_args=(generate_traj_args, ),
            n_jobs=1,
        )
        self.assertIsInstance(scan_result, list)
        # three lag times
        self.assertEqual(len(scan_result), 3)
        self.assertTrue(all(len(generated) == traj_len for generated in scan_result))
예제 #5
0
    def test_3gaussian_1d_singletraj(self):
        """Cluster 1D data drawn from three Gaussians for every init strategy / seed combination.

        For each parameter combination the test checks that

        * all cluster centers are finite and every one of the three Gaussians
          (shifted by -2, 0 and +2) receives at least one center,
        * two independent runs with the same ``fixed_seed`` start from identical
          initial centers and, once converged, end at the same centers.
        """
        # generate 1D data from three gaussians
        from pyemma.util.contexts import numpy_random_seed
        with numpy_random_seed(42):
            X = [np.random.randn(200)-2.0,
                 np.random.randn(200),
                 np.random.randn(200)+2.0]
        X = np.hstack(X)
        k = 50
        from pyemma._base.estimator import param_grid
        grid = param_grid({'init_strategy': ['uniform', 'kmeans++'], 'fixed_seed': [True, 463498]})
        for param in grid:
            init_strategy = param['init_strategy']
            fixed_seed = param['fixed_seed']
            kmeans = cluster_kmeans(X, k=k, init_strategy=init_strategy, fixed_seed=fixed_seed, n_jobs=1)
            cc = kmeans.clustercenters
            self.assertTrue(np.all(np.isfinite(cc)), "cluster centers borked for strat %s" % init_strategy)

            # One check per Gaussian: left (around -2), middle (around 0), right (around +2).
            # The outer thresholds used to be `cc < 1.0` / `cc > -1.0`, which the middle
            # check already implies, so the outer Gaussians were never actually tested.
            fail_msg = "failed for init_strategy=%s" % init_strategy
            self.assertTrue(np.any(cc < -1.0), fail_msg)
            self.assertTrue(np.any((cc > -1.0) * (cc < 1.0)), fail_msg)
            self.assertTrue(np.any(cc > 1.0), fail_msg)

            km1 = cluster_kmeans(X, k=k, init_strategy=init_strategy, fixed_seed=fixed_seed, n_jobs=1)
            km2 = cluster_kmeans(X, k=k, init_strategy=init_strategy, fixed_seed=fixed_seed, n_jobs=1)
            self.assertEqual(len(km1.clustercenters), k)
            self.assertEqual(len(km2.clustercenters), k)
            self.assertEqual(km1.fixed_seed, km2.fixed_seed)

            # check initial centers (after kmeans++, uniform init) are equal.
            np.testing.assert_equal(km1.initial_centers_, km2.initial_centers_)

            # keep iterating from the current centers until both runs report convergence
            while not km1.converged:
                km1.estimate(X=X, clustercenters=km1.clustercenters, keep_data=True)
            while not km2.converged:
                km2.estimate(X=X, clustercenters=km2.clustercenters, keep_data=True)

            # the optimisation must have moved the centers away from their initial positions
            self.assertGreater(np.linalg.norm(km1.clustercenters - km1.initial_centers_), 0)
            np.testing.assert_allclose(km1.clustercenters, km2.clustercenters,
                                       err_msg="should yield same centers with fixed seed=%s for strategy %s, Initial centers=%s"
                                               % (fixed_seed, init_strategy, km2.initial_centers_), atol=1e-6)
예제 #6
0
    def _estimate(self, data):
        """Estimate models at multiples of the test estimator's lag and collect observables.

        For every entry of ``self.mlags`` two quantities are stored: the observable
        computed from ``self.test_model`` at that multiple (``self._pred``) and the
        observable computed from a model estimated at ``mlag * test_estimator.lag``
        (``self._est``). Confidence bounds go into the ``_L`` / ``_R`` arrays when
        ``self.has_errors`` (and, for the estimates, ``self.err_est``) is set;
        otherwise those attributes are ``None``.

        Parameters
        ----------
        data
            Input handed to ``estimate_param_scan`` unchanged.

        Returns
        -------
        self
        """
        # lag times
        self._lags = np.array(self.mlags) * self.test_estimator.lag
        scan_grid = list(param_grid({'lag': self._lags}))
        # a leading zero lag is not estimated; it is treated separately below
        has_zero_lag = self.mlags[0] == 0
        if has_zero_lag:
            scan_grid = scan_grid[1:]

        self._pred, self._pred_L, self._pred_R = [], [], []
        self._est, self._est_L, self._est_R = [], [], []

        # clone estimators and run estimates
        progress_reporter = None
        if self.show_progress:
            if isinstance(self.test_estimator, SampledModel):
                self.test_estimator.show_progress = False
            progress_reporter = self

        scanned_models, scanned_estimators = estimate_param_scan(
            self.test_estimator, data, scan_grid,
            return_estimators=True,
            failfast=False,
            progress_reporter=progress_reporter,
            n_jobs=self.n_jobs,
        )
        if has_zero_lag:
            # placeholder entries keep the results index-aligned with self.mlags
            scanned_models = [None] + scanned_models
            scanned_estimators = [None] + scanned_estimators

        for idx, mlag in enumerate(self.mlags):
            # prediction from the test model at this multiple of its lag
            predicted = self._compute_observables(self.test_model, self.test_estimator, mlag)
            self._pred.append(predicted)
            # prediction errors, if available
            if self.has_errors:
                left, right = self._compute_observables_conf(self.test_model, self.test_estimator, mlag)
                self._pred_L.append(left)
                self._pred_R.append(right)

            # observable from the model estimated at this lag time
            lag_model = scanned_models[idx]
            lag_estimator = scanned_estimators[idx]
            self._est.append(self._compute_observables(lag_model, lag_estimator))
            if self.has_errors and self.err_est:
                left, right = self._compute_observables_conf(lag_model, lag_estimator)
                self._est_L.append(left)
                self._est_R.append(right)

        # build arrays
        self._est = np.array(self._est)
        self._pred = np.array(self._pred)
        if self.has_errors:
            self._pred_L, self._pred_R = np.array(self._pred_L), np.array(self._pred_R)
        else:
            self._pred_L = None
            self._pred_R = None
        if self.has_errors and self.err_est:
            self._est_L, self._est_R = np.array(self._est_L), np.array(self._est_R)
        else:
            self._est_L = None
            self._est_R = None

        return self
예제 #7
0
    def _estimate(self, data):
        r"""Estimates ITS at set of lagtimes

        Runs ``self.estimator`` once per lag time through ``estimate_param_scan``,
        discards lag times whose estimation returned ``None`` and stores the
        resulting timescales in ``self._its`` (one row per remaining lag time,
        ``self.nits`` columns, NaN where no timescale is available). If the first
        model is a ``SampledModel``, the sampled timescales are stored in
        ``self._its_samples`` in the same fashion.

        Parameters
        ----------
        data
            Trajectory data accepted by ``_types.ensure_dtraj_list``.

        Raises
        ------
        RuntimeError
            If estimation failed at every lag time.
        """
        ### PREPARE AND CHECK DATA
        # TODO: Currently only discrete trajectories are implemented. For a general class this needs to be changed.
        data = _types.ensure_dtraj_list(data)

        # check trajectory lengths
        self._trajlengths = np.array([len(traj) for traj in data])
        maxlength = np.max(self._trajlengths)

        # set lag times by data if not yet set
        if self._lags is None:
            maxlag = 0.5 * np.sum(self._trajlengths) / float(len(self._trajlengths))
            self._lags = _generate_lags(maxlag, 1.5)

        # check if some lag times are forbidden.
        if np.max(self._lags) >= maxlength:
            Ifit = np.where(self._lags < maxlength)[0]
            Inofit = np.where(self._lags >= maxlength)[0]
            self.logger.warning('Ignoring lag times that exceed the longest trajectory: %s',
                                self._lags[Inofit])
            self._lags = self._lags[Ifit]

        ### RUN ESTIMATION

        # construct all parameter sets for the estimator
        param_sets = tuple(param_grid({'lag': self._lags}))

        # this object is passed as progress reporter below, so silence the estimator's own
        if isinstance(self.estimator, SampledModel):
            self.estimator.show_progress = False

        # run estimation on all lag times
        self._models, self._estimators = estimate_param_scan(
            self.estimator,
            data,
            param_sets,
            failfast=False,
            return_estimators=True,
            n_jobs=self.n_jobs,
            progress_reporter=self)

        ### PROCESS RESULTS
        # if some results are None, estimation has failed. Warn and truncate models and lag times
        good = np.array([i for i, m in enumerate(self._models) if m is not None], dtype=int)
        bad = np.array([i for i, m in enumerate(self._models) if m is None], dtype=int)
        if good.size == 0:
            raise RuntimeError('Estimation has failed at ALL lagtimes. Check for errors.')
        if bad.size > 0:
            self.logger.warning('Estimation has failed at lagtimes: %s'
                                '. Run single-lag estimation at these lags to track down the error.',
                                self._lags[bad])
            self._lags = self._lags[good]
            # Select from the list directly; routing model objects through an ndarray
            # makes NumPy try to interpret them as array data.
            self._models = [self._models[i] for i in good]
            # NOTE(review): self._estimators is not truncated here, so it is no longer
            # index-aligned with self._lags / self._models - confirm this is intended.

        # timescales
        timescales = [m.timescales() for m in self._models]

        # how many finite timescales do we really have?
        maxnts = max(len(ts[np.isfinite(ts)]) for ts in timescales)
        if self.nits is None:
            self.nits = maxnts
        if maxnts < self.nits:
            self.nits = maxnts
            self.logger.warning('Changed user setting nits to the number of available timescales nits=%s',
                                self.nits)

        # sort timescales into matrix
        computed_all = True  # cleared as soon as any problem is found
        # initialize with NaN in order to point out timescales that were not computed
        # (np.nan is the canonical spelling; NumPy 2 trimmed the legacy NaN aliases)
        self._its = np.full((len(self._lags), self.nits), np.nan, dtype=float)
        self._successful_lag_indexes = []
        for i, ts in enumerate(timescales):
            # if there are any finite timescales available, add them
            if np.any(np.isfinite(ts)):
                # copy at most nits values; leave NaN where there is no timescale
                ncopy = min(len(ts), self.nits)
                self._its[i, :ncopy] = ts[:ncopy]
                self._successful_lag_indexes.append(i)

        if len(self._successful_lag_indexes) < len(self._lags):
            computed_all = False
        if np.any(np.isnan(self._its)):
            computed_all = False

        # timescales samples if available
        if isinstance(self._models[0], SampledModel):
            # samples
            timescales_samples = [m.sample_f('timescales') for m in self._models]
            nsamples = np.shape(timescales_samples[0])[0]
            # initialize with NaN in order to point out timescales that were not computed
            self._its_samples = np.full((nsamples, len(self._lags), self.nits), np.nan, dtype=float)

            for i, ts in enumerate(timescales_samples):
                if ts is not None:
                    ts = np.vstack(ts)[:, :self.nits]
                    # copy into array. Leave NaN if there is no timescales
                    self._its_samples[:, i, :ts.shape[1]] = ts

            if np.any(np.isnan(self._its_samples)):
                computed_all = False

        if not computed_all:
            self.logger.warning('Some timescales could not be computed. Timescales array is smaller than '
                                'expected or contains NaNs')