def score(self, test_data=None, score_method='VAMP2'):
    """Compute the VAMP score for this model, or the cross-validation score
    between self and a second model estimated from different data.

    Parameters
    ----------
    test_data : any data format that `pyemma.coordinates.vamp()` accepts as input
        If `test_data` is not None, this method computes the cross-validation
        score between self and a VAMP model estimated from `test_data`. It is
        assumed that self was estimated from the "training" data and
        `test_data` is the test data. The score is computed for one
        realization of self and `test_data`. Estimation of the average
        cross-validation score and partitioning of data into test and training
        parts is not performed by this method.

        If `test_data` is None, this method computes the VAMP score for the
        model contained in self.

        The model that is estimated from `test_data` will inherit all
        hyperparameters from self.

    score_method : str, optional, default='VAMP2'
        Available scores are based on the variational approach for Markov
        processes [1]_:

        * 'VAMP1'  Sum of singular values of the half-weighted Koopman
          matrix [1]_. If the model is reversible, this is equal to the sum of
          Koopman matrix eigenvalues, also called the Rayleigh quotient [1]_.
        * 'VAMP2'  Sum of squared singular values of the half-weighted Koopman
          matrix [1]_. If the model is reversible, this is equal to the
          kinetic variance [2]_.
        * 'VAMPE'  Approximation error of the estimated Koopman operator with
          respect to the true Koopman operator, up to an additive
          constant [1]_.

    Returns
    -------
    score : float
        If `test_data` is not None, returns the cross-validation VAMP score
        between self and the model estimated from `test_data`. Otherwise,
        returns the selected VAMP score of self.

    References
    ----------
    .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov
        processes from time series data. arXiv:1707.04659v1
    .. [2] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps
        from molecular dynamics simulation. J. Chem. Theory Comput.
        doi:10.1021/acs.jctc.5b00553
    """
    from pyemma._ext.sklearn.base import clone as clone_estimator
    est = clone_estimator(self)
    # clone does not invoke our constructor, so we have to create a new model instance explicitly.
    est._model = VAMPModel()
    if test_data is None:
        return self.model.score(None, score_method=score_method)
    else:
        est.estimate(test_data)
        return self.model.score(est.model, score_method=score_method)
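# Hedged usage sketch for `score` above. It assumes pyemma is installed and
# that `pyemma.coordinates.vamp` accepts trajectory arrays as input (as the
# docstring states); `train_trajs`/`test_trajs` are hypothetical lists of
# (T, n) feature arrays supplied by the caller, and lag=10 is arbitrary.
def _vamp_cv_score_example(train_trajs, test_trajs):
    import pyemma.coordinates as coor
    v = coor.vamp(train_trajs, lag=10)
    fit_score = v.score()                     # VAMP2 score of the fitted model itself
    cv_score = v.score(test_data=test_trajs)  # one realization of the cross-validation score
    return fit_score, cv_score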
def estimate_param_scan(estimator, X, param_sets, evaluate=None, evaluate_args=None, failfast=True,
                        return_estimators=False, n_jobs=1, progress_reporter=None, show_progress=True,
                        return_exceptions=False):
    """ Runs multiple estimations using a list of parameter settings

    Parameters
    ----------
    estimator : Estimator object or class
        An estimator object that provides an estimate(X, **params) function.
        If only a class is provided here, the Estimator objects will be
        constructed with default parameter settings, and the parameter settings
        from param_sets for each estimation. If you want to specify other
        parameter settings for those parameters not specified in param_sets,
        construct an Estimator before and pass the object.

    param_sets : iterable over dictionaries
        An iterable that provides parameter settings. Each element defines a
        parameter set, for which an estimation will be run using these
        parameters in estimate(X, **params). All other parameter settings will
        be taken from the default settings in the estimator object.

    evaluate : str or list of str, optional
        The given methods or properties will be called on the estimated
        models, and their results will be returned instead of the full models.
        This may be useful for reducing memory overhead.

    evaluate_args : iterable of iterable, optional
        Arguments to be passed to the evaluated methods. Note that the size
        has to match the size of evaluate.

    failfast : bool
        If True, will raise an exception when estimation failed with an
        exception or when trying to call a method that doesn't exist. If
        False, will simply return None in these cases.

    return_estimators : bool
        If True, return a list of estimators in addition to the models.

    show_progress : bool
        If the given estimator supports the show_progress interface, we set
        the flag prior to running the estimations.

    return_exceptions : bool, default=False
        If failfast is False while this setting is True, returns the exception
        thrown at the actual grid element, instead of None.

    Returns
    -------
    models : list of model objects or evaluated function values
        A list of estimated models in the same order as param_sets. If
        evaluate is given, each element will contain the results from these
        method evaluations.

    estimators (optional) : list of estimator objects. These are returned only
        if return_estimators=True

    Examples
    --------

    Estimate a maximum likelihood Markov model at lag times 1, 2, 3.

    >>> from pyemma.msm.estimators import MaximumLikelihoodMSM, BayesianMSM
    >>>
    >>> dtraj = [0,0,1,2,1,0,1,0,1,2,2,0,0,0,1,1,2,1,0,0,1,2,1,0,0,0,1,1,0,1,2]  # mini-trajectory
    >>> param_sets = param_grid({'lag': [1,2,3]})
    >>>
    >>> estimate_param_scan(MaximumLikelihoodMSM, dtraj, param_sets, evaluate='timescales')
    [array([ 1.24113168,  0.77454377]), array([ 2.65266698,  1.42909842]), array([ 5.34810405,  1.14784446])]

    Try also getting samples of the timescales:

    >>> estimate_param_scan(MaximumLikelihoodMSM, dtraj, param_sets, failfast=False,
    ...     evaluate=['timescales', 'timescales_samples'])  # doctest: +SKIP
    [[array([ 1.24113168,  0.77454377]), None], [array([ 2.48226337,  1.54908754]), None], [array([ 3.72339505,  2.32363131]), None]]

    We get Nones because the MaximumLikelihoodMSM estimator doesn't provide
    timescales_samples. Use for example a Bayesian estimator for that.

    Now we also want to get samples of the timescales using the BayesianMSM:

    >>> estimate_param_scan(BayesianMSM, dtraj, param_sets, show_progress=False,
    ...     evaluate=['timescales', 'sample_f'], evaluate_args=((), ('timescales', )))  # doctest: +SKIP
    [[array([ 1.24357685,  0.77609028]), [array([ 1.5963252 ,  0.73877883]), array([ 1.29915847,  0.49004912]), array([ 0.90058583,  0.73841786]), ... ]]

    """
    # make sure we have an estimator object
    estimator = get_estimator(estimator)
    if hasattr(estimator, 'show_progress'):
        estimator.show_progress = show_progress

    # If we want to return estimators, make clones; otherwise just copy references.
    # For parallel processing we always need clones.
    # Also if the Estimator is its own Model, we have to clone.
    from pyemma._base.model import Model
    if (return_estimators or
            n_jobs > 1 or n_jobs is None or
            isinstance(estimator, Model)):
        estimators = [clone_estimator(estimator) for _ in param_sets]
    else:
        estimators = [estimator for _ in param_sets]

    # if we evaluate, make sure we have a list of functions to evaluate
    if _types.is_string(evaluate):
        evaluate = [evaluate]
    if _types.is_string(evaluate_args):
        evaluate_args = [evaluate_args]

    if evaluate is not None and evaluate_args is not None and len(evaluate) != len(evaluate_args):
        raise ValueError("length mismatch: evaluate ({}) and evaluate_args ({})"
                         .format(len(evaluate), len(evaluate_args)))

    show_progress = progress_reporter is not None and show_progress
    if show_progress:
        progress_reporter._progress_register(len(estimators), stage=0,
                                             description="estimating %s" % str(estimator.__class__.__name__))

    if n_jobs > 1 and os.name == 'posix':
        if hasattr(estimators[0], 'logger'):
            estimators[0].logger.debug('estimating %s with n_jobs=%s', estimator, n_jobs)
        # iterate over parameter settings
        task_iter = ((estimator, param_set, X, evaluate, evaluate_args, failfast, return_exceptions)
                     for estimator, param_set in zip(estimators, param_sets))

        from pathos.multiprocessing import Pool as Parallel
        pool = Parallel(processes=n_jobs)
        args = list(task_iter)

        if show_progress:
            from pyemma._base.model import SampledModel
            for a in args:
                if isinstance(a[0], SampledModel):
                    a[0].show_progress = False

            def callback(_):
                progress_reporter._progress_update(1, stage=0)
        else:
            callback = None

        import six
        if six.PY3:
            def error_callback(*args, **kw):
                if failfast:
                    raise Exception('something failed')

            with pool:
                res_async = [pool.apply_async(_estimate_param_scan_worker, a, callback=callback,
                                              error_callback=error_callback) for a in args]
                res = [x.get() for x in res_async]
        else:
            try:
                res_async = [pool.apply_async(_estimate_param_scan_worker, a, callback=callback)
                             for a in args]
                res = [x.get() for x in res_async]
            finally:
                pool.close()
    else:
        # if n_jobs=1, don't invoke the pool but dispatch the iterator directly
        if hasattr(estimators[0], 'logger'):
            estimators[0].logger.debug('estimating %s with n_jobs=1 because of the setting or '
                                       'because you do not have a POSIX system', estimator)
        res = []
        if show_progress:
            from pyemma._base.model import SampledModel
            if isinstance(estimator, SampledModel):
                for e in estimators:
                    e.show_progress = False

        for estimator, param_set in zip(estimators, param_sets):
            res.append(_estimate_param_scan_worker(estimator, param_set, X,
                                                   evaluate, evaluate_args, failfast, return_exceptions))
            if show_progress:
                progress_reporter._progress_update(1, stage=0)

    if show_progress:
        progress_reporter._progress_force_finish(0)

    # done
    if return_estimators:
        return res, estimators
    else:
        return res
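# `_estimate_param_scan_worker` is invoked above but not defined in this
# section. The following is a minimal sketch of the contract implied by the
# docstring (estimate with one parameter set, then optionally evaluate
# attributes/methods on the resulting model, honoring failfast and
# return_exceptions). It is an illustration, not the actual pyemma worker,
# and the name `_estimate_param_scan_worker_sketch` is hypothetical.
def _estimate_param_scan_worker_sketch(estimator, params, X, evaluate,
                                       evaluate_args, failfast, return_exceptions):
    try:
        model = estimator.estimate(X, **params)
    except Exception as e:
        if failfast:
            raise  # fail loudly on the first broken grid element
        return e if return_exceptions else None
    if evaluate is None:
        return model  # no evaluation requested: hand back the full model
    results = []
    for i, name in enumerate(evaluate):
        if not hasattr(model, name):
            if failfast:
                raise AttributeError(name)
            results.append(None)  # documented fallback for missing attributes
            continue
        value = getattr(model, name)
        if callable(value):
            # pass the matching entry of evaluate_args to bound methods
            args = evaluate_args[i] if evaluate_args is not None else ()
            value = value(*args)
        results.append(value)
    return results if len(results) > 1 else results[0]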
def estimate_param_scan(estimator, X, param_sets, evaluate=None, evaluate_args=None, failfast=True,
                        return_estimators=False, n_jobs=1, progress_reporter=None):
    """ Runs multiple estimations using a list of parameter settings

    Parameters
    ----------
    estimator : Estimator object or class
        An estimator object that provides an estimate(X, **params) function.
        If only a class is provided here, the Estimator objects will be
        constructed with default parameter settings, and the parameter settings
        from param_sets for each estimation. If you want to specify other
        parameter settings for those parameters not specified in param_sets,
        construct an Estimator before and pass the object.

    param_sets : iterable over dictionaries
        An iterable that provides parameter settings. Each element defines a
        parameter set, for which an estimation will be run using these
        parameters in estimate(X, **params). All other parameter settings will
        be taken from the default settings in the estimator object.

    evaluate : str or list of str
        The given methods or properties will be called on the estimated
        models, and their results will be returned instead of the full models.
        This may be useful for reducing memory overhead.

    failfast : bool
        If True, will raise an exception when estimation failed with an
        exception or when trying to call a method that doesn't exist. If
        False, will simply return None in these cases.

    Returns
    -------
    models : list of model objects or evaluated function values
        A list of estimated models in the same order as param_sets. If
        evaluate is given, each element will contain the results from these
        method evaluations.

    estimators (optional) : list of estimator objects. These are returned only
        if return_estimators=True

    Examples
    --------

    Estimate a maximum likelihood Markov model at lag times 1, 2, 3.

    >>> from pyemma.msm.estimators import MaximumLikelihoodMSM
    >>>
    >>> dtraj = [0,0,1,2,1,0,1,0,1,2,2,0,0,0,1,1,2,1,0,0,1,2,1,0,0,0,1,1,0,1,2]  # mini-trajectory
    >>> param_sets = param_grid({'lag': [1,2,3]})
    >>>
    >>> estimate_param_scan(MaximumLikelihoodMSM, dtraj, param_sets, evaluate='timescales')
    [array([ 1.24113167,  0.77454377]), array([ 2.65266703,  1.42909841]), array([ 5.34810395,  1.14784446])]

    Try also getting samples of the timescales:

    >>> estimate_param_scan(MaximumLikelihoodMSM, dtraj, param_sets, evaluate=['timescales', 'timescales_samples'])
    [[array([ 1.24113167,  0.77454377]), None], [array([ 2.65266703,  1.42909841]), None], [array([ 5.34810395,  1.14784446]), None]]

    We get Nones because the MaximumLikelihoodMSM estimator doesn't provide
    timescales_samples. Use for example a Bayesian estimator for that.

    """
    # make sure we have an estimator object
    estimator = get_estimator(estimator)

    # If we want to return estimators, make clones; otherwise just copy references.
    # For parallel processing we always need clones.
    if return_estimators or n_jobs > 1 or n_jobs is None:
        estimators = [clone_estimator(estimator) for _ in param_sets]
    else:
        estimators = [estimator for _ in param_sets]

    # if we evaluate, make sure we have a list of functions to evaluate
    if _types.is_string(evaluate):
        evaluate = [evaluate]

    # set callback for joblib
    if progress_reporter is not None:
        progress_reporter._progress_register(len(estimators), stage=0,
                                             description="estimating %s" % str(estimator.__class__.__name__))

        if n_jobs > 1:
            class CallBack(object):
                def __init__(self, index, parallel):
                    self.index = index
                    self.parallel = parallel
                    self.reporter = progress_reporter

                def __call__(self, index):
                    if self.reporter is not None:
                        self.reporter._progress_update(1, stage=0)
                    if self.parallel._original_iterable:
                        self.parallel.dispatch_next()

            import joblib.parallel
            joblib.parallel.CallBack = CallBack
        else:
            def _print(msg, msg_args):
                # NOTE: this is an ugly hack; if we only use one job, we do
                # not get the joblib callback interface. As a workaround we
                # use the Parallel._print function, which is called with
                # msg_args = (done_jobs, total_jobs).
                if len(msg_args) == 2:
                    progress_reporter._progress_update(1, stage=0)

    # iterate over parameter settings
    from joblib import Parallel
    import joblib
    pool = Parallel(n_jobs=n_jobs)

    if progress_reporter is not None and n_jobs == 1:
        pool._print = _print
        # NOTE: verbose has to be set, otherwise our print hack does not work.
        pool.verbose = 50

    task_iter = (joblib.delayed(_estimate_param_scan_worker)(estimators[i],
                                                             param_sets[i], X,
                                                             evaluate,
                                                             evaluate_args,
                                                             failfast,
                                                             )
                 for i in range(len(param_sets)))

    # container for model or function evaluations
    res = pool(task_iter)

    if progress_reporter is not None:
        progress_reporter._progress_force_finish(0)

    # done
    if return_estimators:
        return res, estimators
    else:
        return res
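# `param_grid` is used in the doctests above but not shown in this section.
# A minimal sketch of what it presumably does (a cartesian product over the
# per-key value lists, analogous to sklearn.model_selection.ParameterGrid);
# the real pyemma helper may differ, and `_param_grid_sketch` is a
# hypothetical name.
import itertools

def _param_grid_sketch(pattern):
    keys = sorted(pattern.keys())  # fix an ordering for reproducibility
    value_lists = [pattern[k] for k in keys]
    for combo in itertools.product(*value_lists):
        yield dict(zip(keys, combo))

# list(_param_grid_sketch({'lag': [1, 2, 3]}))
# -> [{'lag': 1}, {'lag': 2}, {'lag': 3}]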
def estimate_param_scan(estimator, X, param_sets, evaluate=None, evaluate_args=None, failfast=True,
                        return_estimators=False, n_jobs=1, progress_reporter=None, show_progress=True):
    """ Runs multiple estimations using a list of parameter settings

    Parameters
    ----------
    estimator : Estimator object or class
        An estimator object that provides an estimate(X, **params) function.
        If only a class is provided here, the Estimator objects will be
        constructed with default parameter settings, and the parameter settings
        from param_sets for each estimation. If you want to specify other
        parameter settings for those parameters not specified in param_sets,
        construct an Estimator before and pass the object.

    param_sets : iterable over dictionaries
        An iterable that provides parameter settings. Each element defines a
        parameter set, for which an estimation will be run using these
        parameters in estimate(X, **params). All other parameter settings will
        be taken from the default settings in the estimator object.

    evaluate : str or list of str, optional
        The given methods or properties will be called on the estimated
        models, and their results will be returned instead of the full models.
        This may be useful for reducing memory overhead.

    evaluate_args : iterable of iterable, optional
        Arguments to be passed to the evaluated methods. Note that the size
        has to match the size of evaluate.

    failfast : bool
        If True, will raise an exception when estimation failed with an
        exception or when trying to call a method that doesn't exist. If
        False, will simply return None in these cases.

    return_estimators : bool
        If True, return a list of estimators in addition to the models.

    show_progress : bool
        If the given estimator supports the show_progress interface, we set
        the flag prior to running the estimations.

    Returns
    -------
    models : list of model objects or evaluated function values
        A list of estimated models in the same order as param_sets. If
        evaluate is given, each element will contain the results from these
        method evaluations.

    estimators (optional) : list of estimator objects. These are returned only
        if return_estimators=True

    Examples
    --------

    Estimate a maximum likelihood Markov model at lag times 1, 2, 3.

    >>> from pyemma.msm.estimators import MaximumLikelihoodMSM, BayesianMSM
    >>>
    >>> dtraj = [0,0,1,2,1,0,1,0,1,2,2,0,0,0,1,1,2,1,0,0,1,2,1,0,0,0,1,1,0,1,2]  # mini-trajectory
    >>> param_sets = param_grid({'lag': [1,2,3]})
    >>>
    >>> estimate_param_scan(MaximumLikelihoodMSM, dtraj, param_sets, evaluate='timescales')
    [array([ 1.24113168,  0.77454377]), array([ 2.48226337,  1.54908754]), array([ 3.72339505,  2.32363131])]

    Try also getting samples of the timescales:

    >>> estimate_param_scan(MaximumLikelihoodMSM, dtraj, param_sets, failfast=False,
    ...     evaluate=['timescales', 'timescales_samples'])  # doctest: +SKIP
    [[array([ 1.24113168,  0.77454377]), None], [array([ 2.48226337,  1.54908754]), None], [array([ 3.72339505,  2.32363131]), None]]

    We get Nones because the MaximumLikelihoodMSM estimator doesn't provide
    timescales_samples. Use for example a Bayesian estimator for that.

    Now we also want to get samples of the timescales using the BayesianMSM:

    >>> estimate_param_scan(BayesianMSM, dtraj, param_sets, show_progress=False,
    ...     evaluate=['timescales', 'sample_f'], evaluate_args=((), ('timescales', )))  # doctest: +SKIP
    [[array([ 1.24357685,  0.77609028]), [array([ 1.5963252 ,  0.73877883]), array([ 1.29915847,  0.49004912]), array([ 0.90058583,  0.73841786]), ... ]]

    """
    # make sure we have an estimator object
    estimator = get_estimator(estimator)
    if hasattr(estimator, 'show_progress'):
        estimator.show_progress = show_progress

    # If we want to return estimators, make clones; otherwise just copy references.
    # For parallel processing we always need clones.
    if return_estimators or n_jobs > 1 or n_jobs is None:
        estimators = [clone_estimator(estimator) for _ in param_sets]
    else:
        estimators = [estimator for _ in param_sets]

    # if we evaluate, make sure we have a list of functions to evaluate
    if _types.is_string(evaluate):
        evaluate = [evaluate]
    if _types.is_string(evaluate_args):
        evaluate_args = [evaluate_args]

    if evaluate is not None and evaluate_args is not None and len(evaluate) != len(evaluate_args):
        raise ValueError("length mismatch: evaluate ({}) and evaluate_args ({})"
                         .format(len(evaluate), len(evaluate_args)))

    # set callback for joblib
    if progress_reporter is not None and show_progress:
        progress_reporter._progress_register(len(estimators), stage=0,
                                             description="estimating %s" % str(estimator.__class__.__name__))

        if n_jobs > 1:
            try:
                from joblib.parallel import BatchCompletionCallBack
                batch_comp_call_back = True
            except ImportError:
                from joblib.parallel import CallBack as BatchCompletionCallBack
                batch_comp_call_back = False

            class CallBack(BatchCompletionCallBack):
                def __init__(self, *args, **kw):
                    self.reporter = progress_reporter
                    super(CallBack, self).__init__(*args, **kw)

                def __call__(self, *args, **kw):
                    self.reporter._progress_update(1, stage=0)
                    super(CallBack, self).__call__(*args, **kw)

            import joblib.parallel
            if batch_comp_call_back:
                joblib.parallel.BatchCompletionCallBack = CallBack
            else:
                joblib.parallel.CallBack = CallBack
        else:
            def _print(msg, msg_args):
                # NOTE: this is an ugly hack; if we only use one job, we do
                # not get the joblib callback interface. As a workaround we
                # use the Parallel._print function, which is called with
                # msg_args = (done_jobs, total_jobs).
                if len(msg_args) == 2:
                    progress_reporter._progress_update(1, stage=0)

    # iterate over parameter settings
    from joblib import Parallel
    import joblib, mock, six

    if six.PY34:
        from multiprocessing import get_context
        try:
            ctx = get_context(method='forkserver')
        except ValueError:  # forkserver not available
            try:
                # this is slower in creation, but will not use as much memory!
                ctx = get_context(method='spawn')
            except ValueError:
                ctx = get_context(None)
                print("WARNING: using default multiprocessing start method {}. "
                      "This could potentially lead to memory issues.".format(ctx))

        with mock.patch('joblib.parallel.DEFAULT_MP_CONTEXT', ctx):
            pool = Parallel(n_jobs=n_jobs)
    else:
        pool = Parallel(n_jobs=n_jobs)

    if progress_reporter is not None and n_jobs == 1:
        pool._print = _print
        # NOTE: verbose has to be set, otherwise our print hack does not work.
        pool.verbose = 50

    if n_jobs > 1:
        task_iter = (joblib.delayed(_estimate_param_scan_worker)(estimators[i],
                                                                 param_sets[i], X,
                                                                 evaluate,
                                                                 evaluate_args,
                                                                 failfast,
                                                                 )
                     for i in range(len(param_sets)))

        # container for model or function evaluations
        res = pool(task_iter)
    else:
        # if n_jobs=1, don't invoke the pool but dispatch the iterator directly
        res = []
        for i, param in enumerate(param_sets):
            res.append(_estimate_param_scan_worker(estimators[i], param, X,
                                                   evaluate, evaluate_args, failfast))
            if progress_reporter is not None and show_progress:
                progress_reporter._progress_update(1, stage=0)

    if progress_reporter is not None and show_progress:
        progress_reporter._progress_force_finish(0)

    # done
    if return_estimators:
        return res, estimators
    else:
        return res