def test_describe(self):
    for c in self.cl:
        desc = c.describe()
        assert types.is_string(desc) or types.is_list_of_string(desc)

def test_describe(self):
    desc = self.tica_obj.describe()
    assert types.is_string(desc) or types.is_list_of_string(desc)
    # describe on empty estimator
    tica(lag=1).describe()

def _parse_groupwise_input(group_definitions, group_pairs, MDlogger, mname=''):
    r"""For input of group type (add_group_mindist), prepare the array of pairs of indices
    and groups so that :py:func:`MinDistanceFeature` can work.

    This function will:
        - check the input types
        - sort the 1D arrays of each entry of group_definitions
        - check for duplicates within each group_definition
        - produce the list of pairs for all needed distances
        - produce a list that maps each entry in the pairlist to a given group of distances

    Returns
    -------
    parsed_group_definitions : list
        List of 1D arrays containing sorted, unique atom indices

    parsed_group_pairs : numpy.ndarray
        (N,2)-numpy array containing pairs of indices that represent pairs of groups for
        which the inter-group distance-pairs will be generated

    distance_pairs : numpy.ndarray
        (M,2)-numpy array with all the distance-pairs needed (regardless of their group)

    group_membership : numpy.ndarray
        (N,2)-numpy array mapping each pair in distance_pairs to their associated group pair
    """
    assert isinstance(group_definitions, list), \
        "group_definitions has to be of type list, not %s" % type(group_definitions)

    # Handle the special case of just one group
    if len(group_definitions) == 1:
        group_pairs = np.array([0, 0], ndmin=2)

    # Sort the elements within each group
    parsed_group_definitions = []
    for igroup in group_definitions:
        assert np.ndim(igroup) == 1, \
            "The elements of the groups definition have to be of dim 1, not %u" % np.ndim(igroup)
        parsed_group_definitions.append(np.unique(igroup))

    # Check for group duplicates
    for ii, igroup in enumerate(parsed_group_definitions[:-1]):
        for jj, jgroup in enumerate(parsed_group_definitions[ii + 1:]):
            if len(igroup) == len(jgroup):
                assert not np.allclose(igroup, jgroup), \
                    "Some group definitions appear to be duplicated, e.g. %u and %u" % (ii, ii + jj + 1)

    # Create and/or check the pair-list
    if is_string(group_pairs):
        if group_pairs == 'all':
            parsed_group_pairs = combinations(np.arange(len(group_definitions)), 2)
    else:
        assert isinstance(group_pairs, np.ndarray)
        assert group_pairs.shape[1] == 2
        assert group_pairs.max() < len(parsed_group_definitions), \
            "Cannot ask for group nr. %u if group_definitions only contains %u groups" \
            % (group_pairs.max(), len(parsed_group_definitions))
        assert group_pairs.min() >= 0, "Group pairs contains negative group indices"

        parsed_group_pairs = np.zeros_like(group_pairs, dtype='int')
        for ii, ipair in enumerate(group_pairs):
            if ipair[0] == ipair[1]:
                MDlogger.warning("%s will compute the mindist of group %u with itself. Is this wanted? "
                                 % (mname, ipair[0]))
            parsed_group_pairs[ii, :] = np.sort(ipair)

    # Create the large list of distances that will be computed, and an array containing group identifiers
    # of the distances that actually characterize a pair of groups
    distance_pairs = []
    group_membership = np.zeros_like(parsed_group_pairs)
    b = 0
    for ii, pair in enumerate(parsed_group_pairs):
        if pair[0] != pair[1]:
            distance_pairs.append(product(parsed_group_definitions[pair[0]],
                                          parsed_group_definitions[pair[1]]))
        else:
            parsed = parsed_group_definitions[pair[0]]
            distance_pairs.append(combinations(parsed, 2))

        group_membership[ii, :] = [b, b + len(distance_pairs[ii])]
        b += len(distance_pairs[ii])

    return parsed_group_definitions, parsed_group_pairs, np.vstack(distance_pairs), group_membership

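# Illustrative sketch (not part of the original module): shows, with plain numpy/itertools,
# the kind of pair expansion _parse_groupwise_input performs -- a cartesian product between
# two different groups, and within-group combinations for a group paired with itself.
# The atom indices below are made up for demonstration only.
def _example_group_pair_expansion():
    import numpy as np
    from itertools import combinations, product

    groups = [np.array([0, 1, 2]), np.array([10, 11])]
    # inter-group distance pairs between group 0 and group 1 -> 3 * 2 = 6 pairs
    inter = np.array(list(product(groups[0], groups[1])))
    # intra-group distance pairs within group 0 (group paired with itself) -> C(3, 2) = 3 pairs
    intra = np.array(list(combinations(groups[0], 2)))
    return inter, intra  # shapes (6, 2) and (3, 2)
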
def test_describe(self):
    desc = self.pca_obj.describe()
    assert types.is_string(desc) or types.is_list_of_string(desc)

def add_residue_mindist(self, residue_pairs='all', scheme='closest-heavy',
                        ignore_nonprotein=True, threshold=None, periodic=True,
                        count_contacts=False):
    r"""
    Adds the minimum distance between residues to the feature list. See below how the
    minimum distance can be defined. If the topology generated out of :py:obj:`topfile`
    contains information on periodic boundary conditions, the minimum image convention
    will be used when computing distances.

    Parameters
    ----------
    residue_pairs : can be of two types:

        'all'
            Computes distances between all pairs of residues excluding first and
            second neighbors.

        ndarray((n, 2), dtype=int):
            n x 2 array with the pairs of residues for which distances will be computed.

    scheme : 'ca', 'closest', 'closest-heavy', default is 'closest-heavy'
        Within a residue, determines the sub-group of atoms that will be considered
        when computing distances.

    ignore_nonprotein : boolean, default True
        Ignore residues that are not of protein type (e.g. water molecules,
        post-translational modifications etc.).

    threshold : float, optional, default is None
        Distances below this threshold (in nm) will result in a feature 1.0, distances
        above will result in 0.0. If left to None, the numerical value will be returned.

    periodic : bool, optional, default = True
        If `periodic` is True and the trajectory contains unitcell information, distances
        that cross periodic images will be computed using the minimum image convention.

    count_contacts : bool, optional, default = False
        If set to True, this feature will return the number of formed contacts (and not
        feature values of either 1.0 or 0.0). The output of this feature will be of shape
        (Nt, 1), and not (Nt, nr_of_contacts). Requires threshold to be set.

    .. note::
        Using :py:obj:`scheme` = 'closest' or 'closest-heavy' with
        :py:obj:`residue_pairs` = 'all' will compute nearly all interatomic distances,
        for every frame, before extracting the closest pairs. This can be very time
        consuming. Those schemes are intended to be used with a subset of residues
        chosen via :py:obj:`residue_pairs`.

    """
    from .distances import ResidueMinDistanceFeature
    if scheme != 'ca' and is_string(residue_pairs):
        if residue_pairs == 'all':
            self.logger.warning("Using all residue pairs with schemes like closest or closest-heavy is "
                                "very time consuming. Consider reducing the residue pairs")

    f = ResidueMinDistanceFeature(self.topology, residue_pairs, scheme, ignore_nonprotein,
                                  threshold, periodic, count_contacts=count_contacts)
    self.__add_feature(f)

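# Illustrative usage sketch (assumptions: `feat` is a featurizer instance exposing the method
# above, e.g. built from a topology file via a pyEMMA-style pyerna.coordinates.featurizer();
# the residue ranges and the 0.5 nm cutoff are hypothetical).
def _example_add_residue_mindist(feat):
    import numpy as np
    # a reduced subset of residue pairs: residues 0-10 against residues 20-30
    pairs = np.array([(i, j) for i in range(0, 11) for j in range(20, 31)], dtype=int)
    # binary contacts: 1.0 if the closest heavy-atom distance is below 0.5 nm, else 0.0
    feat.add_residue_mindist(residue_pairs=pairs, scheme='closest-heavy', threshold=0.5)
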
def estimate_param_scan(estimator, X, param_sets, evaluate=None, evaluate_args=None,
                        failfast=True, return_estimators=False, n_jobs=1,
                        progress_reporter=None, show_progress=True, return_exceptions=False):
    """ Runs multiple estimations using a list of parameter settings.

    Parameters
    ----------
    estimator : Estimator object or class
        An estimator object that provides an estimate(X, **params) function.
        If only a class is provided here, the Estimator objects will be constructed with
        default parameter settings, and the parameter settings from param_sets for each
        estimation. If you want to specify other parameter settings for those parameters
        not specified in param_sets, construct an Estimator before and pass the object.

    param_sets : iterable over dictionaries
        An iterable that provides parameter settings. Each element defines a parameter set,
        for which an estimation will be run using these parameters in estimate(X, **params).
        All other parameter settings will be taken from the default settings in the
        estimator object.

    evaluate : str or list of str, optional
        The given methods or properties will be called on the estimated models, and their
        results will be returned instead of the full models. This may be useful for
        reducing memory overhead.

    evaluate_args : iterable of iterable, optional
        Arguments to be passed to the evaluated methods. Note that its size has to match
        the size of evaluate.

    failfast : bool
        If True, will raise an exception when an estimation fails with an exception or
        when trying to call a method that doesn't exist. If False, will simply return
        None in these cases.

    return_estimators : bool
        If True, return a list of estimators in addition to the models.

    show_progress : bool
        If the given estimator supports the show_progress interface, we set the flag
        prior to running the estimations.

    return_exceptions : bool, default=False
        If failfast is False while this setting is True, returns the exception thrown
        at the actual grid element, instead of None.

    Returns
    -------
    models : list of model objects or evaluated function values
        A list of estimated models in the same order as param_sets. If evaluate is given,
        each element will contain the results from these method evaluations.

    estimators (optional) : list of estimator objects. These are returned only if
        return_estimators=True

    Examples
    --------

    Estimate a maximum likelihood Markov model at lag times 1, 2, 3.

    >>> from pyerna.msm.estimators import MaximumLikelihoodMSM, BayesianMSM
    >>>
    >>> dtraj = [0,0,1,2,1,0,1,0,1,2,2,0,0,0,1,1,2,1,0,0,1,2,1,0,0,0,1,1,0,1,2]  # mini-trajectory
    >>> param_sets=param_grid({'lag': [1,2,3]})
    >>>
    >>> estimate_param_scan(MaximumLikelihoodMSM, dtraj, param_sets, evaluate='timescales')
    [array([ 1.24113168,  0.77454377]), array([ 2.65266698,  1.42909842]), array([ 5.34810405,  1.14784446])]

    Now we also want to get samples of the timescales.

    >>> estimate_param_scan(MaximumLikelihoodMSM, dtraj, param_sets, failfast=False,
    ...                     evaluate=['timescales', 'timescales_samples']) # doctest: +SKIP
    [[array([ 1.24113168,  0.77454377]), None], [array([ 2.48226337,  1.54908754]), None], [array([ 3.72339505,  2.32363131]), None]]

    We get Nones because the MaximumLikelihoodMSM estimator doesn't provide timescales_samples.
    Use, for example, a Bayesian estimator for that.

    So let us get samples of the timescales using the BayesianMSM:

    >>> estimate_param_scan(BayesianMSM, dtraj, param_sets, show_progress=False,
    ...                     evaluate=['timescales', 'sample_f'], evaluate_args=((), ('timescales', ))) # doctest: +SKIP
    [[array([ 1.24357685,  0.77609028]), [array([ 1.5963252 ,  0.73877883]), array([ 1.29915847,  0.49004912]), array([ 0.90058583,  0.73841786]), ... ]]

    """
    # make sure we have an estimator object
    estimator = get_estimator(estimator)
    if hasattr(estimator, 'show_progress'):
        estimator.show_progress = show_progress
    if n_jobs is None:
        from pyerna._base.parallel import get_n_jobs
        n_jobs = get_n_jobs(logger=getattr(estimator, 'logger', None))

    # if we want to return estimators, make clones. Otherwise just copy references.
    # For parallel processing we always need clones.
    # Also if the Estimator is its own Model, we have to clone.
    from pyerna._base.model import Model
    if (return_estimators or n_jobs > 1 or n_jobs is None
            or isinstance(estimator, Model)):
        estimators = [clone_estimator(estimator) for _ in param_sets]
    else:
        estimators = [estimator for _ in param_sets]

    # only show progress of parameter study.
    if hasattr(estimators[0], 'show_progress'):
        for e in estimators:
            e.show_progress = False

    # if we evaluate, make sure we have a list of functions to evaluate
    if _types.is_string(evaluate):
        evaluate = [evaluate]
    if _types.is_string(evaluate_args):
        evaluate_args = [evaluate_args]

    if evaluate is not None and evaluate_args is not None and len(evaluate) != len(evaluate_args):
        raise ValueError("length mismatch: evaluate ({}) and evaluate_args ({})".format(
            len(evaluate), len(evaluate_args)))

    logger_available = hasattr(estimators[0], 'logger')
    if logger_available:
        logger = estimators[0].logger
    if progress_reporter is None:
        from unittest.mock import MagicMock
        ctx = progress_reporter = MagicMock()
        callback = None
    else:
        ctx = progress_reporter._progress_context('param-scan')
        callback = lambda _: progress_reporter._progress_update(1, stage='param-scan')
    progress_reporter._progress_register(len(estimators), stage='param-scan',
                                         description="estimating %s" % str(estimator.__class__.__name__))

    # TODO: test on win, osx
    if n_jobs > 1 and os.name == 'posix':
        if logger_available:
            logger.debug('estimating %s with n_jobs=%s', estimator, n_jobs)
        # iterate over parameter settings
        task_iter = ((estimator, param_set, X, evaluate, evaluate_args, failfast, return_exceptions)
                     for estimator, param_set in zip(estimators, param_sets))

        from pathos.multiprocessing import Pool
        pool = Pool(processes=n_jobs)
        args = list(task_iter)

        from contextlib import closing

        def error_callback(*args, **kw):
            if failfast:
                # TODO: can we be specific here? eg. obtain the stack of the actual process or is this the master proc?
                raise Exception('something failed')

        with closing(pool), ctx:
            res_async = [pool.apply_async(_estimate_param_scan_worker, a, callback=callback,
                                          error_callback=error_callback) for a in args]
            res = [x.get() for x in res_async]

    # if n_jobs=1 don't invoke the pool, but directly dispatch the iterator
    else:
        if logger_available:
            logger.debug('estimating %s with n_jobs=1 because of the setting or '
                         'because you do not have a POSIX system', estimator)
        res = []
        with ctx:
            for estimator, param_set in zip(estimators, param_sets):
                res.append(_estimate_param_scan_worker(estimator, param_set, X, evaluate,
                                                       evaluate_args, failfast, return_exceptions))
                if progress_reporter is not None:
                    progress_reporter._progress_update(1, stage='param-scan')

    # done
    if return_estimators:
        return res, estimators
    else:
        return res

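# Illustrative sketch (assumptions: `param_grid` lives in this module as used in the docstring
# above, the pyerna MSM estimators behave as documented there, and `dtrajs` is the user's
# discrete trajectory data).
def _example_estimate_param_scan(dtrajs):
    from pyerna.msm.estimators import MaximumLikelihoodMSM
    param_sets = list(param_grid({'lag': [1, 2, 3]}))
    # collect both the evaluated timescales and the fitted estimator objects
    timescales, estimators = estimate_param_scan(MaximumLikelihoodMSM, dtrajs, param_sets,
                                                 evaluate='timescales',
                                                 return_estimators=True, n_jobs=1)
    return timescales, estimators
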
def _estimate_param_scan_worker(estimator, params, X, evaluate, evaluate_args,
                                failfast, return_exceptions):
    """ Method that runs estimation for several parameter settings.

    Defined as a worker for parallelization.

    """
    # run estimation
    model = None
    try:  # catch any exception
        estimator.estimate(X, **params)
        model = estimator.model
    except KeyboardInterrupt:
        # we want to be able to interactively interrupt the worker, regardless of failfast.
        raise
    except:
        e = sys.exc_info()[1]
        if isinstance(estimator, Loggable):
            estimator.logger.warning("Ignored error during estimation: %s" % e)
        if failfast:
            raise  # re-raise
        elif return_exceptions:
            model = e
        else:
            pass  # just return model=None

    # deal with results
    res = []

    if evaluate is None:  # we want full models
        res.append(model)
    # we want to evaluate function(s) of the model
    elif _types.is_iterable(evaluate):
        values = []  # the function values of the model
        for ieval, name in enumerate(evaluate):
            # get the arguments to be passed for this method/attribute
            args = ()
            if evaluate_args is not None:
                args = evaluate_args[ieval]
                # wrap single arguments in an iterable again to pass them.
                if _types.is_string(args):
                    args = (args, )
            # evaluate
            try:
                # try calling method/property/attribute
                value = _call_member(estimator.model, name, failfast, *args)
            # couldn't find method/property/attribute
            except AttributeError as e:
                if failfast:
                    raise e  # raise an AttributeError
                else:
                    value = None  # we just ignore it and return None
            values.append(value)
        # if we only have one value, unpack it
        if len(values) == 1:
            values = values[0]
        res.append(values)
    else:
        raise ValueError('Invalid setting for evaluate: ' + str(evaluate))

    if len(res) == 1:
        res = res[0]
    return res

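# Illustrative, self-contained sketch (hypothetical _ToyModel, not part of the library):
# mirrors how the worker resolves each entry of `evaluate` against the estimated model --
# plain attributes and properties are read directly, methods are called with the matching
# entry of `evaluate_args`.
def _example_evaluate_resolution():
    class _ToyModel:
        lag = 5                      # plain attribute

        @property
        def timescales(self):        # property, read without calling
            return [3.2, 1.1]

        def sample_f(self, name):    # method, called with args from evaluate_args
            return 'sampled %s' % name

    model = _ToyModel()
    results = []
    for name, args in zip(['lag', 'timescales', 'sample_f'], [(), (), ('timescales',)]):
        attr = getattr(model, name)
        results.append(attr(*args) if callable(attr) else attr)
    return results  # [5, [3.2, 1.1], 'sampled timescales']
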
def test_describe(self):
    c = self.ass
    desc = c.describe()
    assert types.is_string(desc) or types.is_list_of_string(desc)