示例#1
0
    def handle_model(self):
        """Make sure ``self.model`` holds a model and set ``self.mol``.

        Three cases are handled, in this order:

        * ``self.model`` is falsy: the parameters are written with
          ``self.write_parameters()`` and a new model is created with
          ``analyze_folder`` using the attributes of this instance.
        * ``self.model`` is a string: it is passed to ``Model.load`` and
          replaced by the loaded ``Model``. If loading fails, a message and the
          error are printed and the method returns early without setting
          ``self.mol``.
        * ``self.model`` is already a ``Model``: it is used as is.

        Finally ``self.mol`` is set to a ``Molecule`` built from the molfile of
        the first entry of ``self.model.data.simlist``.
        """
        from htmd.model import Model
        from htmd.molecule.molecule import Molecule

        if not self.model:
            from IDP_htmd.IDP_analysis import analyze_folder
            print("Creating new analysis")
            self.write_parameters()
            self.model = analyze_folder(self.input_folder, self.out_folder, self.skip, self.metrics, self.cluster,
                self.tica, self.ticadim, self.ticalag, self.modellag, self.modelunits, self.macronum, self.bulk_split,
                self.fes, self.rg_analysis, self.save_model, self.data_fstep)

        if isinstance(self.model, str):
            try:
                print("Loading model")
                model = Model()
                model.load(self.model)
                self.model = model
            except Exception as err:
                # Unlike a bare ``except:``, this lets KeyboardInterrupt and
                # SystemExit propagate, and the reason of the failure is shown.
                print("Could not load the model")
                print(err)
                return

        if isinstance(self.model, Model):
            print("Model loaded")

        self.mol = Molecule(self.model.data.simlist[0].molfile)
示例#2
0
    def _algorithm(self):
        """Build a Markov model from the current simulations and write new inputs.

        The simulations found under ``self.datapath`` are optionally filtered,
        projected with ``self.projection``, optionally reduced with TICA,
        clustered and used to build ``self._model``. The frames returned by
        ``self._getSpawnFrames`` are then handed to ``self._writeInputs``.
        """
        logger.info('Postprocessing new data')
        traj_dirs = glob(path.join(self.datapath, '*', ''))
        structures = glob(path.join(self.inputpath, '*', 'structure.pdb'))
        input_dirs = glob(path.join(self.inputpath, '*', ''))
        sims = simlist(traj_dirs, structures, input_dirs)
        if self.filter:
            sims = simfilter(sims, self.filteredpath, filtersel=self.filtersel)

        metr = Metric(sims, skip=self.skip)
        metr.set(self.projection)

        # NOTE: contact symmetry (self.contactsym) is currently not applied here.

        if self.ticadim > 0:
            # TICA is given the already projected data; handing it the Metric
            # object itself was reported to be too slow.
            projected = metr.project()
            tica_lag = int(max(2, np.ceil(self.ticalag)))
            datadr = TICA(projected, tica_lag).project(self.ticadim)
        else:
            datadr = metr.project()

        # Preferably this should happen before any projection, since corrupted
        # simulations can affect TICA.
        datadr.dropTraj()

        n_clusters = self._numClusters(datadr.numFrames)
        datadr.cluster(self.clustmethod(n_clusters=n_clusters))

        self._model = Model(datadr)
        self._model.markovModel(self.lag, self._numMacrostates(datadr))
        if self.save:
            model_file = 'adapt_model_e' + str(self._getEpoch()) + '.dat'
            self._model.save(model_file)

        spawn_frames = self._getSpawnFrames(self._model, datadr)
        self._writeInputs(datadr.rel2sim(np.concatenate(spawn_frames)))
示例#3
0
 def _createMSM(self, data):
     """Cluster ``data``, build ``self._model`` from it and optionally save it.

     The number of clusters comes from ``self._numClusters`` and the number
     of macrostates from ``self._numMacrostates``. When ``self.save`` is set,
     the model is written to ``saveddata/e<epoch>_adapt_model.dat``.
     """
     data.cluster(self.clustmethod(n_clusters=self._numClusters(data.numFrames)))
     self._model = Model(data)
     self._model.markovModel(self.lag, self._numMacrostates(data))
     if self.save:
         # exist_ok avoids the race between checking for the directory and
         # creating it (e.g. when another process creates it in between).
         makedirs('saveddata', exist_ok=True)
         self._model.save(path.join('saveddata', 'e{}_adapt_model.dat'.format(self._getEpoch())))
示例#4
0
    def _algorithm(self):
        """Build a Markov model from all simulations and write inputs for new ones.

        The simulations are filtered, projected with a distance metric,
        optionally reduced with TICA, clustered and used to build a Markov
        model. States are then chosen with ``self._criteria`` and
        ``self._spawn``, frames are sampled from them and passed to
        ``self._writeInputs``.
        """
        logger.info('Postprocessing new data')
        datalist = simlist(glob(path.join(self.datapath, '*', '')), glob(path.join(self.inputpath, '*', 'structure.pdb')),
                           glob(path.join(self.inputpath, '*', '')))
        filtlist = simfilter(datalist, self.filteredpath, filtersel=self.filtersel)

        # Use a two-selection distance when a second selection is configured,
        # otherwise a self-distance on the first selection.
        if hasattr(self, 'metricsel2') and self.metricsel2 is not None:
            proj = MetricDistance(self.metricsel1, self.metricsel2, metric=self.metrictype)
        else:
            proj = MetricSelfDistance(self.metricsel1, metric=self.metrictype)
        metr = Metric(filtlist, skip=self.skip)
        metr.projection(proj)
        data = metr.project()

        #if self.contactsym is not None:
        #    contactSymmetry(data, self.contactsym)

        data.dropTraj()
        if self.ticadim > 0:
            # TICA lag of 20 frames rescaled by the frame skip, never below 2
            tica = TICA(data, int(max(2, np.ceil(20/self.skip))))
            datadr = tica.project(self.ticadim)
        else:
            datadr = data

        K = int(max(np.round(0.6 * np.log10(datadr.numFrames/1000)*1000+50), 100))  # heuristic
        if K > datadr.numFrames / 3:  # cap the cluster count in low-data regimes
            K = int(datadr.numFrames / 3)

        datadr.cluster(self.clustmethod(n_clusters=K), mergesmall=5)
        replacement = False
        if datadr.K < 10:
            # Too few clusters were left: cluster again without mergesmall and
            # ask sampleStates to sample with replacement.
            datadr.cluster(self.clustmethod(n_clusters=K))
            replacement = True

        model = Model(datadr)
        macronum = self.macronum
        if datadr.K < macronum:
            # int() because np.ceil returns a float, which is not a valid
            # count for ``nits`` below and would be logged as e.g. "4.0".
            macronum = int(np.ceil(datadr.K / 2))
            logger.warning('Using less macrostates than requested due to lack of microstates. macronum = ' + str(macronum))

        # Limit the macrostates to the number of timescales above the lag time
        from pyemma.msm import timescales_msm
        timesc = timescales_msm(datadr.St.tolist(), lags=self.lag, nits=macronum).get_timescales()
        macronum = min(self.macronum, max(np.sum(timesc > self.lag), 2))

        model.markovModel(self.lag, macronum)
        p_i = self._criteria(model, self.method)
        (spawncounts, prob) = self._spawn(p_i, self.nmax-self.running)
        logger.debug('spawncounts {}'.format(spawncounts))
        stateIdx = np.where(spawncounts > 0)[0]
        _, relFrames = model.sampleStates(stateIdx, spawncounts[stateIdx], statetype='micro', replacement=replacement)
        logger.debug('relFrames {}'.format(relFrames))

        self._writeInputs(datadr.rel2sim(np.concatenate(relFrames)))
示例#5
0
 def _createMSM(self, data):
     """Cluster ``data``, build ``self._model`` and optionally save it.

     Clustering is repeated for as long as it raises ``IndexError``. When
     ``self.save`` is set, the model is written to
     ``saveddata/e<epoch>_adapt_model.dat``.
     """
     from htmd.model import Model

     # Keep retrying until one clustering attempt finishes without IndexError
     while True:
         try:
             data.cluster(self.clustmethod(n_clusters=self._numClusters(data.numFrames)))
         except IndexError:
             continue
         break

     self._model = Model(data)
     self._model.markovModel(self.lag, self._numMacrostates(data))
     if not self.save:
         return

     makedirs('saveddata', exist_ok=True)
     model_file = path.join('saveddata', 'e{}_adapt_model.dat'.format(self._getEpoch()))
     self._model.save(model_file)
示例#6
0
 def _createMSM(self, data):
     """Cluster ``data`` and build the Markov model stored in ``self._model``.

     Cluster and macrostate counts are obtained from ``self._numClusters``
     and ``self._numMacrostates``. If ``self.save`` is set, the model is
     saved as ``saveddata/e<epoch>_adapt_model.dat``.
     """
     data.cluster(self.clustmethod(n_clusters=self._numClusters(data.numFrames)))
     self._model = Model(data)
     self._model.markovModel(self.lag, self._numMacrostates(data))
     if self.save:
         # Create the folder in one step; a separate path.exists() check can
         # race with another process creating the same folder.
         makedirs('saveddata', exist_ok=True)
         self._model.save(path.join('saveddata', 'e{}_adapt_model.dat'.format(self._getEpoch())))
示例#7
0
def scan_clusters(model, nclusters, out_dir):
    """Save an implied-timescales plot for several cluster counts.

    For every value in ``nclusters`` the data of ``model`` is clustered again
    with ``MiniBatchKMeans`` (``mergesmall=5``), a new ``Model`` is built from
    it and its timescales plot is saved, so that the effect of the number of
    clusters on the timescales can be assessed.

    Parameters
    ----------
    model : htmd.model.Model
        Model whose data is re-clustered
    nclusters : int[]
        Cluster counts to be tested
    out_dir : str
        Directory where the plots are saved, as ``1_its-<count>_clu``
    """
    from sklearn.cluster import MiniBatchKMeans

    for n_clu in nclusters:
        kmeans = MiniBatchKMeans(n_clusters=n_clu)
        model.data.cluster(kmeans, mergesmall=5)
        plot_file = f"{out_dir}/1_its-{n_clu}_clu"
        Model(model.data).plotTimescales(plot=False, save=plot_file)
示例#8
0
def bootstrap(model, rounds, fraction=0.8, clusters=500):
    """Yield one model per bootstrap round.

    In every round ``model.data.bootstrap(fraction)`` is called, the result is
    clustered with ``MiniBatchKMeans`` (``mergesmall=5``) and a new ``Model``
    built from it is yielded.

    Parameters
    ----------
    model : htmd.model.Model
        Model providing the data to bootstrap
    rounds : int
        Number of bootstrap rounds
    fraction : float
        Value passed to ``model.data.bootstrap``
    clusters : int
        Number of clusters requested from MiniBatchKMeans
    """
    from htmd.model import Model
    from sklearn.cluster import MiniBatchKMeans

    for round_idx in range(rounds):
        resampled = model.data.bootstrap(fraction)
        print(f"Starting a new round of bootstrap - {round_idx}")
        kmeans = MiniBatchKMeans(n_clusters=clusters)
        resampled.cluster(kmeans, mergesmall=5)
        yield Model(resampled)
示例#9
0
    def _algorithm(self):
        """Build a Markov model, score its states and write inputs for new runs.

        After projecting and clustering the simulations, a reward is computed
        per state as the scaled directed component plus ``self.ucscale`` times
        the scaled undirected component. The frames returned by
        ``self._getSpawnFrames`` for that reward are passed to
        ``self._writeInputs``.
        """
        logger.info('Postprocessing new data')
        traj_dirs = glob(path.join(self.datapath, '*', ''))
        structures = glob(path.join(self.inputpath, '*', 'structure.pdb'))
        input_dirs = glob(path.join(self.inputpath, '*', ''))
        sims = simlist(traj_dirs, structures, input_dirs)
        if self.filter:
            sims = simfilter(sims, self.filteredpath, filtersel=self.filtersel)

        metr = Metric(sims, skip=self.skip)
        metr.set(self.projection)

        # NOTE: contact symmetry (self.contactsym) is currently not applied here.

        if self.ticadim > 0:
            # TICA is given the already projected data; handing it the Metric
            # object itself was reported to be too slow.
            projected = metr.project()
            tica_lag = int(max(2, np.ceil(self.ticalag)))
            datadr = TICA(projected, tica_lag).project(self.ticadim)
        else:
            datadr = metr.project()

        # Preferably this should happen before any projection, since corrupted
        # simulations can affect TICA.
        datadr.dropTraj()
        n_clusters = self._numClusters(datadr.numFrames)
        datadr.cluster(self.clustmethod(n_clusters=n_clusters))

        model = Model(datadr)
        self._model = model
        self._model.markovModel(self.lag, self._numMacrostates(datadr))
        if self.save:
            model_file = 'adapt_model_e' + str(self._getEpoch()) + '.dat'
            self._model.save(model_file)

        def _per_state(values):
            # Re-index per-cluster values according to self.statetype
            if self.statetype == 'micro':
                return values[model.cluster_ofmicro]
            if self.statetype == 'macro':
                return macroAccumulate(model, values[model.cluster_ofmicro])
            return values

        # Undirected component: lower counts should score higher, hence the minus
        uc = _per_state(-model.data.N)

        # Directed component
        dc = _per_state(
            self._calculateDirectedComponent(sims, model.data.St, model.data.N))

        uc = self._featScale(uc)
        dc = self._featScale(dc)
        reward = dc + self.ucscale * uc

        spawn_frames = self._getSpawnFrames(reward, self._model, datadr)
        self._writeInputs(datadr.rel2sim(np.concatenate(spawn_frames)))
示例#10
0
def viewModel(model_name):
    """Load a saved model from ``model_name`` and display its states.

    If reading ``model.macronum`` fails, a Markov model is built first with a
    lag of 20, 5 macrostates and ``units="ns"``.

    Parameters
    ----------
    model_name : str
        File passed to ``Model(file=...)``
    """
    model = Model(file=model_name)
    try:
        model.macronum
    except Exception:
        # Presumably no Markov model has been built for this file yet.
        # ``Exception`` (not a bare except) keeps Ctrl-C working.
        model.markovModel(20, 5, units="ns")

    model.viewStates(alignsel="noh and resname MOL",
                     protein=True,
                     ligand="protein and backbone")
示例#11
0
class AdaptiveMD(AdaptiveBase):
    """ Adaptive class which uses a Markov state model for respawning

    AdaptiveMD uses Markov state models to choose respawning poses for the next epochs. In more detail, it projects all
    currently retrieved simulations according to the specified projection, clusters those and then builds a Markov model using
    the discretized trajectories. From the Markov model it then chooses conformations from the various states based on
    the chosen criteria which will be used for starting new simulations.

    Parameters
    ----------
    app : :class:`SimQueue <htmd.queues.simqueue.SimQueue>` object, default=None
        A SimQueue class object used to retrieve and submit simulations
    project : str, default='adaptive'
        The name of the project
    nmin : int, default=1
        Minimum number of running simulations
    nmax : int, default=1
        Maximum number of running simulations
    nepochs : int, default=1000
        Stop adaptive once we have reached this number of epochs
    nframes : int, default=0
        Stop adaptive once we have simulated this number of aggregate simulation frames.
    inputpath : str, default='input'
        The directory used to store input folders
    generatorspath : str, default='generators'
        The directory containing the generators
    dryrun : boolean, default=False
        A dry run means that the adaptive will retrieve and generate a new epoch but not submit the simulations
    updateperiod : float, default=0
        When set to a value other than 0, the adaptive will run synchronously every `updateperiod` seconds
    coorname : str, default='input.coor'
        Name of the file containing the starting coordinates for the new simulations
    lock : bool, default=False
        Lock the folder while adaptive is ongoing
    datapath : str, default='data'
        The directory in which the completed simulations are stored
    filter : bool, default=True
        Enable or disable filtering of trajectories.
    filtersel : str, default='not water'
        Atom selection string for filtering.
        See more `here <http://www.ks.uiuc.edu/Research/vmd/vmd-1.9.2/ug/node89.html>`__
    filteredpath : str, default='filtered'
        The directory in which the filtered simulations will be stored
    projection : :class:`Projection <moleculekit.projections.projection.Projection>` object, default=None
        A Projection class object or a list of objects which will be used to project the simulation data before constructing a Markov model
    truncation : str, default=None
        Method for truncating the prob distribution (None, 'cumsum', 'statecut')
    statetype : ('micro', 'cluster', 'macro'), str, default='micro'
        What states (cluster, micro, macro) to use for calculations.
    macronum : int, default=8
        The number of macrostates to produce
    skip : int, default=1
        Allows skipping of simulation frames to reduce data. i.e. skip=3 will only keep every third frame
    lag : int, default=1
        The lagtime used to create the Markov model
    clustmethod : :class:`ClusterMixin <sklearn.base.ClusterMixin>` class, default=<class 'htmd.clustering.kcenters.KCenter'>
        Clustering algorithm used to cluster the contacts or distances
    method : str, default='1/Mc'
        Criteria used for choosing from which state to respawn from
    ticalag : int, default=20
        Lagtime to use for TICA in frames. When using `skip` remember to change this accordingly.
    ticadim : int, default=3
        Number of TICA dimensions to use. When set to 0 it disables TICA
    contactsym : str, default=None
        Contact symmetry
    save : bool, default=False
        Save the model generated

    Example
    -------
    >>> adapt = AdaptiveMD()
    >>> adapt.nmin = 2
    >>> adapt.nmax = 3
    >>> adapt.nepochs = 2
    >>> adapt.ticadim = 3
    >>> adapt.projection = [MetricDistance('name CA', 'name N'), MetricDihedral()]
    >>> adapt.generatorspath = htmd.home()+'/data/dhfr'
    >>> adapt.app = AcemdLocal()
    >>> adapt.run()
    """
    def __init__(self):
        """Register the options of this class on top of those of the base class."""
        from sklearn.base import ClusterMixin
        from htmd.clustering.kcenters import KCenter
        from moleculekit.projections.projection import Projection
        super().__init__()
        self._arg(
            'datapath', 'str',
            'The directory in which the completed simulations are stored',
            'data', val.String())
        self._arg('filter', 'bool',
                  'Enable or disable filtering of trajectories.', True,
                  val.Boolean())
        self._arg('filtersel', 'str', 'Filtering atom selection', 'not water',
                  val.String())
        self._arg(
            'filteredpath', 'str',
            'The directory in which the filtered simulations will be stored',
            'filtered', val.String())
        self._arg(
            'projection',
            ':class:`Projection <moleculekit.projections.projection.Projection>` object',
            'A Projection class object or a list of objects which will be used to project the simulation '
            'data before constructing a Markov model',
            None,
            val.Object(Projection),
            nargs='+')
        self._arg(
            'truncation', 'str',
            'Method for truncating the prob distribution (None, \'cumsum\', \'statecut\'',
            None, val.String())
        self._arg(
            'statetype',
            'str',
            'What states (cluster, micro, macro) to use for calculations.',
            'micro',
            val.String(),
            valid_values=('micro', 'cluster', 'macro'))
        self._arg('macronum', 'int', 'The number of macrostates to produce', 8,
                  val.Number(int, 'POS'))
        self._arg(
            'skip', 'int',
            'Allows skipping of simulation frames to reduce data. i.e. skip=3 will only keep every third frame',
            1, val.Number(int, 'POS'))
        self._arg('lag', 'int', 'The lagtime used to create the Markov model',
                  1, val.Number(int, 'POS'))
        self._arg(
            'clustmethod',
            ':class:`ClusterMixin <sklearn.base.ClusterMixin>` class',
            'Clustering algorithm used to cluster the contacts or distances',
            KCenter, val.Class(ClusterMixin))
        self._arg(
            'method', 'str',
            'Criteria used for choosing from which state to respawn from',
            '1/Mc', val.String())
        self._arg(
            'ticalag', 'int',
            'Lagtime to use for TICA in frames. When using `skip` remember to change this accordinly.',
            20, val.Number(int, '0POS'))
        self._arg(
            'ticadim', 'int',
            'Number of TICA dimensions to use. When set to 0 it disables TICA',
            3, val.Number(int, '0POS'))
        self._arg('contactsym', 'str', 'Contact symmetry', None, val.String())
        self._arg('save', 'bool', 'Save the model generated', False,
                  val.Boolean())

    def _algorithm(self):
        """Run one adaptive step.

        Returns
        -------
        bool
            False when the frame limit has been reached (nothing is written),
            True after the inputs for the new simulations have been written.
        """
        data = self._getData(self._getSimlist())
        if not self._checkNFrames(data):
            return False
        self._createMSM(data)

        # Number of simulations that can still be started
        N = self.nmax - self._running
        reward = self._criteria(self._model, self.method)
        reward = self._truncate(reward, N)
        relFrames, _, _ = self._getSpawnFrames(reward, self._model,
                                               self._model.data, N)
        self._writeInputs(self._model.data.rel2sim(np.concatenate(relFrames)))
        return True

    def _checkNFrames(self, data):
        """Return False once ``data.numFrames`` reaches ``self.nframes`` (0 disables the limit)."""
        if self.nframes != 0 and data.numFrames >= self.nframes:
            logger.info('Reached maximum number of frames. Stopping adaptive.')
            return False
        return True

    def _getSimlist(self):
        """Create the simulation list from the data and input folders, filtered if ``self.filter`` is set."""
        logger.info('Postprocessing new data')
        sims = simlist(glob(path.join(self.datapath, '*', '')),
                       glob(path.join(self.inputpath, '*', '')),
                       glob(path.join(self.inputpath, '*', '')))
        if self.filter:
            sims = simfilter(sims, self.filteredpath, filtersel=self.filtersel)
        return sims

    def _getData(self, sims):
        """Project ``sims`` with ``self.projection`` and, if ``self.ticadim > 0``, reduce with TICA."""
        metr = Metric(sims, skip=self.skip)
        metr.set(self.projection)

        # if self.contactsym is not None:
        #    contactSymmetry(data, self.contactsym)

        if self.ticadim > 0:
            # tica = TICA(metr, int(max(2, np.ceil(self.ticalag))))  # gianni: without project it was tooooo slow
            data = metr.project()
            data.dropTraj()  # Drop before TICA to avoid broken trajectories
            ticalag = int(
                np.ceil(max(2,
                            min(np.min(data.trajLengths) / 2,
                                self.ticalag))))  # 1 < ticalag < (trajLen / 2)
            tica = TICA(data, ticalag)
            datadr = tica.project(self.ticadim)
        else:
            datadr = metr.project()
        datadr.dropTraj(
        )  # Preferably we should do this before any projections. Corrupted sims can affect TICA
        return datadr

    def _createMSM(self, data):
        """Cluster ``data``, build ``self._model`` and save it when ``self.save`` is set."""
        data.cluster(
            self.clustmethod(n_clusters=self._numClusters(data.numFrames)))
        self._model = Model(data)
        self._model.markovModel(self.lag, self._numMacrostates(data))
        if self.save:
            # exist_ok avoids the race between checking for the folder and creating it
            makedirs('saveddata', exist_ok=True)
            self._model.save(
                path.join('saveddata',
                          'e{}_adapt_model.dat'.format(self._getEpoch())))

    def _getSpawnFrames(self, reward, model, data, N):
        """Draw ``N`` spawns from the states proportionally to ``reward``.

        Returns
        -------
        tuple
            ``(relFrames, spawncounts, prob)``: the frames sampled by
            ``model.sampleStates``, the number of spawns per state and the
            normalized reward used as sampling probability.
        """
        prob = reward / np.sum(reward)
        logger.debug('Sampling probabilities {}'.format(prob))
        spawncounts = np.random.multinomial(N, prob)
        logger.debug('spawncounts {}'.format(spawncounts))

        stateIdx = np.where(spawncounts > 0)[0]
        _, relFrames = model.sampleStates(stateIdx,
                                          spawncounts[stateIdx],
                                          statetype='micro',
                                          replacement=True)
        logger.debug('relFrames {}'.format(relFrames))
        return relFrames, spawncounts, prob

    def _criteria(self, model, criteria):
        """Compute the per-microstate reward for the given criteria.

        Parameters
        ----------
        model : Model
            Markov model the reward is computed from
        criteria : str
            Either '1/Mc' or 'pi/Mc'

        Raises
        ------
        ValueError
            If ``criteria`` is not one of the supported values.
        """
        if criteria not in ('1/Mc', 'pi/Mc'):
            # Previously this fell through to an UnboundLocalError on ``ret``
            raise ValueError(
                "Unknown criteria '{}'. Supported values are '1/Mc' and 'pi/Mc'.".format(criteria))

        nMicroPerMacro = macroAccumulate(model, np.ones(model.micronum))
        P_I = 1 / macroAccumulate(model,
                                  model.data.N[model.cluster_ofmicro])
        P_I = P_I / nMicroPerMacro
        ret = P_I[model.macro_ofmicro]
        if criteria == 'pi/Mc':
            ret = ret * model.msm.stationary_distribution
        return ret

    def _truncate(self, ranking, N):
        """Zero out low-ranked entries of ``ranking`` according to ``self.truncation``.

        ``ranking`` is modified in place and also returned. Nothing is changed
        when ``self.truncation`` is None or 'none' (any case).
        """
        if self.truncation is not None and self.truncation.lower() != 'none':
            if self.truncation == 'cumsum':
                idx = np.argsort(ranking)
                idx = idx[::-1]  # decreasing sort
                errs = ranking[idx]
                H = (N * errs / np.cumsum(errs)) < 1
                ranking[idx[H]] = 0
            if self.truncation == 'statecut':
                idx = np.argsort(ranking)
                idx = idx[::-1]  # decreasing sort
                ranking[idx[N:]] = 0  # Set all states ranked > N to zero.
        return ranking

    def _numClusters(self, numFrames):
        """ Heuristic that calculates number of clusters from number of frames """
        K = int(
            max(np.round(0.6 * np.log10(numFrames / 1000) * 1000 + 50),
                100))  # heuristic
        if K > numFrames / 3:  # Ugly patch for low-data regimes ...
            K = int(numFrames / 3)
        return K

    def _numMacrostates(self, data):
        """ Heuristic for calculating the number of macrostates for the Markov model """
        macronum = self.macronum
        if data.K < macronum:
            # int() because np.ceil returns a float, which is not a valid
            # count for ``nits`` below and would be logged as e.g. "4.0".
            macronum = int(np.ceil(data.K / 2))
            logger.warning(
                'Using less macrostates than requested due to lack of microstates. macronum = '
                + str(macronum))

        # Calculating how many timescales are above the lag time to limit number of macrostates
        from pyemma.msm import timescales_msm
        timesc = timescales_msm(data.St.tolist(), lags=self.lag,
                                nits=macronum).get_timescales()
        macronum = min(self.macronum, max(np.sum(timesc > self.lag), 2))
        return macronum
示例#12
0
    def _algorithm(self):
        """Build a Markov model from all simulations and write inputs for new ones.

        The simulations are filtered, projected with a distance metric,
        optionally reduced with TICA, clustered and used to build a Markov
        model. States are then chosen with ``self._criteria`` and
        ``self._spawn``, frames are sampled from them and passed to
        ``self._writeInputs``.
        """
        logger.info('Postprocessing new data')
        datalist = simlist(
            glob(path.join(self.datapath, '*', '')),
            glob(path.join(self.inputpath, '*', 'structure.pdb')),
            glob(path.join(self.inputpath, '*', '')))
        filtlist = simfilter(datalist,
                             self.filteredpath,
                             filtersel=self.filtersel)

        # Use a two-selection distance when a second selection is configured,
        # otherwise a self-distance on the first selection.
        if hasattr(self, 'metricsel2') and self.metricsel2 is not None:
            proj = MetricDistance(self.metricsel1,
                                  self.metricsel2,
                                  metric=self.metrictype)
        else:
            proj = MetricSelfDistance(self.metricsel1, metric=self.metrictype)
        metr = Metric(filtlist, skip=self.skip)
        metr.projection(proj)
        data = metr.project()

        #if self.contactsym is not None:
        #    contactSymmetry(data, self.contactsym)

        data.dropTraj()
        if self.ticadim > 0:
            # TICA lag of 20 frames rescaled by the frame skip, never below 2
            tica = TICA(data, int(max(2, np.ceil(20 / self.skip))))
            datadr = tica.project(self.ticadim)
        else:
            datadr = data

        K = int(
            max(np.round(0.6 * np.log10(datadr.numFrames / 1000) * 1000 + 50),
                100))  # heuristic
        if K > datadr.numFrames / 3:  # cap the cluster count in low-data regimes
            K = int(datadr.numFrames / 3)

        datadr.cluster(self.clustmethod(n_clusters=K), mergesmall=5)
        replacement = False
        if datadr.K < 10:
            # Too few clusters were left: cluster again without mergesmall and
            # ask sampleStates to sample with replacement.
            datadr.cluster(self.clustmethod(n_clusters=K))
            replacement = True

        model = Model(datadr)
        macronum = self.macronum
        if datadr.K < macronum:
            # int() because np.ceil returns a float, which is not a valid
            # count for ``nits`` below and would be logged as e.g. "4.0".
            macronum = int(np.ceil(datadr.K / 2))
            logger.warning(
                'Using less macrostates than requested due to lack of microstates. macronum = '
                + str(macronum))

        # Limit the macrostates to the number of timescales above the lag time
        from pyemma.msm import timescales_msm
        timesc = timescales_msm(datadr.St.tolist(),
                                lags=self.lag,
                                nits=macronum).get_timescales()
        macronum = min(self.macronum, max(np.sum(timesc > self.lag), 2))

        model.markovModel(self.lag, macronum)
        p_i = self._criteria(model, self.method)
        (spawncounts, prob) = self._spawn(p_i, self.nmax - self.running)
        logger.debug('spawncounts {}'.format(spawncounts))
        stateIdx = np.where(spawncounts > 0)[0]
        _, relFrames = model.sampleStates(stateIdx,
                                          spawncounts[stateIdx],
                                          statetype='micro',
                                          replacement=replacement)
        logger.debug('relFrames {}'.format(relFrames))

        self._writeInputs(datadr.rel2sim(np.concatenate(relFrames)))
示例#13
0
def analyze_folder(folder=None,
                   out_folder="/tmp",
                   skip=1,
                   metrics=None,
                   clu=500,
                   tica=True,
                   ticadim=5,
                   tica_lag=20,
                   model_lag=10,
                   model_units='ns',
                   macro_N=10,
                   bulk_split=False,
                   fes=True,
                   rg_analysis=True,
                   save=True,
                   data_fstep=None):
    """Analysis script to create a Markov State Model

    Creates and returns a Markov State Model given a data folder.
    Intended to follow up the evolution of an adaptive sampling run.
    Allows to save the model and several informative plots.

    Parameters
    ----------
    folder : str
        Data folder where adaptive is running. It is concatenated directly
        with ``'filtered/...'``, so it has to end with a path separator.
    out_folder : str
        Output folder to store derived data
    skip : int
        Number of frames to skip while projecting the MD data
    metrics : [:class: `Metric` object]
        Metric array used to project the data
    clu : int
        Number of clusters to create using the MiniBatchKMeans method.
    tica : bool
        Whether to use TICA (True) or GWPCA (False) for dimensionality reduction
    ticadim : int
        Number of dimensions to project the data onto. If falsy, the model is
        created using the raw projected data
    tica_lag : int, optional
        Lag passed to TICA (or to GWPCA when ``tica`` is False)
    model_lag : int
        Lag passed to ``markovModel``, expressed in ``model_units``
    model_units : str, optional
        Units of ``model_lag``
    macro_N : int
        Number of macrostates to split the final Markov State Model. If falsy,
        no Markov model is built and only the timescales plot is produced.
    bulk_split : optional
        If truthy, it is passed to ``create_bulk`` to split the bulk state.
        A failure there is printed and ignored.
    fes : bool, optional
        Currently unused (the free energy surface plot is disabled).
    rg_analysis : bool, optional
        If true, a plot with information relative to the radius of gyration of the molecule will be created.
    save : bool, optional
        If true, the model will be saved in the output folder
    data_fstep : optional
        If truthy, it is assigned to ``fstep`` of the data before clustering.

    Returns
    -------
    :class:`Model`
        Final model

    Raises
    ------
    Exception
        If clustering the data fails; the original error is chained as cause.
    """
    from htmd.model import Model
    from htmd.molecule.molecule import Molecule
    from htmd.simlist import simlist
    from htmd.projections.metric import Metric
    from sklearn.cluster import MiniBatchKMeans
    from IDP_htmd.IDP_model import plot_RG
    from IDP_htmd.model_utils import create_bulk
    from glob import glob
    import os

    try:
        os.mkdir(out_folder)
    except FileExistsError:
        print("Folder already exists")
    except OSError as e:
        # Still best effort as before, but without mislabelling the problem
        print(f"Could not create {out_folder}: {e}")

    try:
        # NOTE: allow_pickle=True can execute code stored in the file; only
        # load simlist.npy files coming from a trusted source.
        fsims = np.load(f"{folder}/simlist.npy", allow_pickle=True)
        print(f"Loaded {folder}/simlist.npy")
    except Exception:
        print("Creating simlist")
        sims = glob(folder + 'filtered/*/')
        fsims = simlist(sims, folder + 'filtered/filtered.pdb')
    metr = Metric(fsims, skip=skip)
    metr.set(metrics)

    #Check if this gives problems to ITS

    try:
        # Reuse the data of a previously saved model when there is one
        model = Model(file=f"{out_folder}/model.dat")
        out_data = model.data
        print(f"Loading model: {out_folder}/model.dat")
    except Exception:
        if tica and ticadim:
            from htmd.projections.tica import TICA
            print("Projecting TICA")
            tica_proj = TICA(metr, tica_lag)
            out_data = tica_proj.project(ticadim)
        elif not tica and ticadim:
            from htmd.projections.gwpca import GWPCA
            data = metr.project()
            data.dropTraj()
            print("using GWPCA")
            gwpca = GWPCA(data, tica_lag)
            out_data = gwpca.project(ticadim)
        else:
            print("Not using TICA")
            data = metr.project()
            data.dropTraj()
            out_data = data

    if data_fstep:
        out_data.fstep = data_fstep

    try:
        out_data.cluster(MiniBatchKMeans(n_clusters=clu), mergesmall=5)
    except Exception as e:
        # Same exception type and message as before, now with the cause chained
        raise Exception("Error " + str(e)) from e

    model = Model(out_data)
    model.plotTimescales(plot=False, save=f"{out_folder}/1_its.png")

    if macro_N:
        model.markovModel(model_lag, macro_N, units=model_units)

        if bulk_split:
            try:
                print("Starting bulk splitting")
                create_bulk(model, bulk_split)
            except Exception as e:
                print("Could not perform the bulk splitting")
                print(e)

        model.eqDistribution(plot=False,
                             save=f"{out_folder}/1.2_eqDistribution.png")

        if rg_analysis:
            # Imported under another name so the ``rg_analysis`` flag is not shadowed
            from IDP_htmd.IDP_analysis import rg_analysis as compute_rg
            mol = Molecule(model.data.simlist[0].molfile)
            rg_data = compute_rg(model, skip=skip)
            plot_RG(rg_data, mol, save=f"{out_folder}/1.4_rg.png")

        # if fes and ticadim:
        # model.plotFES(0, 1, temperature=310, states=True,
        #     plot=False, save=f"{out_folder}/1.3_fes.png")

    if save:
        model.save(f"{out_folder}/model.dat")

    return model
示例#14
0
    # Example driver: configure a ModelAnalysis, attach an already saved model
    # and run the SASA variation analysis on it.
    from htmd.model import Model

    # NOTE(review): the two paths are presumably the input (adaptive run) and
    # output folders - confirm against the ModelAnalysis constructor.
    mt = ModelAnalysis("/workspace8/excitome/adaptiveRun/O75376_MOR_58/",
        "/home/pablo/testModel/")

    # Residue-grouped heavy-atom contacts within the protein, threshold of 5
    mt.metrics = [
                MetricDistance(
                sel1="noh and protein",
                sel2="noh and protein",
                metric="contacts",
                threshold=5,
                groupsel1="residue",
                groupsel2="residue")
            ]

    # Load the saved model explicitly and hand the Model object to the analysis
    model = Model()
    model.load("/home/pablo/testModel/model.dat")
    mt.model = model
    mt.handle_model()
    mt.sasa_variation()
    # Alternative, currently disabled configuration:
    # mt.model = "/home/pablo/testModel/model.dat"
    # mt.plot_dihedral = "2_dihedral"
    # mt.macronum = 4
    # mt.plot_contacts = [
    #     ('all_contacts', 'noh and protein', 5),
    #     ('backbone', 'noh and backbone', 5),
    #     ('sidechain', 'noh and sidechain', 4),
    # ]
    # mt.write_parameters()
    # mt.generate_html_summary()
示例#15
0
    def _createMSM(self,
                   epoch,
                   output_folder,
                   basedata=None,
                   skip=1,
                   clusters=0,
                   ticadim=0,
                   ticalag=20,
                   macronum=2,
                   modellag=5,
                   modelunits="frames",
                   fstep=None,
                   data2combine=None):
        """Load the Markov model for `epoch` from disk, or build and save it.

        The model file is ``{output_folder}/{analysis_type[0]}{epoch}_model.dat``.
        If it can be loaded it is reused (and re-estimated when its macrostate
        number or lag differ from the requested ones). Otherwise the model is
        built from projected data, clustered with ``MiniBatchKMeans`` and saved
        to that path.

        Parameters
        ----------
        epoch : int
            Epoch (or number of simulations) up to which data is included; also
            part of the model file name.
        output_folder : str
            Directory where the model file is read from / written to.
        basedata : object, optional
            When truthy, the data is passed together with it to
            ``self._fitBaseline`` and the result replaces the projected data.
            Takes precedence over TICA. Not supported in low-memory mode.
        skip : int
            Frame skipping forwarded to the projection helpers.
        clusters : int
            Number of clusters. When 0, ``self._numClusters`` chooses it from
            the number of frames.
        ticadim : int
            Number of TICA dimensions; 0 disables TICA.
        ticalag : int
            Lag passed to ``TICA``.
        macronum : int
            Number of macrostates for ``Model.markovModel``.
        modellag : int
            Lag time for ``Model.markovModel``.
        modelunits : str
            Units of `modellag`.
        fstep : float, optional
            Forwarded to ``self._precalculateData``.
        data2combine : object, optional
            Extra data that is copied, restricted to the simulations in the
            model data and combined with it. Failures are printed and ignored.

        Returns
        -------
        model : htmd.model.Model
            The loaded or newly created model.

        Raises
        ------
        RuntimeError
            In low-memory mode when `basedata` is given or `ticadim` is 0 (no
            data would be available to cluster).
        ValueError
            When ``self.analysis_type`` is neither "epoch" nor "sims" outside of
            low-memory mode.

        Notes
        -----
        Side effects: may fill ``self.precalc_data`` / ``self.precalculated_data``,
        fills missing entries of ``self.associated_data`` and rebuilds
        ``self.tmp_associated_data`` for the simulations present in the model.
        """
        from htmd.projections.tica import TICA
        from sklearn.cluster import MiniBatchKMeans
        from htmd.model import Model

        model_file = f"{output_folder}/{self.analysis_type[0]}{epoch}_model.dat"

        def epoch_sim_indexes():
            # np.concatenate takes the list of per-epoch index sequences
            # directly; wrapping it in np.array() fails (or builds an object
            # array) when epochs hold different numbers of simulations.
            return np.concatenate([
                self.epoch_sim_indexes[i] for i in range(1, epoch + 1)
                if i in self.epoch_sim_indexes
            ])

        try:
            model = Model(file=model_file)

            if (model.macronum != macronum or model.lag != modellag):
                model.markovModel(modellag, macronum, units=modelunits)
            print("Model loaded")
        except Exception:
            # Any ordinary failure while loading/re-estimating falls back to
            # building the model from scratch. KeyboardInterrupt/SystemExit are
            # deliberately no longer swallowed.
            if self.low_memory_usage:
                # Low-memory mode only produces data through the on-the-fly
                # TICA projection below.
                if basedata or not ticadim:
                    raise RuntimeError(
                        "low_memory_usage requires ticadim > 0 and does not "
                        "support basedata")
                from htmd.projections.metric import Metric
                if self.analysis_type == "epoch":
                    epoch_sim = epoch_sim_indexes()
                else:
                    epoch_sim = range(0, epoch)
                metr = Metric(self._sims[epoch_sim], skip=skip)
                metr.set(self.precalc_metric)
                tica = TICA(metr, ticalag)
                data = tica.project(ticadim)
            else:
                if not self.precalculated_data:
                    print("Calculating PRECALC DATA")
                    precalc_data = self._precalculateData(self.precalc_metric,
                                                          self.input_folder,
                                                          fstep=fstep,
                                                          skip=skip)
                    self.precalc_data = precalc_data
                    self.precalculated_data = True

                # Mark every trajectory for dropping, then un-mark the ones
                # that belong to the requested epochs/simulations.
                drop_traj_idx = np.ones(self.precalc_data.numTrajectories)
                if self.analysis_type == "epoch":
                    drop_traj_idx[epoch_sim_indexes()] = 0
                elif self.analysis_type == "sims":
                    # NOTE(review): this keeps indices 1..epoch-1 while the
                    # low-memory path uses range(0, epoch); confirm whether
                    # skipping trajectory 0 is intended.
                    drop_traj_idx[np.arange(1, epoch)] = 0
                else:
                    raise ValueError(
                        f"Unsupported analysis_type: {self.analysis_type!r}")
                drop_idx = np.where(drop_traj_idx == 1)[0]

                data = self.precalc_data.copy()
                data.dropTraj(idx=drop_idx)
                data.dropTraj()

                if basedata:
                    from htmd.projections.metric import MetricData
                    r_fit = self._fitBaseline(data, basedata)
                    data = MetricData(dat=r_fit, simlist=data.simlist)
                elif ticadim:
                    tica = TICA(data, ticalag)
                    data = tica.project(ticadim)

            if not clusters:
                clusters = self._numClusters(data.numFrames)

            if data2combine:
                # Best effort: the model is still built if combining fails.
                try:
                    print("Adding extra dimension")
                    data2combine_copy = data2combine.copy()
                    data2combine_copy.dropTraj(keepsims=data.simlist)
                    data.combine(data2combine_copy)
                except Exception as e:
                    print("Could not combined data", str(e))

            data.cluster(MiniBatchKMeans(clusters), mergesmall=5)
            model = Model(data)
            model.markovModel(modellag, macronum, units=modelunits)
            model.save(model_file)

        # Compute each associated metric only once and cache it.
        for name, met in self.associated_metrics.items():
            if name not in self.associated_data:
                print(f"Calcualtion associted data - {name.upper()}")
                assoc_data = self._precalculateData(met,
                                                    self.input_folder,
                                                    fstep=fstep,
                                                    skip=skip)
                self.associated_data[name] = assoc_data

        # Restrict the cached associated data to the simulations in the model.
        for name, assoc in self.associated_data.items():
            tmp_data = assoc.copy()
            tmp_data.dropTraj(keepsims=model.data.simlist)
            self.tmp_associated_data[name] = tmp_data

        return model
示例#16
0
class AdaptiveBandit(AdaptiveBase):
    """ Adaptive class which selects respawning states with a UCB/PUCB bandit rule over Markov-model-derived rewards

    Parameters
    ----------
    app : :class:`SimQueue <jobqueues.simqueue.SimQueue>` object, default=None
        A SimQueue class object used to retrieve and submit simulations
    project : str, default='adaptive'
        The name of the project
    nmin : int, default=1
        Minimum number of running simulations
    nmax : int, default=1
        Maximum number of running simulations
    nepochs : int, default=1000
        Stop adaptive once we have reached this number of epochs
    nframes : int, default=0
        Stop adaptive once we have simulated this number of aggregate simulation frames.
    inputpath : str, default='input'
        The directory used to store input folders
    generatorspath : str, default='generators'
        The directory containing the generators
    dryrun : boolean, default=False
        A dry run means that the adaptive will retrieve and generate a new epoch but not submit the simulations
    updateperiod : float, default=0
        When set to a value other than 0, the adaptive will run synchronously every `updateperiod` seconds
    coorname : str, default='input.coor'
        Name of the file containing the starting coordinates for the new simulations
    lock : bool, default=False
        Lock the folder while adaptive is ongoing
    datapath : str, default='data'
        The directory in which the completed simulations are stored
    filter : bool, default=True
        Enable or disable filtering of trajectories.
    filtersel : str, default='not water'
        Filtering atom selection
    filteredpath : str, default='filtered'
        The directory in which the filtered simulations will be stored
    projection : :class:`Projection <moleculekit.projections.projection.Projection>` object, default=None
        A Projection class object or a list of objects which will be used to project the simulation data before
        constructing a Markov model
    goalfunction : function, default=None
        This function will be used to convert the goal-projected simulation data to a ranking which can be used for the
        directed component of FAST.
    reward_method : str, default='max'
        The reward method ('max' or 'mean')
    skip : int, default=1
        Allows skipping of simulation frames to reduce data. i.e. skip=3 will only keep every third frame
    lag : int, default=1
        The lagtime used to create the Markov model. Units are in frames.
    exploration : float, default=0.5
        Exploration is the coefficient used in UCB algorithm to weight the exploration value
    temperature : int, default=300
        Temperature used to compute the free energy
    ticalag : int, default=20
        Lagtime to use for TICA in frames. When using `skip` remember to change this accordingly.
    ticadim : int, default=3
        Number of TICA dimensions to use. When set to 0 it disables TICA
    clustmethod : :class:`ClusterMixin <sklearn.base.ClusterMixin>` class, default=<class 'sklearn.cluster.k_means_.MiniBatchKMeans'>
        Clustering algorithm used to cluster the contacts or distances
    macronum : int, default=8
        The number of macrostates to produce
    save : bool, default=False
        Save the model generated
    save_qval : bool, default=False
        Save the Q(a) and N values for every epoch
    actionspace : str, default='metric'
        The action space ('metric', 'goal', 'tica' or 'ticapcca')
    recluster : bool, default=False
        If to recluster the action space.
    reclusterMethod : , default=<class 'sklearn.cluster.k_means_.MiniBatchKMeans'>
        Clustering method for reclustering.
    random : bool, default=False
        Random decision mode for baseline.
    reward_mode : str, default='parent'
        'parent' or 'frames' ('frame' is accepted as an alias of 'frames')
    reward_window : int, default=None
        The reward window
    pucb : bool, default=False
        If True, it uses PUCB algorithm using the provided goal function as a prior
    goal_init : float, default=0.3
        The proportional ratio of goal initialization compared to max frames set by nframes
    goal_preprocess : function, default=None
        This function will be used to preprocess goal data after it has been computed for all frames.
    actionpool : int, default=0
        The number of top scoring actions used to randomly select respawning simulations
    """
    def __init__(self):
        from sklearn.base import ClusterMixin
        from moleculekit.projections.projection import Projection
        super().__init__()
        self._arg('datapath', 'str', 'The directory in which the completed simulations are stored', 'data', val.String())
        self._arg('filter', 'bool', 'Enable or disable filtering of trajectories.', True, val.Boolean())
        self._arg('filtersel', 'str', 'Filtering atom selection', 'not water', val.String())
        self._arg('filteredpath', 'str', 'The directory in which the filtered simulations will be stored', 'filtered', val.String())
        self._arg('projection', ':class:`Projection <moleculekit.projections.projection.Projection>` object',
                  'A Projection class object or a list of objects which will be used to project the simulation '
                   'data before constructing a Markov model', None, val.Object(Projection), nargs='+')
        self._arg('goalfunction', 'function',
                  'This function will be used to convert the goal-projected simulation data to a ranking which'
                  'can be used for the directed component of FAST.', None, val.Function(), nargs='any')
        self._arg('reward_method', 'str', 'The reward method', 'max', val.String())
        self._arg('skip', 'int', 'Allows skipping of simulation frames to reduce data. i.e. skip=3 will only keep every third frame', 1, val.Number(int, 'POS'))
        self._arg('lag', 'int', 'The lagtime used to create the Markov model. Units are in frames.', 1, val.Number(int, 'POS'))
        # NOTE(review): 'OPOS' below (and on 'actionpool') uses the letter O, while 'ticalag'/'ticadim' use '0POS'
        # (digit zero). This looks like a typo that may disable range validation — confirm against the validator.
        self._arg('exploration', 'float', 'Exploration is the coefficient used in UCB algorithm to weight the exploration value', 0.5, val.Number(float, 'OPOS'))
        self._arg('temperature', 'int', 'Temperature used to compute the free energy', 300, val.Number(int, 'POS'))
        self._arg('ticalag', 'int', 'Lagtime to use for TICA in frames. When using `skip` remember to change this accordinly.', 20, val.Number(int, '0POS'))
        self._arg('ticadim', 'int', 'Number of TICA dimensions to use. When set to 0 it disables TICA', 3, val.Number(int, '0POS'))
        self._arg('clustmethod', ':class:`ClusterMixin <sklearn.base.ClusterMixin>` class', 'Clustering algorithm used to cluster the contacts or distances', MiniBatchKMeans, val.Class(ClusterMixin))
        self._arg('macronum', 'int', 'The number of macrostates to produce', 8, val.Number(int, 'POS'))
        self._arg('save', 'bool', 'Save the model generated', False, val.Boolean())
        self._arg('save_qval', 'bool', 'Save the Q(a) and N values for every epoch', False, val.Boolean())
        self._arg('actionspace', 'str', 'The action space', 'metric', val.String())
        self._arg('recluster', 'bool', 'If to recluster the action space.', False, val.Boolean())
        self._arg('reclusterMethod', '', 'Clustering method for reclustering.', MiniBatchKMeans)
        self._arg('random', 'bool', 'Random decision mode for baseline.', False, val.Boolean())
        self._arg('reward_mode', 'str', '(parent, frame)', 'parent', val.String())
        self._arg('reward_window', 'int', 'The reward window', None, val.Number(int, 'POS'))
        self._arg('pucb', 'bool', 'If True, it uses PUCB algorithm using the provided goal function as a prior', False, val.Boolean())
        self._arg('goal_init', 'float', 'The proportional ratio of goal initialization compared to max frames set by nframes', 0.3, val.Number(float, 'POS'))
        self._arg('goal_preprocess', 'function',
                  'This function will be used to preprocess goal data after it has been computed for all frames.', None, val.Function(), nargs='any')
        self._arg('actionpool', 'int', 'The number of top scoring actions used to randomly select respawning simulations', 0, val.Number(int, 'OPOS'))

    def _algorithm(self):
        """ Run one adaptive epoch: project the data, build the Markov model, score the action states and write the
        inputs of the simulations to respawn.

        Returns
        -------
        proceed : bool
            False when `_checkNFrames` reports that the frame budget was reached, True once new inputs were written.

        Raises
        ------
        RuntimeError
            If the goal function could not project all trajectories, or if the configured `actionspace`, `pucb` or
            goal initialization need something (goal function, TICA, `nframes`) that has not been set.
        """
        from htmd.kinetics import Kinetics
        sims = self._getSimlist()
        metr = Metric(sims, skip=self.skip)
        metr.set(self.projection)

        data = metr.project()
        data.dropTraj()  # Drop before TICA to avoid broken trajectories

        goaldata = None
        goaldataconcat = None
        if self.goalfunction is not None:
            goaldata = self._getGoalData(data.simlist)
            if len(data.simlist) != len(goaldata.simlist):
                raise RuntimeError('The goal function was not able to project all trajectories that the MSM projection could. Check for possible errors in the goal function.')
            goaldataconcat = np.concatenate(goaldata.dat)
            if self.save:
                makedirs('saveddata', exist_ok=True)
                goaldata.save(path.join('saveddata', 'e{}_goaldata.dat'.format(self._getEpoch())))

        datatica = None
        if self.ticadim > 0:
            ticalag = int(np.ceil(max(2, min(np.min(data.trajLengths) / 2, self.ticalag))))  # 1 < ticalag < (trajLen / 2)
            tica = TICA(data, ticalag)
            datatica = tica.project(self.ticadim)
            if not self._checkNFrames(datatica): return False
            self._createMSM(datatica)
        else:
            if not self._checkNFrames(data): return False
            self._createMSM(data)

        confstatdist = self.conformationStationaryDistribution(self._model)

        # Select the data set whose clusters act as the bandit's action states
        if self.actionspace == 'metric':
            if not data.K:
                data.cluster(self.clustmethod(n_clusters=self._numClusters(data.numFrames)))
            data_q = data.copy()
        elif self.actionspace == 'goal':
            if goaldata is None:
                raise RuntimeError('actionspace="goal" requires a goalfunction to be set')
            data_q = goaldata.copy()
        elif self.actionspace in ('tica', 'ticapcca'):
            if datatica is None:
                raise RuntimeError('actionspace="{}" requires ticadim > 0'.format(self.actionspace))
            data_q = datatica.copy()
            if self.actionspace == 'ticapcca':
                # Replace microstate assignments by the macrostate each cluster belongs to
                for traj in data_q.trajectories:
                    traj.cluster = self._model.macro_ofcluster[traj.cluster]
                data_q.K = self._model.macronum
        else:
            raise RuntimeError('Invalid action space {}'.format(self.actionspace))

        if self.recluster:
            print('Reclustering with {}'.format(self.reclusterMethod))
            data_q.cluster(self.reclusterMethod)

        numstates = data_q.K
        print('Numstates: {}'.format(numstates))
        currepoch = self._getEpoch()
        q_values = np.zeros(numstates, dtype=np.float32)
        n_values = np.zeros(numstates, dtype=np.int32)

        if self.random:  # If random mode respawn from random action states
            N = self.nmax - self._running
            action_sel = np.bincount(np.random.randint(numstates, size=N), minlength=numstates)
            if self.save_qval:
                makedirs('saveddata', exist_ok=True)
                np.save(path.join('saveddata', 'e{}_actions.npy'.format(currepoch)), action_sel)
            relFrames = self._getSpawnFrames_UCB(action_sel, data_q)
            self._writeInputs(data.rel2sim(np.concatenate(relFrames)))
            return True

        statemaxes = None
        if self.goalfunction is not None:
            # For every cluster in data_q, get the max score and initialize
            if self.goal_preprocess is not None:
                goaldataconcat = self.goal_preprocess(goaldataconcat)
            qstconcat = np.concatenate(data_q.St)
            statemaxes = np.zeros(numstates)
            np.maximum.at(statemaxes, qstconcat, np.squeeze(goaldataconcat))
            if not self.pucb:
                # The pseudo-count of the goal prior is derived from nframes, so nframes must be set
                expectedclusters = self._numClusters(self.nframes) if self.nframes > 0 else 0
                if expectedclusters == 0:
                    raise RuntimeError('Goal initialization requires nframes to be set to the expected total number of frames')
                goalenergies = -Kinetics._kB * self.temperature * np.log(1-statemaxes)
                q_values = goalenergies
                n_values += int((self.nframes / expectedclusters) * self.goal_init)

        rewardtraj = np.arange(data_q.numTrajectories)  # Recalculate reward for all states
        rewards = self.getRewards(rewardtraj, data_q, confstatdist, numstates, self.reward_method, self.reward_mode, self.reward_window)
        for i in range(numstates):
            if len(rewards[i]) == 0:
                continue
            q_values[i] = updatingMean(q_values[i], n_values[i], rewards[i])
        n_values += np.array([len(x) for x in rewards])

        if self.save_qval:
            makedirs('saveddata', exist_ok=True)
            np.save(path.join('saveddata', 'e{}_qval.npy'.format(currepoch)), q_values)
            np.save(path.join('saveddata', 'e{}_nval.npy'.format(currepoch)), n_values)

        if self.pucb:
            if statemaxes is None:
                raise RuntimeError('pucb=True requires a goalfunction to be set, it is used as the prior')
            ucb_values = np.array([self.count_pucb(q_values[clust], self.exploration, statemaxes[clust], currepoch + 1, n_values[clust]) for clust in range(numstates)])
        else:
            ucb_values = np.array([self.count_ucb(q_values[clust], self.exploration, currepoch + 1, n_values[clust]) for clust in range(numstates)])

        if self.save_qval:
            makedirs('saveddata', exist_ok=True)
            np.save(path.join('saveddata', 'e{}_ucbvals.npy'.format(currepoch)), ucb_values)

        N = self.nmax - self._running
        # Use a local value so that the user-configured actionpool is not overwritten by the first epoch's N
        actionpool = self.actionpool if self.actionpool > 0 else N

        topactions = np.argsort(-ucb_values)[:actionpool]
        # Sampling without replacement cannot draw more than the available candidates; the remainder is filled
        # by repeating actions below
        action = np.random.choice(topactions, min(N, len(topactions)), replace=False)
        if N > 0 and len(action) == 0:
            raise RuntimeError('No action states available to respawn from')

        action_sel = np.zeros(numstates, dtype=int)
        action_sel[action] += 1
        while np.sum(action_sel) < N:  # When K is lower than N repeat some actions
            for a in action:
                action_sel[a] += 1
                if np.sum(action_sel) == N:
                    break

        if self.save_qval:
            np.save(path.join('saveddata', 'e{}_actions.npy'.format(currepoch)), action_sel)
        relFrames = self._getSpawnFrames_UCB(action_sel, data_q)
        self._writeInputs(data.rel2sim(np.concatenate(relFrames)))
        return True

    def _getSimlist(self):
        """ Build the simulation list from the data and input folders, optionally filtering the trajectories """
        from glob import glob
        from htmd.simlist import simlist, simfilter
        logger.info('Postprocessing new data')

        sims = simlist(glob(path.join(self.datapath, '*', '')), glob(path.join(self.inputpath, '*', '')),
                       glob(path.join(self.inputpath, '*', '')))

        if self.filter:
            sims = simfilter(sims, self.filteredpath, filtersel=self.filtersel)
        return sims

    def count_ucb(self, q_value, exploration, step, n_value):
        """ UCB score: q_value + exploration * sqrt(log(step) / (n_value + 1)) """
        return (q_value + (exploration * np.sqrt((np.log(step) / (n_value + 1)))))

    def count_pucb(self, q_value, exploration, predictor, step, n_value):
        """ PUCB score: same as `count_ucb` with the exploration term additionally scaled by `predictor` """
        return (q_value + (exploration * predictor * np.sqrt((np.log(step) / (n_value + 1)))))

    def getRewards(self, trajidx, data_q, confstatdist, numstates, rewardmethod, rewardmode, rewardwindow):
        """ Collect the rewards observed for every action state

        Parameters
        ----------
        trajidx : iterable of int
            Indexes of the trajectories in `data_q` to evaluate
        data_q : object
            Clustered data; `data_q.St[i]` holds the state of each frame of trajectory i
        confstatdist : list
            Per-trajectory arrays with the stationary probability of each frame (0 for disconnected frames)
        numstates : int
            Number of action states
        rewardmethod : str
            'mean' or 'max', the statistic applied over the reward window
        rewardmode : str
            'parent' credits the state the simulation was spawned from with the reward of its first frame,
            'frames' (or 'frame') credits the state of every frame
        rewardwindow : int or None
            Window length in frames. None uses the whole trajectory

        Returns
        -------
        rewards : list of list
            For each state the list of rewards assigned to it

        Raises
        ------
        RuntimeError
            If `rewardmethod` or `rewardmode` is not one of the supported values
        """
        from htmd.kinetics import Kinetics
        import pandas as pd
        rewards = [[] for _ in range(numstates)]
        for simidx in trajidx:
            # Get the eq distribution of each of the states the sim passed through
            states = data_q.St[simidx]
            statprob = confstatdist[simidx]
            connected = (states != -1) & (statprob != 0)
            if not np.any(connected):
                continue
            states = states[connected]
            statprob = statprob[connected]
            energies = -Kinetics._kB * self.temperature * np.log(1-statprob)
            ww = rewardwindow
            if rewardwindow is None:
                ww = len(energies)

            # The series is reversed so that the rolling window looks forward in time from each frame
            if rewardmethod == 'mean':
                windowedreward = pd.Series(energies[::-1]).rolling(ww, min_periods=1).mean().values[::-1]
            elif rewardmethod == 'max':
                windowedreward = pd.Series(energies[::-1]).rolling(ww, min_periods=1).max().values[::-1]
            else:
                raise RuntimeError('Reward method {} not available'.format(rewardmethod))

            if rewardmode == 'parent':
                # Get the state of the conformation from which the sim was spawned
                parentidx, parentframe = getParentSimIdxFrame(data_q, simidx)
                if parentidx == -1:  # Parent frame doesn't belong to any state
                    print('Parent frame doesn\'t belong to any state')
                    continue
                prev_action = data_q.St[parentidx][parentframe]
                rewards[prev_action].append(windowedreward[0])
            elif rewardmode in ('frames', 'frame'):  # 'frame' is the spelling given in the argument description
                for st, re in zip(states, windowedreward):
                    rewards[st].append(re)
            else:
                raise RuntimeError('Invalid reward mode {}'.format(rewardmode))

        return rewards

    def conformationStationaryDistribution(self, model):
        """ Assign to every frame the stationary probability of its microstate and return it split per trajectory """
        statdist = np.zeros(model.data.numFrames)  # zero for disconnected set
        dataconcatSt = np.concatenate(model.data.St)
        framemicro = model.micro_ofcluster[dataconcatSt]  # does not depend on i, computed once
        for i in range(model.micronum):
            microframes = np.where(framemicro == i)[0]
            statdist[microframes] = model.msm.stationary_distribution[i]
        return model.data.deconcatenate(statdist)

    def _checkNFrames(self, data):
        """ Return False (and log it) when `nframes` is set and `data` already has at least that many frames """
        if self.nframes != 0 and data.numFrames >= self.nframes:
            logger.info('Reached maximum number of frames. Stopping adaptive.')
            return False
        return True

    def _getGoalData(self, sims):
        """ Project the simulations `sims` with the goal function and return the projected data """
        from htmd.projections.metric import Metric
        logger.debug('Starting projection of directed component')
        metr = Metric(sims, skip=self.skip)
        metr.set(self.goalfunction)
        data = metr.project()
        logger.debug('Finished calculating directed component')
        return data

    def _createMSM(self, data):
        """ Cluster `data`, build the Markov model into `self._model` and save it when `self.save` is set """
        from htmd.model import Model
        while True:
            try:
                data.cluster(self.clustmethod(n_clusters=self._numClusters(data.numFrames)))
            except IndexError:
                # Clustering is retried until it no longer raises IndexError
                continue
            break

        self._model = Model(data)
        self._model.markovModel(self.lag, self._numMacrostates(data))
        if self.save:
            makedirs('saveddata', exist_ok=True)
            self._model.save(path.join('saveddata', 'e{}_adapt_model.dat'.format(self._getEpoch())))

    def _getSpawnFrames_UCB(self, reward, data):
        """ Sample `reward[s]` frames (with replacement) from every state s of `data` with a positive count """
        stateIdx = np.where(reward > 0)[0]
        _, relFrames = data.sampleClusters(stateIdx, reward[stateIdx], replacement=True, allframes=False)
        logger.debug('relFrames {}'.format(relFrames))
        return relFrames

    def _numClusters(self, numFrames):
        """ Heuristic that calculates number of clusters from number of frames """
        K = int(max(np.round(0.6 * np.log10(numFrames / 1000) * 1000 + 50), 100))  # heuristic
        if K > numFrames / 3:  # Ugly patch for low-data regimes ...
            K = int(numFrames / 3)
        return K

    def _numMacrostates(self, data):
        """ Heuristic for calculating the number of macrostates for the Markov model """
        macronum = self.macronum
        if data.K < macronum:
            # np.ceil returns a float; the value is used as a count, so convert it
            macronum = int(np.ceil(data.K / 2))
            logger.warning('Using less macrostates than requested due to lack of microstates. macronum = ' + str(macronum))

        # Calculating how many timescales are above the lag time to limit number of macrostates
        from pyemma.msm import timescales_msm
        timesc = timescales_msm(data.St.tolist(), lags=self.lag, nits=macronum).get_timescales()
        macronum = int(min(self.macronum, max(np.sum(timesc > self.lag), 2)))
        return macronum
示例#17
0
    return ani


if __name__ == "__main__":
    from htmd.model import getStateStatistic
    from htmd.projections.metric import MetricData
    from htmd.projections.metricdistance import MetricDistance
    from htmd.model import Model
    from htmd.molecule.molecule import Molecule
    import numpy as np

    data = MetricData()
    data.load(
        "/workspace8/p27_sj403/10-11-2018_p27_short_sj403/analysis/17_11_2018/testing.dat"
    )
    model = Model()
    model.load(
        "/workspace8/p27_sj403/10-11-2018_p27_short_sj403/analysis/17_11_2018/model.dat"
    )
    mol = Molecule(model.data.simlist[0].molfile)
    mean_dat = getStateStatistic(model, data, range(model.macronum))
    met = MetricDistance(sel1="noh and protein or resname MOL",
                         sel2="noh and protein or resname MOL",
                         groupsel1="residue",
                         groupsel2="residue",
                         metric="distances",
                         pbc=False)
    mapping = met.getMapping(mol)
    contact_plot(mean_dat,
                 mol,
                 rows=2,
示例#18
0
class AdaptiveMD(AdaptiveBase):
    """ Adaptive class which uses a Markov state model for respawning

    AdaptiveMD uses Markov state models to choose respawning poses for the next epochs. In more detail, it projects all
    currently retrieved simulations according to the specified projection, clusters those and then builds a Markov model using
    the discretized trajectories. From the Markov model it then chooses conformations from the various states based on
    the chosen criteria which will be used for starting new simulations.

    Parameters
    ----------
    app : :class:`SimQueue <htmd.queues.simqueue.SimQueue>` object, default=None
        A SimQueue class object used to retrieve and submit simulations
    project : str, default='adaptive'
        The name of the project
    nmin : int, default=1
        Minimum number of running simulations
    nmax : int, default=1
        Maximum number of running simulations
    nepochs : int, default=1000
        Stop adaptive once we have reached this number of epochs
    nframes : int, default=0
        Stop adaptive once we have simulated this number of aggregate simulation frames.
    inputpath : str, default='input'
        The directory used to store input folders
    generatorspath : str, default='generators'
        The directory containing the generators
    dryrun : boolean, default=False
        A dry run means that the adaptive will retrieve and generate a new epoch but not submit the simulations
    updateperiod : float, default=0
        When set to a value other than 0, the adaptive will run synchronously every `updateperiod` seconds
    coorname : str, default='input.coor'
        Name of the file containing the starting coordinates for the new simulations
    lock : bool, default=False
        Lock the folder while adaptive is ongoing
    datapath : str, default='data'
        The directory in which the completed simulations are stored
    filter : bool, default=True
        Enable or disable filtering of trajectories.
    filtersel : str, default='not water'
        Atom selection string for filtering.
        See more `here <http://www.ks.uiuc.edu/Research/vmd/vmd-1.9.2/ug/node89.html>`__
    filteredpath : str, default='filtered'
        The directory in which the filtered simulations will be stored
    projection : :class:`Projection <htmd.projections.projection.Projection>` object, default=None
        A Projection class object or a list of objects which will be used to project the simulation data before constructing a Markov model
    truncation : str, default=None
        Method for truncating the prob distribution (None, 'cumsum', 'statecut')
    statetype : ('micro', 'cluster', 'macro'), str, default='micro'
        What states (cluster, micro, macro) to use for calculations.
    macronum : int, default=8
        The number of macrostates to produce
    skip : int, default=1
        Allows skipping of simulation frames to reduce data. i.e. skip=3 will only keep every third frame
    lag : int, default=1
        The lagtime used to create the Markov model
    clustmethod : :class:`ClusterMixin <sklearn.base.ClusterMixin>` class, default=<class 'htmd.clustering.kcenters.KCenter'>
        Clustering algorithm used to cluster the contacts or distances
    method : str, default='1/Mc'
        Criteria used for choosing from which state to respawn from
    ticalag : int, default=20
        Lagtime to use for TICA in frames. When using `skip` remember to change this accordingly.
    ticadim : int, default=3
        Number of TICA dimensions to use. When set to 0 it disables TICA
    contactsym : str, default=None
        Contact symmetry
    save : bool, default=False
        Save the model generated

    Example
    -------
    >>> adapt = AdaptiveMD()
    >>> adapt.nmin = 2
    >>> adapt.nmax = 3
    >>> adapt.nepochs = 2
    >>> adapt.ticadim = 3
    >>> adapt.projection = [MetricDistance('name CA', 'name N'), MetricDihedral()]
    >>> adapt.generatorspath = htmd.home()+'/data/dhfr'
    >>> adapt.app = AcemdLocal()
    >>> adapt.run()
    """

    def __init__(self):
        """Initialise the base class and register the options specific to AdaptiveMD.

        Every option is declared through ``self._arg`` with its name, type label,
        help text, default value and validator.
        """
        from sklearn.base import ClusterMixin
        from htmd.clustering.kcenters import KCenter
        from htmd.projections.projection import Projection

        super().__init__()

        # Data locations and trajectory filtering
        self._arg(
            'datapath', 'str',
            'The directory in which the completed simulations are stored',
            'data', val.String())
        self._arg(
            'filter', 'bool',
            'Enable or disable filtering of trajectories.',
            True, val.Boolean())
        self._arg(
            'filtersel', 'str',
            'Filtering atom selection',
            'not water', val.String())
        self._arg(
            'filteredpath', 'str',
            'The directory in which the filtered simulations will be stored',
            'filtered', val.String())

        # Projection of the simulation data
        self._arg(
            'projection',
            ':class:`Projection <htmd.projections.projection.Projection>` object',
            'A Projection class object or a list of objects which will be used to project the simulation '
            'data before constructing a Markov model',
            None, val.Object(Projection), nargs='+')

        # Reward truncation and state selection
        self._arg(
            'truncation', 'str',
            'Method for truncating the prob distribution (None, \'cumsum\', \'statecut\'',
            None, val.String())
        self._arg(
            'statetype', 'str',
            'What states (cluster, micro, macro) to use for calculations.',
            'micro', val.String(), valid_values=('micro', 'cluster', 'macro'))

        # Markov model construction
        self._arg(
            'macronum', 'int',
            'The number of macrostates to produce',
            8, val.Number(int, 'POS'))
        self._arg(
            'skip', 'int',
            'Allows skipping of simulation frames to reduce data. i.e. skip=3 will only keep every third frame',
            1, val.Number(int, 'POS'))
        self._arg(
            'lag', 'int',
            'The lagtime used to create the Markov model',
            1, val.Number(int, 'POS'))
        self._arg(
            'clustmethod',
            ':class:`ClusterMixin <sklearn.base.ClusterMixin>` class',
            'Clustering algorithm used to cluster the contacts or distances',
            KCenter, val.Class(ClusterMixin))
        self._arg(
            'method', 'str',
            'Criteria used for choosing from which state to respawn from',
            '1/Mc', val.String())

        # TICA dimensionality reduction
        self._arg(
            'ticalag', 'int',
            'Lagtime to use for TICA in frames. When using `skip` remember to change this accordinly.',
            20, val.Number(int, '0POS'))
        self._arg(
            'ticadim', 'int',
            'Number of TICA dimensions to use. When set to 0 it disables TICA',
            3, val.Number(int, '0POS'))

        # Miscellaneous
        self._arg(
            'contactsym', 'str',
            'Contact symmetry',
            None, val.String())
        self._arg(
            'save', 'bool',
            'Save the model generated',
            False, val.Boolean())

    def _algorithm(self):
        """Run one adaptive step: build the Markov model and write inputs for new simulations.

        Returns
        -------
        bool
            False when `_checkNFrames` rejects the data (nothing is built or written in
            that case), True after the new inputs have been written.
        """
        projected = self._getData(self._getSimlist())
        if not self._checkNFrames(projected):
            return False

        self._createMSM(projected)
        model = self._model

        # As many new simulations as needed to reach nmax running ones
        nspawn = self.nmax - self._running
        reward = self._truncate(self._criteria(model, self.method), nspawn)
        relFrames, _, _ = self._getSpawnFrames(reward, model, model.data, nspawn)
        self._writeInputs(model.data.rel2sim(np.concatenate(relFrames)))
        return True

    def _checkNFrames(self, data):
        if self.nframes != 0 and data.numFrames >= self.nframes:
            logger.info('Reached maximum number of frames. Stopping adaptive.')
            return False
        return True

    def _getSimlist(self):
        """Build the simulation list from the data and input folders.

        The sub-folders of ``self.datapath`` are passed as the first argument of
        ``simlist`` and the sub-folders of ``self.inputpath`` as both the second and
        the third. When ``self.filter`` is set, the list is passed through
        ``simfilter`` using ``self.filteredpath`` and ``self.filtersel``.

        Returns
        -------
        sims
            Whatever ``simlist`` (or ``simfilter``, when filtering) returns.
        """
        logger.info('Postprocessing new data')
        data_dirs = glob(path.join(self.datapath, '*', ''))
        mol_dirs = glob(path.join(self.inputpath, '*', ''))
        input_dirs = glob(path.join(self.inputpath, '*', ''))
        sims = simlist(data_dirs, mol_dirs, input_dirs)
        if not self.filter:
            return sims
        return simfilter(sims, self.filteredpath, filtersel=self.filtersel)

    def _getData(self, sims):
        """Project the simulations and, when ``self.ticadim > 0``, reduce them with TICA.

        Parameters
        ----------
        sims
            Simulation list handed to ``Metric``.

        Returns
        -------
        datadr
            The projected (and possibly TICA-reduced) data after ``dropTraj`` was
            called on it.
        """
        metric = Metric(sims, skip=self.skip)
        metric.set(self.projection)

        if not self.ticadim > 0:
            # TICA disabled: use the plain projection
            reduced = metric.project()
        else:
            projected = metric.project()
            # Drop before TICA to avoid broken trajectories
            projected.dropTraj()
            # Keep the lag within 1 < ticalag < (trajLen / 2)
            half_shortest = np.min(projected.trajLengths) / 2
            lag = int(np.ceil(max(2, min(half_shortest, self.ticalag))))
            reduced = TICA(projected, lag).project(self.ticadim)

        reduced.dropTraj()
        return reduced

    def _createMSM(self, data):
        """Cluster ``data``, build the Markov model and optionally save it.

        The model is stored in ``self._model``. When ``self.save`` is set it is
        written to ``saveddata/e<epoch>_adapt_model.dat``, creating the folder if
        needed.

        Parameters
        ----------
        data
            Projected data; it is clustered in place via ``data.cluster``.
        """
        nclusters = self._numClusters(data.numFrames)
        data.cluster(self.clustmethod(n_clusters=nclusters))
        self._model = Model(data)
        self._model.markovModel(self.lag, self._numMacrostates(data))
        if self.save:
            # exist_ok avoids the check-then-create race of testing path.exists first
            makedirs('saveddata', exist_ok=True)
            self._model.save(path.join('saveddata', 'e{}_adapt_model.dat'.format(self._getEpoch())))

    def _getSpawnFrames(self, reward, model, data, N):
        """Draw the frames from which the next simulations are started.

        The reward is normalised into a probability vector, ``N`` draws are
        distributed over the states with a multinomial, and frames are sampled
        (with replacement) from every state that received at least one draw.

        Parameters
        ----------
        reward : np.ndarray
            Reward of each state
        model
            Model whose ``sampleStates`` method is used with ``statetype='micro'``
        data
            Not used by this implementation
        N : int
            Number of frames to spawn

        Returns
        -------
        relFrames
            Second value returned by ``model.sampleStates``
        spawncounts : np.ndarray
            Number of draws assigned to each state
        prob : np.ndarray
            Normalised sampling probabilities
        """
        probabilities = reward / np.sum(reward)
        logger.debug('Sampling probabilities {}'.format(probabilities))
        counts = np.random.multinomial(N, probabilities)
        logger.debug('spawncounts {}'.format(counts))

        # Only states with at least one assigned spawn are sampled
        chosen = np.nonzero(counts > 0)[0]
        sampled = model.sampleStates(chosen, counts[chosen], statetype='micro', replacement=True)
        frames = sampled[1]
        logger.debug('relFrames {}'.format(frames))
        return frames, counts, probabilities

    def _criteria(self, model, criteria):
        if criteria == '1/Mc':
            nMicroPerMacro = macroAccumulate(model, np.ones(model.micronum))
            P_I = 1 / macroAccumulate(model, model.data.N[model.cluster_ofmicro])
            P_I = P_I / nMicroPerMacro
            ret = P_I[model.macro_ofmicro]
        elif criteria == 'pi/Mc':
            nMicroPerMacro = macroAccumulate(model, np.ones(model.micronum))
            P_I = 1 / macroAccumulate(model, model.data.N[model.cluster_ofmicro])
            P_I = P_I / nMicroPerMacro
            ret = P_I[model.macro_ofmicro]*model.msm.stationary_distribution
        return ret

    def _truncate(self, ranking, N):
        if self.truncation is not None and self.truncation.lower() != 'none':
            if self.truncation == 'cumsum':
                idx = np.argsort(ranking)
                idx = idx[::-1]  # decreasing sort
                errs = ranking[idx]
                H = (N * errs / np.cumsum(errs)) < 1
                ranking[idx[H]] = 0
            if self.truncation == 'statecut':
                idx = np.argsort(ranking)
                idx = idx[::-1]  # decreasing sort
                ranking[idx[N:]] = 0  # Set all states ranked > N to zero.
        return ranking

    def _numClusters(self, numFrames):
        """ Heuristic that calculates number of clusters from number of frames """
        K = int(max(np.round(0.6 * np.log10(numFrames / 1000) * 1000 + 50), 100))  # heuristic
        if K > numFrames / 3:  # Ugly patch for low-data regimes ...
            K = int(numFrames / 3)
        return K

    def _numMacrostates(self, data):
        """ Heuristic for calculating the number of macrostates for the Markov model

        The requested ``self.macronum`` is the upper bound. If the data has fewer
        clusters (``data.K``) than that, half of them (rounded up) is used as the
        number of implied timescales to compute. The number of those timescales that
        are longer than ``self.lag`` (but at least 2) then limits the result.

        Parameters
        ----------
        data : object
            Clustered data; ``data.K`` and ``data.St`` are read.

        Returns
        -------
        macronum : int
            The number of macrostates to use
        """
        macronum = self.macronum
        if data.K < macronum:
            # Cast to int: np.ceil gives a float, which is not a valid number of timescales
            macronum = int(np.ceil(data.K / 2))
            logger.warning('Using less macrostates than requested due to lack of microstates. macronum = ' + str(macronum))

        # Calculating how many timescales are above the lag time to limit number of macrostates
        from pyemma.msm import timescales_msm
        timesc = timescales_msm(data.St.tolist(), lags=self.lag, nits=macronum).get_timescales()
        slow_processes = np.sum(timesc > self.lag)
        # Return a plain Python int rather than a numpy integer
        return int(min(self.macronum, max(slow_processes, 2)))
示例#19
0
class AdaptiveMD(AdaptiveBase):
    """ Adaptive class which uses a Markov state model for respawning

    AdaptiveMD uses Markov state models to choose respawning poses for the next epochs. In more detail, it projects all
    currently retrieved simulations according to the specified projection, clusters those and then builds a Markov model using
    the discretized trajectories. From the Markov model it then chooses conformations from the various states based on
    the chosen criteria which will be used for starting new simulations.

    Parameters
    ----------
    app : :class:`App <htmd.apps.app.App>` object, default=None
        An App class object used to retrieve and submit simulations
    project : str, default='adaptive'
        The name of the project
    nmin : int, default=1
        Minimum number of running simulations
    nmax : int, default=1
        Maximum number of running simulations
    nepochs : int, default=100
        Maximum number of epochs
    inputpath : str, default='input'
        The directory used to store input folders
    generatorspath : str, default='generators'
        The directory containing the generators
    dryrun : boolean, default=False
        A dry run means that the adaptive will retrieve and generate a new epoch but not submit the simulations
    updateperiod : float, default=0
        When set to a value other than 0, the adaptive will run synchronously every `updateperiod` seconds
    datapath : str, default='data'
        The directory in which the completed simulations are stored
    filter : bool, default=True
        Enable or disable filtering of trajectories.
    filtersel : str, default='not water'
        Filtering atom selection
    filteredpath : str, default='filtered'
        The directory in which the filtered simulations will be stored
    projection : :class:`Projection <htmd.projections.projection.Projection>` object, default=None
        A Projection class object or a list of objects which will be used to project the simulation data before constructing a Markov model
    macronum : int, default=8
        The number of macrostates to produce
    skip : int, default=1
        Allows skipping of simulation frames to reduce data. i.e. skip=3 will only keep every third frame
    lag : int, default=1
        The lagtime used to create the Markov model
    clustmethod : :class:`ClusterMixin <sklearn.base.ClusterMixin>` object, default=<class 'sklearn.cluster.k_means_.MiniBatchKMeans'>
        Clustering algorithm used to cluster the contacts or distances
    method : str, default='1/Mc'
        Criteria used for choosing from which state to respawn from
    ticalag : int, default=20
        Lagtime to use for TICA in frames. When using `skip` remember to change this accordingly.
    ticadim : int, default=3
        Number of TICA dimensions to use. When set to 0 it disables TICA
    contactsym : str, default=None
        Contact symmetry
    save : bool, default=False
        Save the model generated

    Example
    -------
    >>> adapt = AdaptiveMD()
    >>> adapt.nmin = 2
    >>> adapt.nmax = 3
    >>> adapt.nepochs = 2
    >>> adapt.ticadim = 3
    >>> adapt.projection = [MetricDistance('name CA', 'name N'), MetricDihedral()]
    >>> adapt.generatorspath = htmd.home()+'/data/dhfr'
    >>> adapt.app = AcemdLocal()
    >>> adapt.run()
    """
    def __init__(self):
        """Initialise the base class and register the options specific to AdaptiveMD.

        Options are declared through the ``_cmdString`` / ``_cmdBoolean`` /
        ``_cmdValue`` / ``_cmdObject`` helpers, each call giving the option name, a
        type label, its help text and its default value.
        """
        from sklearn.base import ClusterMixin
        from htmd.projections.projection import Projection

        super().__init__()

        # Data locations and trajectory filtering
        self._cmdString('datapath', 'str', 'The directory in which the completed simulations are stored', 'data')
        self._cmdBoolean('filter', 'bool', 'Enable or disable filtering of trajectories.', True)
        self._cmdString('filtersel', 'str', 'Filtering atom selection', 'not water')
        self._cmdString('filteredpath', 'str', 'The directory in which the filtered simulations will be stored', 'filtered')

        # Projection of the simulation data
        self._cmdObject('projection', ':class:`Projection <htmd.projections.projection.Projection>` object',
                        'A Projection class object or a list of objects which will be used to project the simulation '
                        'data before constructing a Markov model',
                        None, Projection)

        # Markov model construction
        self._cmdValue('macronum', 'int', 'The number of macrostates to produce', 8, TYPE_INT, RANGE_POS)
        self._cmdValue('skip', 'int',
                       'Allows skipping of simulation frames to reduce data. i.e. skip=3 will only keep every third frame',
                       1, TYPE_INT, RANGE_POS)
        self._cmdValue('lag', 'int', 'The lagtime used to create the Markov model', 1, TYPE_INT, RANGE_POS)
        self._cmdObject('clustmethod', ':class:`ClusterMixin <sklearn.base.ClusterMixin>` object',
                        'Clustering algorithm used to cluster the contacts or distances',
                        MiniBatchKMeans, ClusterMixin)
        self._cmdString('method', 'str', 'Criteria used for choosing from which state to respawn from', '1/Mc')

        # TICA dimensionality reduction
        self._cmdValue('ticalag', 'int',
                       'Lagtime to use for TICA in frames. When using `skip` remember to change this accordinly.',
                       20, TYPE_INT, RANGE_0POS)
        self._cmdValue('ticadim', 'int',
                       'Number of TICA dimensions to use. When set to 0 it disables TICA',
                       3, TYPE_INT, RANGE_0POS)

        # Miscellaneous
        self._cmdString('contactsym', 'str', 'Contact symmetry', None)
        self._cmdBoolean('save', 'bool', 'Save the model generated', False)

    def _algorithm(self):
        """Run one adaptive step: project the data, build the MSM and write new inputs.

        Steps:

        1. Build the simulation list from ``self.datapath`` / ``self.inputpath`` and
           filter it when ``self.filter`` is set.
        2. Project the simulations with ``self.projection`` and, when
           ``self.ticadim > 0``, reduce them with TICA.
        3. Cluster the data, build the Markov model (stored in ``self._model``) and
           save it when ``self.save`` is set.
        4. Sample the spawning frames and write the inputs of the new simulations.

        Trajectories are dropped with ``dropTraj`` *before* TICA is fitted so that
        broken simulations do not affect it (it used to happen only afterwards).
        """
        logger.info('Postprocessing new data')
        sims = simlist(glob(path.join(self.datapath, '*', '')),
                       glob(path.join(self.inputpath, '*', 'structure.pdb')),
                       glob(path.join(self.inputpath, '*', '')))
        if self.filter:
            sims = simfilter(sims, self.filteredpath, filtersel=self.filtersel)

        metr = Metric(sims, skip=self.skip)
        metr.set(self.projection)

        # NOTE: self.contactsym is registered as an option but is not applied here.

        if self.ticadim > 0:
            # TICA is run on the projected data; running it on the Metric itself was too slow
            data = metr.project()
            data.dropTraj()  # Drop before TICA: corrupted sims can affect it
            tica = TICA(data, int(max(2, np.ceil(self.ticalag))))
            datadr = tica.project(self.ticadim)
        else:
            datadr = metr.project()

        datadr.dropTraj()
        datadr.cluster(
            self.clustmethod(n_clusters=self._numClusters(datadr.numFrames)))
        self._model = Model(datadr)
        self._model.markovModel(self.lag, self._numMacrostates(datadr))
        if self.save:
            self._model.save('adapt_model_e' + str(self._getEpoch()) + '.dat')

        relFrames = self._getSpawnFrames(self._model, datadr)
        self._writeInputs(datadr.rel2sim(np.concatenate(relFrames)))

    def _getSpawnFrames(self, model, data):
        """Pick the frames from which the next simulations are started.

        The reward of ``self._criteria`` is turned into spawn counts by
        ``self._spawn`` for ``self.nmax - self._running`` simulations, and frames are
        sampled from every state that received at least one spawn. Sampling is done
        with replacement only when ``data.K < 10``.

        Parameters
        ----------
        model
            Model whose ``sampleStates`` method is used with ``statetype='micro'``
        data
            Clustered data; only ``data.K`` is read

        Returns
        -------
        relFrames
            Second value returned by ``model.sampleStates``
        """
        reward = self._criteria(model, self.method)
        counts, _ = self._spawn(reward, self.nmax - self._running)
        logger.debug('spawncounts {}'.format(counts))

        chosen = np.nonzero(counts > 0)[0]
        with_replacement = data.K < 10
        sampled = model.sampleStates(chosen,
                                     counts[chosen],
                                     statetype='micro',
                                     replacement=with_replacement)
        frames = sampled[1]
        logger.debug('relFrames {}'.format(frames))
        return frames

    def _criteria(self, model, criteria):
        if criteria == '1/Mc':
            nMicroPerMacro = macroAccumulate(model, np.ones(model.micronum))
            P_I = 1 / macroAccumulate(model,
                                      model.data.N[model.cluster_ofmicro])
            P_I = P_I / nMicroPerMacro
            ret = P_I[model.macro_ofmicro]
        elif criteria == 'pi/Mc':
            nMicroPerMacro = macroAccumulate(model, np.ones(model.micronum))
            P_I = 1 / macroAccumulate(model,
                                      model.data.N[model.cluster_ofmicro])
            P_I = P_I / nMicroPerMacro
            ret = P_I[model.macro_ofmicro] * model.msm.stationary_distribution
        return ret

    def _spawn(self, ranking, N, truncated=False):
        if truncated:
            idx = np.argsort(ranking)
            idx = idx[::-1]  # decreasing sort
            errs = ranking[idx]
            H = (N * errs / np.cumsum(errs)) < 1
            ranking[idx[H]] = 0
        prob = ranking / np.sum(ranking)
        spawnmicro = np.random.multinomial(N, prob)
        return spawnmicro, prob

    def _numClusters(self, numFrames):
        """ Heuristic that calculates number of clusters from number of frames """
        K = int(
            max(np.round(0.6 * np.log10(numFrames / 1000) * 1000 + 50),
                100))  # heuristic
        if K > numFrames / 3:  # Ugly patch for low-data regimes ...
            K = int(numFrames / 3)
        return K

    def _numMacrostates(self, data):
        """ Heuristic for calculating the number of macrostates for the Markov model

        ``self.macronum`` is the upper bound. With fewer clusters (``data.K``) than
        requested macrostates, half the clusters (rounded up) is used as the number
        of implied timescales to compute. The count of timescales longer than
        ``self.lag``, but at least 2, then limits the returned value.

        Parameters
        ----------
        data : object
            Clustered data; ``data.K`` and ``data.St`` are read.

        Returns
        -------
        macronum : int
            The number of macrostates to use
        """
        macronum = self.macronum
        if data.K < macronum:
            # np.ceil returns a float; nits must be an integer
            macronum = int(np.ceil(data.K / 2))
            logger.warning(
                'Using less macrostates than requested due to lack of microstates. macronum = '
                + str(macronum))

        # Calculating how many timescales are above the lag time to limit number of macrostates
        from pyemma.msm import timescales_msm
        timesc = timescales_msm(data.St.tolist(), lags=self.lag,
                                nits=macronum).get_timescales()
        macronum = min(self.macronum, max(np.sum(timesc > self.lag), 2))
        # Convert the numpy integer coming from np.sum into a plain int
        return int(macronum)
示例#20
0
 def _createMSM(self, data):
     """Cluster ``data``, build the Markov model and optionally save it.

     The model is stored in ``self._model``. When ``self.save`` is set it is
     written to ``adapt_model_e<epoch>.dat``, with the epoch taken from
     ``self._getEpoch()``.
     """
     # The number of clusters comes from the _numClusters heuristic on the frame count
     data.cluster(self.clustmethod(n_clusters=self._numClusters(data.numFrames)))
     self._model = Model(data)
     # Lag time and number of macrostates (from _numMacrostates) define the Markov model
     self._model.markovModel(self.lag, self._numMacrostates(data))
     if self.save:
         self._model.save('adapt_model_e{}.dat'.format(self._getEpoch()))