Example #1
    def _algorithm(self):
        logger.info('Postprocessing new data')
        sims = simlist(glob(path.join(self.datapath, '*', '')),
                       glob(path.join(self.inputpath, '*', 'structure.pdb')),
                       glob(path.join(self.inputpath, '*', '')))
        if self.filter:
            sims = simfilter(sims, self.filteredpath, filtersel=self.filtersel)

        metr = Metric(sims, skip=self.skip)
        metr.set(self.projection)

        #if self.contactsym is not None:
        #    contactSymmetry(data, self.contactsym)

        if self.ticadim > 0:
            # tica = TICA(metr, int(max(2, np.ceil(self.ticalag))))  # gianni: without project it was tooooo slow
            tica = TICA(metr.project(), int(max(2, np.ceil(self.ticalag))))
            datadr = tica.project(self.ticadim)
        else:
            datadr = metr.project()

        # Preferably we should do this before any projections. Corrupted sims can affect TICA.
        datadr.dropTraj()
        datadr.cluster(
            self.clustmethod(n_clusters=self._numClusters(datadr.numFrames)))
        self._model = Model(datadr)
        self._model.markovModel(self.lag, self._numMacrostates(datadr))
        if self.save:
            self._model.save('adapt_model_e' + str(self._getEpoch()) + '.dat')

        relFrames = self._getSpawnFrames(self._model, datadr)
        self._writeInputs(datadr.rel2sim(np.concatenate(relFrames)))
Example #2
    def handle_model(self):
        """Creates a model is model is not set. Loads a model from a string. Or assign a model to self.model.out_folder

        Calling this function results in self.model to be and htmd.model.Model class
        """
        from htmd.model import Model
        from htmd.molecule.molecule import Molecule

        if not self.model:
            from IDP_htmd.IDP_analysis import analyze_folder
            print("Creating new analysis")
            self.write_parameters()
            self.model = analyze_folder(self.input_folder, self.out_folder, self.skip, self.metrics, self.cluster,
                self.tica, self.ticadim, self.ticalag, self.modellag, self.modelunits, self.macronum, self.bulk_split, 
                self.fes, self.rg_analysis, self.save_model, self.data_fstep)

        if isinstance(self.model, str):
            try:
                print("Loading model")
                model = Model()
                model.load(self.model)
                self.model = model
            except Exception:
                print("Could not load the model")
                return

        if isinstance(self.model, Model):
            print("Model loaded")

        self.mol = Molecule(self.model.data.simlist[0].molfile)
Example #3
    def _createMSM(self, data):
        data.cluster(self.clustmethod(n_clusters=self._numClusters(data.numFrames)))
        self._model = Model(data)
        self._model.markovModel(self.lag, self._numMacrostates(data))
        if self.save:
            if not path.exists('saveddata'):
                makedirs('saveddata')
            self._model.save(path.join('saveddata', 'e{}_adapt_model.dat'.format(self._getEpoch())))
Example #4
def bootstrap(model, rounds, fraction=0.8, clusters=500):
    from htmd.model import Model
    from sklearn.cluster import MiniBatchKMeans

    for boot_round in range(rounds):
        dataBoot = model.data.bootstrap(fraction)
        print(f"Starting a new round of bootstrap - {boot_round}")
        dataBoot.cluster(MiniBatchKMeans(n_clusters=clusters), mergesmall=5)
        b_model = Model(dataBoot)
        yield b_model
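A minimal usage sketch for the bootstrap generator above, assuming a previously saved model file; the file name, number of rounds and Markov-model parameters are illustrative placeholders.

# Hypothetical usage of bootstrap(); file name and parameters are placeholders.
from htmd.model import Model

model = Model(file="model.dat")
boot_models = []
for b_model in bootstrap(model, rounds=10, fraction=0.8, clusters=500):
    # Build a Markov model on each bootstrapped dataset so that, e.g.,
    # macrostate populations can be compared across rounds.
    b_model.markovModel(20, 5, units="ns")
    boot_models.append(b_model)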
Example #5
def viewModel(model_name):
    from htmd.model import Model

    model = Model(file=model_name)
    try:
        model.macronum
    except Exception:
        model.markovModel(20, 5, units="ns")

    model.viewStates(alignsel="noh and resname MOL",
                     protein=True,
                     ligand="protein and backbone")
Example #6
    def _algorithm(self):
        logger.info('Postprocessing new data')
        sims = simlist(glob(path.join(self.datapath, '*', '')),
                       glob(path.join(self.inputpath, '*', 'structure.pdb')),
                       glob(path.join(self.inputpath, '*', '')))
        if self.filter:
            sims = simfilter(sims, self.filteredpath, filtersel=self.filtersel)

        metr = Metric(sims, skip=self.skip)
        metr.set(self.projection)

        # if self.contactsym is not None:
        #    contactSymmetry(data, self.contactsym)

        if self.ticadim > 0:
            # tica = TICA(metr, int(max(2, np.ceil(self.ticalag))))  # gianni: without project it was tooooo slow
            tica = TICA(metr.project(), int(max(2, np.ceil(self.ticalag))))
            datadr = tica.project(self.ticadim)
        else:
            datadr = metr.project()

        # Preferably we should do this before any projections. Corrupted sims can affect TICA.
        datadr.dropTraj()
        datadr.cluster(
            self.clustmethod(n_clusters=self._numClusters(datadr.numFrames)))
        model = Model(datadr)
        self._model = model
        self._model.markovModel(self.lag, self._numMacrostates(datadr))
        if self.save:
            self._model.save('adapt_model_e' + str(self._getEpoch()) + '.dat')

        # Undirected component
        uc = -model.data.N  # Lower counts should give higher score hence the -
        if self.statetype == 'micro':
            uc = uc[model.cluster_ofmicro]
        if self.statetype == 'macro':
            uc = macroAccumulate(model, uc[model.cluster_ofmicro])

        # Calculating the directed component
        dc = self._calculateDirectedComponent(sims, model.data.St,
                                              model.data.N)
        if self.statetype == 'micro':
            dc = dc[model.cluster_ofmicro]
        if self.statetype == 'macro':
            dc = macroAccumulate(model, dc[model.cluster_ofmicro])

        uc = self._featScale(uc)
        dc = self._featScale(dc)

        reward = dc + self.ucscale * uc

        relFrames = self._getSpawnFrames(reward, self._model, datadr)
        self._writeInputs(datadr.rel2sim(np.concatenate(relFrames)))
Example #7
    def _createMSM(self, data):
        from htmd.model import Model
        kmeanserror = True
        while kmeanserror:
            try:
                data.cluster(self.clustmethod(n_clusters=self._numClusters(data.numFrames)))
            except IndexError:
                continue
            kmeanserror = False

        self._model = Model(data)
        self._model.markovModel(self.lag, self._numMacrostates(data))
        if self.save:
            makedirs('saveddata', exist_ok=True)
            self._model.save(path.join('saveddata', 'e{}_adapt_model.dat'.format(self._getEpoch())))
Example #8
def scan_clusters(model, nclusters, out_dir):
    """Create models 
    
    In order to assess the effect on timescales using different clusters in a model.
    Parameters
    ----------
    model : htmd.model.Model
        Model class we want to perfom the analysis
    nclusters : int[]
        Array of clusters to be tested
    out_dir : str
        Directory to save the generated plots
    """
    from sklearn.cluster import MiniBatchKMeans
    for i in nclusters:
        model.data.cluster(MiniBatchKMeans(n_clusters=i), mergesmall=5)
        new_mod = Model(model.data)
        new_mod.plotTimescales(plot=False, save=f"{out_dir}/1_its-{i}_clu")
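A short usage sketch for scan_clusters; the model file, cluster counts and output directory are placeholders.

# Hypothetical call: sweep several cluster counts and write the implied
# timescale plots into out_dir.
from htmd.model import Model

model = Model(file="model.dat")
scan_clusters(model, nclusters=[250, 500, 1000], out_dir="cluster_scan")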
Example #9
    def _algorithm(self):
        logger.info('Postprocessing new data')
        datalist = simlist(
            glob(path.join(self.datapath, '*', '')),
            glob(path.join(self.inputpath, '*', 'structure.pdb')),
            glob(path.join(self.inputpath, '*', '')))
        filtlist = simfilter(datalist,
                             self.filteredpath,
                             filtersel=self.filtersel)

        if hasattr(self, 'metricsel2') and self.metricsel2 is not None:
            proj = MetricDistance(self.metricsel1,
                                  self.metricsel2,
                                  metric=self.metrictype)
        else:
            proj = MetricSelfDistance(self.metricsel1, metric=self.metrictype)
        metr = Metric(filtlist, skip=self.skip)
        metr.projection(proj)
        data = metr.project()

        #if self.contactsym is not None:
        #    contactSymmetry(data, self.contactsym)

        data.dropTraj()
        if self.ticadim > 0:
            tica = TICA(data, int(max(2, np.ceil(20 / self.skip))))
            datadr = tica.project(self.ticadim)
        else:
            datadr = data

        K = int(
            max(np.round(0.6 * np.log10(datadr.numFrames / 1000) * 1000 + 50),
                100))  # heuristic
        if K > datadr.numFrames / 3:  # Freaking ugly patches ...
            K = int(datadr.numFrames / 3)

        datadr.cluster(self.clustmethod(n_clusters=K), mergesmall=5)
        replacement = False
        if datadr.K < 10:
            datadr.cluster(self.clustmethod(n_clusters=K))
            replacement = True

        model = Model(datadr)
        macronum = self.macronum
        if datadr.K < macronum:
            macronum = np.ceil(datadr.K / 2)
            logger.warning(
                'Using less macrostates than requested due to lack of microstates. macronum = '
                + str(macronum))

        from pyemma.msm import timescales_msm
        timesc = timescales_msm(datadr.St.tolist(),
                                lags=self.lag,
                                nits=macronum).get_timescales()
        macronum = min(self.macronum, max(np.sum(timesc > self.lag), 2))

        model.markovModel(self.lag, macronum)
        p_i = self._criteria(model, self.method)
        (spawncounts, prob) = self._spawn(p_i, self.nmax - self.running)
        logger.debug('spawncounts {}'.format(spawncounts))
        stateIdx = np.where(spawncounts > 0)[0]
        _, relFrames = model.sampleStates(stateIdx,
                                          spawncounts[stateIdx],
                                          statetype='micro',
                                          replacement=replacement)
        logger.debug('relFrames {}'.format(relFrames))

        self._writeInputs(datadr.rel2sim(np.concatenate(relFrames)))
Example #10
def analyze_folder(folder=None,
                   out_folder="/tmp",
                   skip=1,
                   metrics=None,
                   clu=500,
                   tica=True,
                   ticadim=5,
                   tica_lag=20,
                   model_lag=10,
                   model_units='ns',
                   macro_N=10,
                   bulk_split=False,
                   fes=True,
                   rg_analysis=True,
                   save=True,
                   data_fstep=None):
    """Analysis script for create a Markov State Model
    
    Creates and returns a Markov State Model given a data folder.
    Intented to follow up the evolution of an adaptive sampling run.
    Allows to save the model ans several informative plots
    
    Parameters
    ----------
    folder : str
        Data folder where adaptive is running
    out_folder : str
        Output folder to store derived data
    skip : int
        Number of frames to skip while projecting the MD data
    metrics : [:class: `Metric` object]
        Metric array used to project the data
    clu : int
        Number of cluster to create using the MiniBatchKMeans method.
    tica: bool
        Wether to use TICA of GWPCA for dimensionality reduction
    ticadim : int
        Number of TICA dimension to project the data. If None, the model will be created using the raw projected data
    tica_lag : int, optional
        Description
    model_lag : int
        Number of ns used to create the model
    model_units : str, optional
        Description
    macro_N : int
        Number of macrostate to split the final Markov State Model
    fes : bool, optional
        If true it will save a plot projecting the first two TICA dimension. Requires ticadim to be defined
    rg_analysis : bool, optional
        If true, a plot with information relative to the radious of gyration of the molecule will be created.
    save : bool, optional
        If true, the model will be saved in the outputs folder
    
    Returns
    -------
    :class:`Model`
        Final model
    """
    from htmd.model import Model
    from htmd.molecule.molecule import Molecule
    from htmd.simlist import simlist
    from htmd.projections.metric import Metric
    from sklearn.cluster import MiniBatchKMeans
    from IDP_htmd.IDP_model import plot_RG
    from IDP_htmd.model_utils import create_bulk
    from glob import glob
    import os
    import numpy as np

    try:
        os.mkdir(out_folder)
    except FileExistsError:
        print("Folder already exists")

    try:
        fsims = np.load(f"{folder}/simlist.npy", allow_pickle=True)
        print(f"Loaded {folder}/simlist.npy")
    except Exception:
        print("Creating simlist")
        sims = glob(folder + 'filtered/*/')
        fsims = simlist(sims, folder + 'filtered/filtered.pdb')
    metr = Metric(fsims, skip=skip)
    metr.set(metrics)

    # Check whether this causes problems for the ITS

    try:
        model = Model(file=f"{out_folder}/model.dat")
        out_data = model.data
        print(f"Loading model: {out_folder}/model.dat")
    except Exception:
        if tica and ticadim:
            from htmd.projections.tica import TICA
            print("Projecting TICA")
            tica = TICA(metr, tica_lag)
            out_data = tica.project(ticadim)
        elif not tica and ticadim:
            from htmd.projections.gwpca import GWPCA
            data = metr.project()
            data.dropTraj()
            print("using GWPCA")
            gwpca = GWPCA(data, tica_lag)
            out_data = gwpca.project(ticadim)
        else:
            print("Not using TICA")
            data = metr.project()
            data.dropTraj()
            out_data = data

    # Avoid some possible errors while clustering
    if data_fstep:
        out_data.fstep = data_fstep
    try:
        out_data.cluster(MiniBatchKMeans(n_clusters=clu), mergesmall=5)
    except Exception as e:
        raise Exception("Error " + str(e))

    model = Model(out_data)
    model.plotTimescales(plot=False, save=f"{out_folder}/1_its.png")

    if macro_N:
        model.markovModel(model_lag, macro_N, units=model_units)

        if bulk_split:
            try:
                print("Starting bulk splitting")
                create_bulk(model, bulk_split)
            except Exception as e:
                print("Could not perform the bulk splitting")
                print(e)

        model.eqDistribution(plot=False,
                             save=f"{out_folder}/1.2_eqDistribution.png")

        if rg_analysis:
            from IDP_htmd.IDP_analysis import rg_analysis
            mol = Molecule(model.data.simlist[0].molfile)
            rg_data = rg_analysis(model, skip=skip)
            plot_RG(rg_data, mol, save=f"{out_folder}/1.4_rg.png")

        # if fes and ticadim:
        # model.plotFES(0, 1, temperature=310, states=True,
        #     plot=False, save=f"{out_folder}/1.3_fes.png")

    if save:
        model.save(f"{out_folder}/model.dat")

    return model
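A minimal, hedged call of analyze_folder; the folder paths are placeholders, and the contact-map MetricDistance projection mirrors the one used in Example #11 (the import path assumes the same HTMD version as the examples above).

# Hypothetical invocation; paths are placeholders and the metric mirrors Example #11.
from htmd.projections.metricdistance import MetricDistance

metrics = [MetricDistance(sel1="noh and protein",
                          sel2="noh and protein",
                          metric="contacts",
                          threshold=5,
                          groupsel1="residue",
                          groupsel2="residue")]

model = analyze_folder(folder="./adaptive_run/", out_folder="./analysis/",
                       skip=1, metrics=metrics, clu=500, tica=True,
                       ticadim=5, tica_lag=20, model_lag=10,
                       model_units="ns", macro_N=10)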
Example #11
    from htmd.model import Model

    mt = ModelAnalysis("/workspace8/excitome/adaptiveRun/O75376_MOR_58/",
                       "/home/pablo/testModel/")

    mt.metrics = [
        MetricDistance(
            sel1="noh and protein",
            sel2="noh and protein",
            metric="contacts",
            threshold=5,
            groupsel1="residue",
            groupsel2="residue")
    ]

    model = Model()
    model.load("/home/pablo/testModel/model.dat")
    mt.model = model
    mt.handle_model()
    mt.sasa_variation()
    # mt.model = "/home/pablo/testModel/model.dat"
    # mt.plot_dihedral = "2_dihedral"
    # mt.macronum = 4
    # mt.plot_contacts = [
    #     ('all_contacts', 'noh and protein', 5),
    #     ('backbone', 'noh and backbone', 5),
    #     ('sidechain', 'noh and sidechain', 4),
    # ]
    # mt.write_parameters()
    # mt.generate_html_summary()
Example #12
    def _createMSM(self,
                   epoch,
                   output_folder,
                   basedata=None,
                   skip=1,
                   clusters=0,
                   ticadim=0,
                   ticalag=20,
                   macronum=2,
                   modellag=5,
                   modelunits="frames",
                   fstep=None,
                   data2combine=None):
        from htmd.projections.tica import TICA
        from sklearn.cluster import MiniBatchKMeans
        from htmd.model import Model

        try:
            model = Model(
                file=f"{output_folder}/{self.analysis_type[0]}{epoch}_model.dat"
            )

            if (model.macronum != macronum or model.lag != modellag):
                model.markovModel(modellag, macronum, units=modelunits)
            print("Model loaded")
        except Exception:
            if not self.precalculated_data and not self.low_memory_usage:
                print("Calculating PRECALC DATA")
                precalc_data = self._precalculateData(self.precalc_metric,
                                                      self.input_folder,
                                                      fstep=fstep,
                                                      skip=skip)
                self.precalc_data = precalc_data
                self.precalculated_data = True

            if self.analysis_type == "epoch" and not self.low_memory_usage:
                epoch_sim = np.concatenate(
                    np.array([
                        self.epoch_sim_indexes[i] for i in range(1, epoch + 1)
                        if i in list(self.epoch_sim_indexes.keys())
                    ]))
                drop_traj_idx = np.ones(self.precalc_data.numTrajectories)
                drop_traj_idx[epoch_sim] = 0
                drop_idx = np.where(drop_traj_idx == 1)[0]
            elif self.analysis_type == "sims" and not self.low_memory_usage:
                drop_traj_idx = np.ones(self.precalc_data.numTrajectories)
                no_drop_idx = np.arange(1, epoch)
                drop_traj_idx[no_drop_idx] = 0
                drop_idx = np.where(drop_traj_idx == 1)[0]

            if not self.low_memory_usage:
                data = self.precalc_data.copy()
                data.dropTraj(idx=drop_idx)
                data.dropTraj()

            if basedata:
                from htmd.projections.metric import MetricData
                r_fit = self._fitBaseline(data, basedata)
                data = MetricData(dat=r_fit, simlist=data.simlist)
            elif ticadim and not self.low_memory_usage:
                tica = TICA(data, ticalag)
                data = tica.project(ticadim)
            elif ticadim and self.low_memory_usage:
                from htmd.projections.metric import Metric
                if self.analysis_type == "epoch":
                    epoch_sim = np.concatenate(
                        np.array([
                            self.epoch_sim_indexes[i]
                            for i in range(1, epoch + 1)
                            if i in list(self.epoch_sim_indexes.keys())
                        ]))
                else:
                    epoch_sim = range(0, epoch)
                metr = Metric(self._sims[epoch_sim], skip=skip)
                metr.set(self.precalc_metric)
                tica = TICA(metr, ticalag)
                data = tica.project(ticadim)
            if not clusters:
                clusters = self._numClusters(data.numFrames)

            if data2combine:
                try:
                    print("Adding extra dimension")
                    data2combine_copy = data2combine.copy()
                    data2combine_copy.dropTraj(keepsims=data.simlist)
                    data.combine(data2combine_copy)
                except Exception as e:
                    print("Could not combined data", str(e))

            data.cluster(MiniBatchKMeans(n_clusters=clusters), mergesmall=5)
            model = Model(data)
            model.markovModel(modellag, macronum, units=modelunits)
            model.save(
                f"{output_folder}/{self.analysis_type[0]}{epoch}_model.dat")

        for name, met in self.associated_metrics.items():
            try:
                self.associated_data[name]
            except KeyError:
                print(f"Calculating associated data - {name.upper()}")
                assoc_data = self._precalculateData(met,
                                                    self.input_folder,
                                                    fstep=fstep,
                                                    skip=skip)
                self.associated_data[name] = assoc_data

        for name, data in self.associated_data.items():
            tmp_data = data.copy()
            tmp_data.dropTraj(keepsims=model.data.simlist)
            self.tmp_associated_data[name] = tmp_data

        return model