Python TICA.project示例，htmd.projections.tica.TICA.project Python示例

示例#1

0

显示文件

文件： adaptiverun.py 项目： jhprinz/htmd

    def _algorithm(self):
        logger.info('Postprocessing new data')
        sims = simlist(glob(path.join(self.datapath, '*', '')),
                       glob(path.join(self.inputpath, '*', 'structure.pdb')),
                       glob(path.join(self.inputpath, '*', '')))
        if self.filter:
            sims = simfilter(sims, self.filteredpath, filtersel=self.filtersel)

        metr = Metric(sims, skip=self.skip)
        metr.set(self.projection)

        #if self.contactsym is not None:
        #    contactSymmetry(data, self.contactsym)

        if self.ticadim > 0:
            # tica = TICA(metr, int(max(2, np.ceil(self.ticalag))))  # gianni: without project it was tooooo slow
            tica = TICA(metr.project(), int(max(2, np.ceil(self.ticalag))))
            datadr = tica.project(self.ticadim)
        else:
            datadr = metr.project()

        datadr.dropTraj(
        )  # Preferably we should do this before any projections. Corrupted sims can affect TICA
        datadr.cluster(
            self.clustmethod(n_clusters=self._numClusters(datadr.numFrames)))
        self._model = Model(datadr)
        self._model.markovModel(self.lag, self._numMacrostates(datadr))
        if self.save:
            self._model.save('adapt_model_e' + str(self._getEpoch()) + '.dat')

        relFrames = self._getSpawnFrames(self._model, datadr)
        self._writeInputs(datadr.rel2sim(np.concatenate(relFrames)))

示例#2

0

显示文件

    def _algorithm(self):
        logger.info('Postprocessing new data')
        sims = simlist(glob(path.join(self.datapath, '*', '')),
                       glob(path.join(self.inputpath, '*', 'structure.pdb')),
                       glob(path.join(self.inputpath, '*', '')))
        if self.filter:
            sims = simfilter(sims, self.filteredpath, filtersel=self.filtersel)

        metr = Metric(sims, skip=self.skip)
        metr.set(self.projection)

        # if self.contactsym is not None:
        #    contactSymmetry(data, self.contactsym)

        if self.ticadim > 0:
            # tica = TICA(metr, int(max(2, np.ceil(self.ticalag))))  # gianni: without project it was tooooo slow
            tica = TICA(metr.project(), int(max(2, np.ceil(self.ticalag))))
            datadr = tica.project(self.ticadim)
        else:
            datadr = metr.project()

        datadr.dropTraj(
        )  # Preferably we should do this before any projections. Corrupted sims can affect TICA
        datadr.cluster(
            self.clustmethod(n_clusters=self._numClusters(datadr.numFrames)))
        model = Model(datadr)
        self._model = model
        self._model.markovModel(self.lag, self._numMacrostates(datadr))
        if self.save:
            self._model.save('adapt_model_e' + str(self._getEpoch()) + '.dat')

        # Undirected component
        uc = -model.data.N  # Lower counts should give higher score hence the -
        if self.statetype == 'micro':
            uc = uc[model.cluster_ofmicro]
        if self.statetype == 'macro':
            uc = macroAccumulate(model, uc[model.cluster_ofmicro])

        # Calculating the directed component
        dc = self._calculateDirectedComponent(sims, model.data.St,
                                              model.data.N)
        if self.statetype == 'micro':
            dc = dc[model.cluster_ofmicro]
        if self.statetype == 'macro':
            dc = macroAccumulate(model, dc[model.cluster_ofmicro])

        uc = self._featScale(uc)
        dc = self._featScale(dc)

        reward = dc + self.ucscale * uc

        relFrames = self._getSpawnFrames(reward, self._model, datadr)
        self._writeInputs(datadr.rel2sim(np.concatenate(relFrames)))

示例#3

0

显示文件

文件： adaptiverun.py 项目： PabloHN/htmd

    def _algorithm(self):
        logger.info('Postprocessing new data')
        datalist = simlist(glob(path.join(self.datapath, '*', '')), glob(path.join(self.inputpath, '*', 'structure.pdb')),
                           glob(path.join(self.inputpath, '*', '')))
        filtlist = simfilter(datalist, self.filteredpath, filtersel=self.filtersel)

        if hasattr(self, 'metricsel2') and self.metricsel2 is not None:
            proj = MetricDistance(self.metricsel1, self.metricsel2, metric=self.metrictype)
        else:
            proj = MetricSelfDistance(self.metricsel1, metric=self.metrictype)
        metr = Metric(filtlist, skip=self.skip)
        metr.projection(proj)
        data = metr.project()

        #if self.contactsym is not None:
        #    contactSymmetry(data, self.contactsym)

        data.dropTraj()
        if self.ticadim > 0:
            tica = TICA(data, int(max(2, np.ceil(20/self.skip))))
            datadr = tica.project(self.ticadim)
        else:
            datadr = data

        K = int(max(np.round(0.6 * np.log10(datadr.numFrames/1000)*1000+50), 100))  # heuristic
        if K > datadr.numFrames / 3: # Freaking ugly patches ...
            K = int(datadr.numFrames / 3)

        datadr.cluster(self.clustmethod(n_clusters=K), mergesmall=5)
        replacement = False
        if datadr.K < 10:
            datadr.cluster(self.clustmethod(n_clusters=K))
            replacement = True

        model = Model(datadr)
        macronum = self.macronum
        if datadr.K < macronum:
            macronum = np.ceil(datadr.K / 2)
            logger.warning('Using less macrostates than requested due to lack of microstates. macronum = ' + str(macronum))

        from pyemma.msm import timescales_msm
        timesc = timescales_msm(datadr.St.tolist(), lags=self.lag, nits=macronum).get_timescales()
        macronum = min(self.macronum, max(np.sum(timesc > self.lag), 2))

        model.markovModel(self.lag, macronum)
        p_i = self._criteria(model, self.method)
        (spawncounts, prob) = self._spawn(p_i, self.nmax-self.running)
        logger.debug('spawncounts {}'.format(spawncounts))
        stateIdx = np.where(spawncounts > 0)[0]
        _, relFrames = model.sampleStates(stateIdx, spawncounts[stateIdx], statetype='micro', replacement=replacement)
        logger.debug('relFrames {}'.format(relFrames))

        self._writeInputs(datadr.rel2sim(np.concatenate(relFrames)))

示例#4

0

显示文件

文件： adaptiverun.py 项目： raimis/htmd

    def _getData(self, sims):
        metr = Metric(sims, skip=self.skip)
        metr.set(self.projection)

        # if self.contactsym is not None:
        #    contactSymmetry(data, self.contactsym)

        if self.ticadim > 0:
            # tica = TICA(metr, int(max(2, np.ceil(self.ticalag))))  # gianni: without project it was tooooo slow
            data = metr.project()
            data.dropTraj()  # Drop before TICA to avoid broken trajectories
            ticalag = int(
                np.ceil(max(2, min(np.min(data.trajLengths) / 2, self.ticalag))))  # 1 < ticalag < (trajLen / 2)
            tica = TICA(data, ticalag)
            datadr = tica.project(self.ticadim)
        else:
            datadr = metr.project()
        datadr.dropTraj()  # Preferably we should do this before any projections. Corrupted sims can affect TICA
        return datadr

示例#5

0

显示文件

文件： adaptiverun.py 项目： alejandrovr/htmd

    def _getData(self, sims):
        metr = Metric(sims, skip=self.skip)
        metr.set(self.projection)

        # if self.contactsym is not None:
        #    contactSymmetry(data, self.contactsym)

        if self.ticadim > 0:
            # tica = TICA(metr, int(max(2, np.ceil(self.ticalag))))  # gianni: without project it was tooooo slow
            data = metr.project()
            data.dropTraj()  # Drop before TICA to avoid broken trajectories
            ticalag = int(
                np.ceil(max(2, min(np.min(data.trajLengths) / 2, self.ticalag))))  # 1 < ticalag < (trajLen / 2)
            tica = TICA(data, ticalag)
            datadr = tica.project(self.ticadim)
        else:
            datadr = metr.project()
        datadr.dropTraj()  # Preferably we should do this before any projections. Corrupted sims can affect TICA
        return datadr

示例#6

0

显示文件

    def _algorithm(self):
        from htmd.kinetics import Kinetics
        sims = self._getSimlist()
        metr = Metric(sims, skip=self.skip)
        metr.set(self.projection)

        data = metr.project()
        data.dropTraj()  # Drop before TICA to avoid broken trajectories

        if self.goalfunction is not None:
            goaldata = self._getGoalData(data.simlist)
            if len(data.simlist) != len(goaldata.simlist):
                raise RuntimeError('The goal function was not able to project all trajectories that the MSM projection could. Check for possible errors in the goal function.')
            goaldataconcat = np.concatenate(goaldata.dat)
            if self.save:
                makedirs('saveddata', exist_ok=True)
                goaldata.save(path.join('saveddata', 'e{}_goaldata.dat'.format(self._getEpoch())))

        # tica = TICA(metr, int(max(2, np.ceil(self.ticalag))))  # gianni: without project it was tooooo slow
        if self.ticadim > 0:
            ticalag = int(np.ceil(max(2, min(np.min(data.trajLengths) / 2, self.ticalag))))  # 1 < ticalag < (trajLen / 2)
            tica = TICA(data, ticalag)
            datatica = tica.project(self.ticadim)
            if not self._checkNFrames(datatica): return False
            self._createMSM(datatica)
        else:
            if not self._checkNFrames(data): return False
            self._createMSM(data)

        confstatdist = self.conformationStationaryDistribution(self._model)
        if self.actionspace == 'metric':
            if not data.K:
                data.cluster(self.clustmethod(n_clusters=self._numClusters(data.numFrames)))
            data_q = data.copy()
        elif self.actionspace == 'goal':
            data_q = goaldata.copy()
        elif self.actionspace == 'tica':
            data_q = datatica.copy()
        elif self.actionspace == 'ticapcca':
            data_q = datatica.copy()
            for traj in data_q.trajectories:
                traj.cluster = self._model.macro_ofcluster[traj.cluster]
            data_q.K = self._model.macronum

        if self.recluster:
            print('Reclustering with {}'.format(self.reclusterMethod))
            data_q.cluster(self.reclusterMethod)
        
        numstates = data_q.K
        print('Numstates: {}'.format(numstates))
        currepoch = self._getEpoch()
        q_values = np.zeros(numstates, dtype=np.float32)
        n_values = np.zeros(numstates, dtype=np.int32)

        if self.random:  # If random mode respawn from random action states
            action_sel = np.zeros(numstates, dtype=int)
            N = self.nmax - self._running
            randomactions = np.bincount(np.random.randint(numstates, size=N))
            action_sel[:len(randomactions)] = randomactions
            if self.save_qval:
                makedirs('saveddata', exist_ok=True)
                np.save(path.join('saveddata', 'e{}_actions.npy'.format(currepoch)), action_sel)
            relFrames = self._getSpawnFrames_UCB(action_sel, data_q)
            self._writeInputs(data.rel2sim(np.concatenate(relFrames)))
            return True

        if self.goalfunction is not None:
            ## For every cluster in data_q, get the max score and initialize
            if self.goal_preprocess is not None:
                goaldataconcat = self.goal_preprocess(goaldataconcat)
            qstconcat = np.concatenate(data_q.St)
            statemaxes = np.zeros(numstates)
            np.maximum.at(statemaxes, qstconcat, np.squeeze(goaldataconcat))
            if not self.pucb:
                goalenergies = -Kinetics._kB * self.temperature * np.log(1-statemaxes)
                q_values = goalenergies
                n_values += int((self.nframes / self._numClusters(self.nframes)) * self.goal_init) ## Needs nframes to be set properly!!!!!!!!

        rewardtraj = np.arange(data_q.numTrajectories) # Recalculate reward for all states
        rewards = self.getRewards(rewardtraj, data_q, confstatdist, numstates, self.reward_method, self.reward_mode, self.reward_window)
        for i in range(numstates):
            if len(rewards[i]) == 0:
                continue
            q_values[i] = updatingMean(q_values[i], n_values[i], rewards[i])
        n_values += np.array([len(x) for x in rewards])


        if self.save_qval:
            makedirs('saveddata', exist_ok=True)
            np.save(path.join('saveddata', 'e{}_qval.npy'.format(currepoch)), q_values)
            np.save(path.join('saveddata', 'e{}_nval.npy'.format(currepoch)), n_values)

        
        if self.pucb:
            ucb_values = np.array([self.count_pucb(q_values[clust], self.exploration, statemaxes[clust], currepoch + 1, n_values[clust]) for clust in range(numstates)])
        else:
            ucb_values = np.array([self.count_ucb(q_values[clust], self.exploration, currepoch + 1, n_values[clust]) for clust in range(numstates)])

        if self.save_qval:
            makedirs('saveddata', exist_ok=True)
            np.save(path.join('saveddata', 'e{}_ucbvals.npy'.format(currepoch)), ucb_values)

        N = self.nmax - self._running
        if self.actionpool <= 0:
            self.actionpool = N
       
        topactions = np.argsort(-ucb_values)[:self.actionpool]
        action = np.random.choice(topactions, N, replace=False)

        action_sel = np.zeros(numstates, dtype=int)
        action_sel[action] += 1
        while np.sum(action_sel) < N:  # When K is lower than N repeat some actions
            for a in action:
                action_sel[a] +=1
                if np.sum(action_sel) == N:
                    break

        if self.save_qval:
            np.save(path.join('saveddata', 'e{}_actions.npy'.format(currepoch)), action_sel)
        relFrames = self._getSpawnFrames_UCB(action_sel, data_q) 
        self._writeInputs(data.rel2sim(np.concatenate(relFrames)))
        return True

示例#7

0

显示文件

文件： adaptiverun.py 项目： xielm12/htmd

    def _algorithm(self):
        logger.info('Postprocessing new data')
        datalist = simlist(
            glob(path.join(self.datapath, '*', '')),
            glob(path.join(self.inputpath, '*', 'structure.pdb')),
            glob(path.join(self.inputpath, '*', '')))
        filtlist = simfilter(datalist,
                             self.filteredpath,
                             filtersel=self.filtersel)

        if hasattr(self, 'metricsel2') and self.metricsel2 is not None:
            proj = MetricDistance(self.metricsel1,
                                  self.metricsel2,
                                  metric=self.metrictype)
        else:
            proj = MetricSelfDistance(self.metricsel1, metric=self.metrictype)
        metr = Metric(filtlist, skip=self.skip)
        metr.projection(proj)
        data = metr.project()

        #if self.contactsym is not None:
        #    contactSymmetry(data, self.contactsym)

        data.dropTraj()
        if self.ticadim > 0:
            tica = TICA(data, int(max(2, np.ceil(20 / self.skip))))
            datadr = tica.project(self.ticadim)
        else:
            datadr = data

        K = int(
            max(np.round(0.6 * np.log10(datadr.numFrames / 1000) * 1000 + 50),
                100))  # heuristic
        if K > datadr.numFrames / 3:  # Freaking ugly patches ...
            K = int(datadr.numFrames / 3)

        datadr.cluster(self.clustmethod(n_clusters=K), mergesmall=5)
        replacement = False
        if datadr.K < 10:
            datadr.cluster(self.clustmethod(n_clusters=K))
            replacement = True

        model = Model(datadr)
        macronum = self.macronum
        if datadr.K < macronum:
            macronum = np.ceil(datadr.K / 2)
            logger.warning(
                'Using less macrostates than requested due to lack of microstates. macronum = '
                + str(macronum))

        from pyemma.msm import timescales_msm
        timesc = timescales_msm(datadr.St.tolist(),
                                lags=self.lag,
                                nits=macronum).get_timescales()
        macronum = min(self.macronum, max(np.sum(timesc > self.lag), 2))

        model.markovModel(self.lag, macronum)
        p_i = self._criteria(model, self.method)
        (spawncounts, prob) = self._spawn(p_i, self.nmax - self.running)
        logger.debug('spawncounts {}'.format(spawncounts))
        stateIdx = np.where(spawncounts > 0)[0]
        _, relFrames = model.sampleStates(stateIdx,
                                          spawncounts[stateIdx],
                                          statetype='micro',
                                          replacement=replacement)
        logger.debug('relFrames {}'.format(relFrames))

        self._writeInputs(datadr.rel2sim(np.concatenate(relFrames)))

示例#8

0

显示文件

def analyze_folder(folder=None,
                   out_folder="/tmp",
                   skip=1,
                   metrics=None,
                   clu=500,
                   tica=True,
                   ticadim=5,
                   tica_lag=20,
                   model_lag=10,
                   model_units='ns',
                   macro_N=10,
                   bulk_split=False,
                   fes=True,
                   rg_analysis=True,
                   save=True,
                   data_fstep=None):
    """Analysis script for create a Markov State Model
    
    Creates and returns a Markov State Model given a data folder.
    Intented to follow up the evolution of an adaptive sampling run.
    Allows to save the model ans several informative plots
    
    Parameters
    ----------
    folder : str
        Data folder where adaptive is running
    out_folder : str
        Output folder to store derived data
    skip : int
        Number of frames to skip while projecting the MD data
    metrics : [:class: `Metric` object]
        Metric array used to project the data
    clu : int
        Number of cluster to create using the MiniBatchKMeans method.
    tica: bool
        Wether to use TICA of GWPCA for dimensionality reduction
    ticadim : int
        Number of TICA dimension to project the data. If None, the model will be created using the raw projected data
    tica_lag : int, optional
        Description
    model_lag : int
        Number of ns used to create the model
    model_units : str, optional
        Description
    macro_N : int
        Number of macrostate to split the final Markov State Model
    fes : bool, optional
        If true it will save a plot projecting the first two TICA dimension. Requires ticadim to be defined
    rg_analysis : bool, optional
        If true, a plot with information relative to the radious of gyration of the molecule will be created.
    save : bool, optional
        If true, the model will be saved in the outputs folder
    
    Returns
    -------
    :class:`Model`
        Final model
    """
    from htmd.model import Model
    from htmd.molecule.molecule import Molecule
    from htmd.simlist import simlist
    from htmd.projections.metric import Metric
    from sklearn.cluster import MiniBatchKMeans
    from IDP_htmd.IDP_model import plot_RG
    from IDP_htmd.model_utils import create_bulk
    from glob import glob
    import os

    try:
        os.mkdir(out_folder)
    except:
        print("Folder already exists")

    try:
        fsims = np.load(f"{folder}/simlist.npy", allow_pickle=True)
        print(f"Loaded {folder}/simlist.npy")
    except:
        print("Creating simlist")
        sims = glob(folder + 'filtered/*/')
        fsims = simlist(sims, folder + 'filtered/filtered.pdb')
    metr = Metric(fsims, skip=skip)
    metr.set(metrics)

    #Check if this gives problems to ITS

    try:
        model = Model(file=f"{out_folder}/model.dat")
        out_data = model.data
        print(f"Loading model: {out_folder}/model.dat")
    except:
        if tica and ticadim:
            from htmd.projections.tica import TICA
            print("Projecting TICA")
            tica = TICA(metr, tica_lag)
            out_data = tica.project(ticadim)
        elif not tica and ticadim:
            from htmd.projections.gwpca import GWPCA
            data = metr.project()
            data.dropTraj()
            print("using GWPCA")
            gwpca = GWPCA(data, tica_lag)
            out_data = gwpca.project(ticadim)
        else:
            print("Not using TICA")
            data = metr.project()
            data.dropTraj()
            out_data = data

    #Avoid some possibles error while clustering
    if data_fstep: out_data.fstep = data_fstep
    x = True
    while x:
        try:
            out_data.cluster(MiniBatchKMeans(n_clusters=clu), mergesmall=5)
            x = False
        except Exception as e:
            raise Exception("Error " + str(e))

    model = Model(out_data)
    model.plotTimescales(plot=False, save=f"{out_folder}/1_its.png")

    if macro_N:
        model.markovModel(model_lag, macro_N, units=model_units)

        if bulk_split:
            try:
                print("Starting bulk splitting")
                create_bulk(model, bulk_split)
            except Exception as e:
                print("Could not perform the bulk splitting")
                print(e)

        model.eqDistribution(plot=False,
                             save=f"{out_folder}/1.2_eqDistribution.png")

        if rg_analysis:
            from IDP_htmd.IDP_analysis import rg_analysis
            mol = Molecule(model.data.simlist[0].molfile)
            rg_data = rg_analysis(model, skip=skip)
            plot_RG(rg_data, mol, save=f"{out_folder}/1.4_rg.png")

        # if fes and ticadim:
        # model.plotFES(0, 1, temperature=310, states=True,
        #     plot=False, save=f"{out_folder}/1.3_fes.png")

    if save:
        model.save(f"{out_folder}/model.dat")

    return model

示例#9

0

显示文件

    def _createMSM(self,
                   epoch,
                   output_folder,
                   basedata=None,
                   skip=1,
                   clusters=0,
                   ticadim=0,
                   ticalag=20,
                   macronum=2,
                   modellag=5,
                   modelunits="frames",
                   fstep=None,
                   data2combine=None):
        from htmd.projections.tica import TICA
        from sklearn.cluster import MiniBatchKMeans
        from htmd.model import Model

        try:
            model = Model(
                file=f"{output_folder}/{self.analysis_type[0]}{epoch}_model.dat"
            )

            if (model.macronum != macronum or model.lag != modellag):
                model.markovModel(modellag, macronum, units=modelunits)
            print("Model loaded")
        except:
            if not self.precalculated_data and not self.low_memory_usage:
                print("Calculating PRECALC DATA")
                precalc_data = self._precalculateData(self.precalc_metric,
                                                      self.input_folder,
                                                      fstep=fstep,
                                                      skip=skip)
                self.precalc_data = precalc_data
                self.precalculated_data = True

            if self.analysis_type == "epoch" and not self.low_memory_usage:
                epoch_sim = np.concatenate(
                    np.array([
                        self.epoch_sim_indexes[i] for i in range(1, epoch + 1)
                        if i in list(self.epoch_sim_indexes.keys())
                    ]))
                drop_traj_idx = np.ones(self.precalc_data.numTrajectories)
                drop_traj_idx[epoch_sim] = 0
                drop_idx = np.where(drop_traj_idx == 1)[0]
            elif self.analysis_type == "sims" and not self.low_memory_usage:
                drop_traj_idx = np.ones(self.precalc_data.numTrajectories)
                no_drop_idx = np.arange(1, epoch)
                drop_traj_idx[no_drop_idx] = 0
                drop_idx = np.where(drop_traj_idx == 1)[0]

            if not self.low_memory_usage:
                data = self.precalc_data.copy()
                data.dropTraj(idx=drop_idx)
                data.dropTraj()

            if basedata:
                from htmd.projections.metric import MetricData
                r_fit = self._fitBaseline(data, basedata)
                data = MetricData(dat=r_fit, simlist=data.simlist)
            elif ticadim and not self.low_memory_usage:
                tica = TICA(data, ticalag)
                data = tica.project(ticadim)
            elif ticadim and self.low_memory_usage:
                from htmd.projections.metric import Metric
                if self.analysis_type == "epoch":
                    epoch_sim = np.concatenate(
                        np.array([
                            self.epoch_sim_indexes[i]
                            for i in range(1, epoch + 1)
                            if i in list(self.epoch_sim_indexes.keys())
                        ]))
                else:
                    epoch_sim = range(0, epoch)
                metr = Metric(self._sims[epoch_sim], skip=skip)
                metr.set(self.precalc_metric)
                tica = TICA(metr, ticalag)
                data = tica.project(ticadim)
            if not clusters:
                clusters = self._numClusters(data.numFrames)

            if data2combine:
                try:
                    print("Adding extra dimension")
                    data2combine_copy = data2combine.copy()
                    data2combine_copy.dropTraj(keepsims=data.simlist)
                    data.combine(data2combine_copy)
                except Exception as e:
                    print("Could not combined data", str(e))

            data.cluster(MiniBatchKMeans(clusters), mergesmall=5)
            model = Model(data)
            model.markovModel(modellag, macronum, units=modelunits)
            model.save(
                f"{output_folder}/{self.analysis_type[0]}{epoch}_model.dat")

        for name, met in self.associated_metrics.items():
            try:
                self.associated_data[name]
            except:
                print(f"Calcualtion associted data - {name.upper()}")
                assoc_data = self._precalculateData(met,
                                                    self.input_folder,
                                                    fstep=fstep,
                                                    skip=skip)
                self.associated_data[name] = assoc_data

        for name, data in self.associated_data.items():
            tmp_data = data.copy()
            tmp_data.dropTraj(keepsims=model.data.simlist)
            self.tmp_associated_data[name] = tmp_data

        return model