def _calculateMolProp(self, mol, props="all"):
        """Build the per-molecule index tables in a single pass.

        All tables are computed together on a copy of ``mol`` filtered by
        ``self.sel``; the indices therefore refer to the filtered molecule.

        Parameters
        ----------
        mol : :class:`Molecule <moleculekit.molecule.Molecule>`
            The molecule to analyse. It is copied, never modified.
        props : str
            Not read by this implementation; every table is always returned.

        Returns
        -------
        res : dict
            ``ca_indices`` (int32 indices of atoms named CA),
            ``nco_indices`` (int32, one row per residue holding the backbone
            N, C and O atom indices, -1 where an atom was not found),
            ``proline_indices`` (int32 flags, 1 where the CA atom belongs to a
            PRO residue) and ``chain_ids`` (int32 chain number of each CA atom).
        """
        from moleculekit.util import sequenceID

        # Filter a private copy so the caller's molecule stays intact
        mol = mol.copy()
        mol.filter(self.sel, _logger=False)

        residues = sequenceID((mol.resid, mol.chain, mol.insertion))
        backbone = mol.atomselect("backbone")

        ca_indices = np.where(mol.name == "CA")[0].astype(np.int32)
        proline_indices = np.array(mol.resname[ca_indices] == "PRO", dtype=np.int32)

        # Map the chain labels of the CA atoms to consecutive integers
        _, chain_ids = np.unique(mol.chain[ca_indices], return_inverse=True)
        chain_ids = chain_ids.astype(np.int32)

        # One row per residue, columns are N / C / O; -1 marks a missing atom
        nco_indices = -np.ones((residues.max() + 1, 3), dtype=np.int32)
        for column, atomname in enumerate(("N", "C", "O")):
            atomidx = np.where((mol.name == atomname) & backbone)[0]
            nco_indices[residues[atomidx], column] = atomidx

        return {
            "ca_indices": ca_indices,
            "nco_indices": nco_indices,
            "proline_indices": proline_indices,
            "chain_ids": chain_ids,
        }
示例#2
0
def _checkChainAndSegid(mol, _loggerLevel):
    """Validate the chain / segid fields of a Molecule and print a chain report.

    Parameters
    ----------
    mol : Molecule
        The molecule to check. It is copied only when chains have to be
        derived from the segment IDs; otherwise the same object is returned.
    _loggerLevel : str or None
        The chain report (and the chain-length check that precedes it) is
        only executed when this is ``None`` or ``'INFO'``.

    Returns
    -------
    mol : Molecule
        The input molecule, or a copy whose ``chain`` field was filled from
        the segment IDs.

    Raises
    ------
    RuntimeError
        If neither chains nor segids are defined, or if a chain identifier is
        longer than a single character.
    """
    from moleculekit.util import sequenceID

    emptychains = mol.chain == ''
    emptysegids = mol.segid == ''

    if np.all(emptychains) and np.all(emptysegids):
        raise RuntimeError(
            'No chains or segments defined in Molecule.chain / Molecule.segid. Please assign either to continue with preparation.'
        )

    if np.all(emptychains) and np.any(~emptysegids):
        logger.info(
            'No chains defined in Molecule. Using segment IDs as chains for protein preparation.'
        )
        mol = mol.copy()
        mol.chain = sequenceID(mol.segid)

    # The emptiness masks were taken before the fallback above, so this
    # consistency check only runs when the input itself had both fields set.
    if np.any(~emptysegids) and np.any(~emptychains):
        chainseq = sequenceID(mol.chain)
        segidseq = sequenceID(mol.segid)
        if not np.array_equal(chainseq, segidseq):
            logger.warning(
                'Both chains and segments are defined in Molecule.chain / Molecule.segid, however they are inconsistent. '
                'Protein preparation will use the chain information.')

    if _loggerLevel is None or _loggerLevel == 'INFO':
        chainids = np.unique(mol.chain)
        # Measure the textual form: after the segid fallback the chain IDs may
        # be integers, on which a bare len() would raise TypeError.
        if any(len(str(cc)) > 1 for cc in chainids):
            raise RuntimeError(
                'The chain field should only contain a single character.')

        print('\n---- Molecule chain report ----')
        for c in chainids:
            chainatoms = np.where(mol.chain == c)[0]
            firstatom = chainatoms[0]
            lastatom = chainatoms[-1]
            print(f'Chain {c}:')
            print(
                f'    First residue: {mol.resname[firstatom]}:{mol.resid[firstatom]}:{mol.insertion[firstatom]}'
            )
            print(
                f'    Final residue: {mol.resname[lastatom]}:{mol.resid[lastatom]}:{mol.insertion[lastatom]}'
            )
        print('---- End of chain report ----\n')

    return mol
示例#3
0
    def project(self, mol):
        """Project molecule.

        Computes the squared deviation of every selected atom from a reference
        position in each frame. The reference is the mean over the frames of
        ``mol`` when no ``refmol`` was given, otherwise the projected
        coordinates of ``refmol``.

        Parameters
        ----------
        mol : :class:`Molecule <moleculekit.molecule.Molecule>`
            A :class:`Molecule <moleculekit.molecule.Molecule>` object to project.

        Returns
        -------
        data : np.ndarray
            An array containing the projected data: one column per atom in
            ``'atom'`` mode, or the per-residue mean of the atom values in
            ``'residue'`` mode.

        Raises
        ------
        RuntimeError
            If the mode is neither ``'atom'`` nor ``'residue'``.
        """
        coords = super().project(mol)

        if self._refmol is None:
            refcoords = np.mean(coords, axis=0)
        else:
            # Only wrap the reference when wrapping was requested at all;
            # previously the reference was always projected with pbc=True
            # whenever self._pbc was disabled.
            _wrapref = bool(self._pbc)
            if _wrapref:
                refbox = self._refmol.box
                if refbox is None or len(refbox) == 0 or np.all(refbox == 0):
                    logger.warning(
                        "refmol doesn't contain periodic box information and will not be wrapped."
                    )
                    _wrapref = False
            refcoords = _MetricCoordinate(atomsel=self._atomsel,
                                          refmol=self._refmol,
                                          pbc=_wrapref).project(self._refmol)

        mapping = super().getMapping(mol)
        xyzgroups = mapping.groupby('atomIndexes').groups
        numatoms = len(xyzgroups)

        resids = sequenceID(mol.resid)

        atomfluct = np.zeros((coords.shape[0], numatoms))
        squarediff = (coords - refcoords)**2
        atomresids = np.zeros(numatoms, dtype=int)
        # Each group holds the projected columns that belong to one atom;
        # visit the groups ordered by their first column.
        for i, atom in enumerate(sorted(xyzgroups.values(),
                                        key=lambda x: x[0])):
            assert len(np.unique(mapping.atomIndexes[atom])) == 1
            atomfluct[:, i] = squarediff[:, atom].sum(axis=1)
            atomresids[i] = resids[int(mapping.atomIndexes[atom[0]])]

        if self._mode == 'atom':
            return atomfluct
        elif self._mode == 'residue':
            uqresids = np.unique(atomresids)
            meanresfluct = np.zeros((coords.shape[0], len(uqresids)))
            for i, r in enumerate(uqresids):
                meanresfluct[:, i] = atomfluct[:, atomresids == r].mean(axis=1)
            return meanresfluct
        else:
            raise RuntimeError(
                'Invalid mode {} given. Choose between `atom` and `residue`'.
                format(self._mode))
示例#4
0
    def _calculateMolProp(self, mol, props='all'):
        """Compute the selection-dependent properties of a molecule.

        Parameters
        ----------
        mol : Molecule
            The molecule on which ``self._sel`` and ``self._filtersel`` are
            evaluated.
        props : str or iterable of str
            ``'all'`` or any subset of ``'radii'``, ``'atom_mapping'``,
            ``'sel'``, ``'filtersel'`` and ``'tokeep'``.

        Returns
        -------
        res : dict
            The requested properties:
            ``sel`` / ``filtersel`` are the atom selection results,
            ``tokeep`` holds, for every ``sel`` atom, its position within the
            ``filtersel`` subset, ``radii`` are float32 radii of the
            ``filtersel`` atoms plus ``self._probeRadius`` and
            ``atom_mapping`` is an int32 array assigning every ``filtersel``
            atom to an output column (one per atom or one per residue).

        Raises
        ------
        RuntimeError
            If ``sel`` selects atoms which ``filtersel`` does not.
        KeyError
            If the first character of an atom name has no tabulated radius.
        ValueError
            If ``self._mode`` is neither ``'atom'`` nor ``'residue'``.
        """
        props = ('radii', 'atom_mapping', 'sel', 'filtersel',
                 'tokeep') if props == 'all' else props
        res = {}

        sel = mol.atomselect(self._sel)
        selidx = np.where(sel)[0]
        if 'sel' in props:
            res['sel'] = sel

        filtersel = mol.atomselect(self._filtersel)
        filterselidx = np.where(filtersel)[0]
        if 'filtersel' in props:
            res['filtersel'] = filtersel

        if len(np.setdiff1d(selidx, filterselidx)) != 0:
            raise RuntimeError(
                'Some atoms selected by `sel` are not selected by `filtersel` and thus would not be calculated. Make sure `sel` is a subset of `filtersel`.'
            )

        if 'tokeep' in props:
            # -1 for atoms outside filtersel, running index for atoms inside
            filterselmod = filtersel.copy().astype(int)
            filterselmod[filterselmod == 0] = -1
            filterselmod[filtersel] = np.arange(np.count_nonzero(filtersel))
            res['tokeep'] = filterselmod[sel]

        if 'radii' in props:
            _ATOMIC_RADII = {
                'C': 1.5,
                'F': 1.2,
                'H': 0.4,
                'N': 1.10,
                'O': 1.05,
                'S': 1.6,
                'P': 1.6
            }
            # The first character of the atom name is used as the element.
            # A plain lookup list (rather than np.vectorize, which raises on
            # size-0 input) keeps an empty selection from crashing.
            atom_radii = [_ATOMIC_RADII[n[0]] for n in mol.name[filtersel]]
            res['radii'] = np.array(atom_radii, np.float32) + self._probeRadius

        if 'atom_mapping' in props:
            if self._mode == 'atom':
                res['atom_mapping'] = np.arange(np.sum(filtersel),
                                                dtype=np.int32)
            elif self._mode == 'residue':
                from moleculekit.util import sequenceID
                res['atom_mapping'] = sequenceID(
                    (mol.resid[filtersel], mol.chain[filtersel],
                     mol.segid[filtersel])).astype(np.int32)
            else:
                raise ValueError(
                    'mode must be one of "residue", "atom". "{}" supplied'.
                    format(self._mode))

        return res
示例#5
0
    def calculateVariables(currmol):
        """Collect the residue numbering, sequence and CA coordinates of a molecule.

        Returns a tuple ``(reslen, res, seq, coords)``: the number of unique
        residues, their int32 sequence IDs, the sequence as a ``c_char_p``
        (residue names of the CA atoms translated through
        ``_residueNameTable``) and a copy of the CA atom coordinates.
        """
        is_ca = currmol.name == 'CA'

        # Unique running residue numbers over resid/insertion/segid/chain
        uqres = np.unique(
            sequenceID((currmol.resid, currmol.insertion, currmol.segid,
                        currmol.chain)))

        # Sequence string built from the residue names of the CA atoms
        letters = [_residueNameTable[rn] for rn in currmol.resname[is_ca]]
        seq = ct.c_char_p(''.join(letters).encode('utf-8'))

        # Only the CA coordinates are kept
        cacoords = currmol.coords[is_ca, :, :].copy()
        return len(uqres), uqres.astype(np.int32), seq, cacoords
示例#6
0
def removeAtomsInHull(mol1, mol2, hullsel, removesel):
    """Remove the atoms of mol2 which lie inside the convex hull of a selection of mol1.

    mol2 is split into fragments by (resid, segid). A fragment counts as
    lying inside the hull when adding its atoms to the hull points leaves the
    hull vertices unchanged.

    Parameters
    ----------
    mol1 : :class:`Molecule <moleculekit.molecule.Molecule>` object
        Molecule providing the atoms of the convex hull
    mol2 : :class:`Molecule <moleculekit.molecule.Molecule>` object
        Molecule whose fragments are tested against the hull. A copy is modified.
    hullsel : str
        Atom selection string for the atoms of mol1 which span the convex hull.
        See more `here <http://www.ks.uiuc.edu/Research/vmd/vmd-1.9.2/ug/node89.html>`__
    removesel : str
        Atom selection string for the atoms of mol2 which may be removed.
        See more `here <http://www.ks.uiuc.edu/Research/vmd/vmd-1.9.2/ug/node89.html>`__

    Returns
    -------
    newmol2 : Molecule
        Copy of mol2 without the `removesel` atoms of fragments inside the hull
    numrem : int
        Number of fragments found inside the hull (counted irrespective of `removesel`)
    """
    # TODO: Look into Morphological Snakes
    from scipy.spatial import ConvexHull

    mol2 = mol2.copy()

    # Reference hull and its vertex list, computed once
    hullcoords = mol1.get("coords", hullsel)
    refvertices = list(ConvexHull(hullcoords).vertices)

    fragids = sequenceID((mol2.resid, mol2.segid))
    insidehull = np.zeros(len(fragids), dtype=bool)
    numinside = 0
    for frag in np.unique(fragids):
        fragatoms = np.where(fragids == frag)[0]
        fragcoords = mol2.get("coords", sel=fragatoms)
        grown = ConvexHull(np.vstack((hullcoords, fragcoords)))

        # A changed vertex list means the fragment sticks out of the hull
        if list(grown.vertices) != refvertices:
            continue
        insidehull[fragatoms] = True
        numinside += 1

    removable = mol2.atomselect(removesel)
    mol2.remove(insidehull & removable)
    return mol2, numinside
示例#7
0
def embed(mol1, mol2, gap=1.3):
    """Merge two molecules, dropping the residues of mol2 which collide with mol1.

    Both inputs are copied. mol1 is appended to mol2 and every residue of
    mol2 with an atom closer than `gap` to an atom of mol1 is removed.

    Parameters
    ----------
    mol1 : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The molecule which is kept complete
    mol2 : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The molecule from which colliding residues are removed
    gap : float
        Minimum space in A between atoms of the two molecules

    Returns
    -------
    newmol : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The combined Molecule object

    Example
    -------
    >>> all = embed(memb, prot)
    """
    mol1 = mol1.copy()
    merged = mol2.copy()

    # Tag the origin of every atom through the occupancy field,
    # remembering the real values so they can be put back afterwards
    saved_occ1 = mol1.get("occupancy")
    saved_occ2 = merged.get("occupancy")
    mol1.set("occupancy", 1)
    merged.set("occupancy", 2)

    merged.append(mol1)
    from_mol1 = merged.atomselect("occupancy 1")
    from_mol2 = merged.atomselect("occupancy 2")

    # Temporarily store a running residue number in the beta field
    saved_beta = merged.get("beta")
    merged.set("beta", sequenceID(merged.resid))

    # Whole residues of mol2 which come within the gap of mol1
    clashsel = f"(occupancy 2) and same beta as exwithin {gap!s} of (occupancy 1)"
    overlaps = merged.atomselect(clashsel)

    # Put the original beta and occupancy values back
    merged.set("beta", saved_beta)
    merged.set("occupancy", saved_occ1, from_mol1)
    merged.set("occupancy", saved_occ2, from_mol2)

    merged.remove(overlaps, _logger=False)
    return merged
示例#8
0
    def _calculateMolProp(self, mol, props='all'):
        """Compute the selection-dependent properties of a molecule.

        Parameters
        ----------
        mol : Molecule
            The molecule on which ``self._sel`` is evaluated.
        props : str or iterable of str
            ``'all'`` or any subset of ``'radii'``, ``'atom_mapping'`` and
            ``'sel'``.

        Returns
        -------
        res : dict
            The requested properties: ``sel`` is the atom selection result,
            ``radii`` are float32 radii of the selected atoms plus
            ``self._probeRadius`` and ``atom_mapping`` is an int32 array
            assigning every selected atom to an output column (one per atom
            or one per residue).

        Raises
        ------
        KeyError
            If the first character of an atom name has no tabulated radius.
        ValueError
            If ``self._mode`` is neither ``'atom'`` nor ``'residue'``.
        """
        props = ('radii', 'atom_mapping', 'sel') if props == 'all' else props
        res = {}

        sel = mol.atomselect(self._sel)
        if 'sel' in props:
            res['sel'] = sel

        if 'radii' in props:
            _ATOMIC_RADII = {
                'C': 1.5,
                'F': 1.2,
                'H': 0.4,
                'N': 1.10,
                'O': 1.05,
                'S': 1.6,
                'P': 1.6
            }
            # The first character of the atom name is used as the element.
            # A plain lookup list (rather than np.vectorize, which raises on
            # size-0 input) keeps an empty selection from crashing.
            atom_radii = [_ATOMIC_RADII[n[0]] for n in mol.name[sel]]
            res['radii'] = np.array(atom_radii, np.float32) + self._probeRadius

        if 'atom_mapping' in props:
            if self._mode == 'atom':
                res['atom_mapping'] = np.arange(np.sum(sel), dtype=np.int32)
            elif self._mode == 'residue':
                from moleculekit.util import sequenceID
                res['atom_mapping'] = sequenceID(
                    (mol.resid[sel], mol.chain[sel],
                     mol.segid[sel])).astype(np.int32)
            else:
                raise ValueError(
                    'mode must be one of "residue", "atom". "{}" supplied'.
                    format(self._mode))

        return res
示例#9
0
def tileMembrane(memb, xmin, ymin, xmax, ymax, buffer=1.5):
    """ Tile a membrane in the X and Y dimensions to reach a specific size.

    The tile size is taken from the extent of the water atoms of `memb`.
    Copies of the membrane are placed on a grid starting at (xmin, ymin),
    residues reaching beyond xmax / ymax are cut off and each tile gets its
    own segids (``M<k>`` for non-water, ``MW<k>`` for water).

    Parameters
    ----------
    memb : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The membrane to be tiled
    xmin : float
        Minimum x coordinate
    ymin : float
        Minimum y coordinate
    xmax : float
        Maximum x coordinate
    ymax : float
        Maximum y coordinate
    buffer : float
        Buffer distance between tiles

    Returns
    -------
    megamemb :
        A big membrane Molecule
    """
    from tqdm import tqdm
    from moleculekit.molecule import Molecule

    memb = memb.copy()
    # Renumber so that every residue has a unique resid for the
    # "same resid as" selections used below
    memb.resid = sequenceID(
        (memb.resid, memb.insertion, memb.chain, memb.segid))

    # Fetch the water coordinates once; they define origin and size of a tile
    watercoords = memb.get('coords', 'water')
    minmemb = np.min(watercoords, axis=0).flatten()
    size = np.max(watercoords, axis=0).flatten() - minmemb

    xreps = int(np.ceil((xmax - xmin) / size[0]))
    yreps = int(np.ceil((ymax - ymin) / size[1]))

    logger.info('Replicating Membrane {}x{}'.format(xreps, yreps))

    megamemb = Molecule()
    k = 0
    # The context manager closes the bar even if a tile fails, and the bar is
    # advanced for every grid position, including tiles that end up empty.
    with tqdm(total=xreps * yreps, desc='Replicating Membrane') as bar:
        for x in range(xreps):
            for y in range(yreps):
                tmpmemb = memb.copy()
                xpos = xmin + x * (size[0] + buffer)
                ypos = ymin + y * (size[1] + buffer)

                tmpmemb.moveBy(
                    [-float(minmemb[0]) + xpos, -float(minmemb[1]) + ypos, 0])
                tmpmemb.remove('same resid as (x > {} or y > {})'.format(
                    xmax, ymax),
                               _logger=False)

                if tmpmemb.numAtoms != 0:
                    tmpmemb.set('segid', 'M{}'.format(k), sel='not water')
                    tmpmemb.set('segid', 'MW{}'.format(k), sel='water')
                    megamemb.append(tmpmemb)
                    k += 1
                bar.update(1)

    # Membranes don't tile perfectly. Need to remove waters that clash with lipids of other tiles
    # Some clashes will still occur between periodic images however
    megamemb.remove('same resid as water and within 1.5 of not water',
                    _logger=False)
    return megamemb
示例#10
0
def prepareProteinForAtomtyping(mol,
                                guessBonds=True,
                                protonate=True,
                                pH=7.4,
                                segment=True,
                                verbose=True):
    """Prepares a Molecule object for atom typing.

    The molecule is split into protein, metal and water parts. The protein
    part is optionally protonated and re-segmented, metals and waters get
    their own segment / chain, and the three parts are joined again.

    Parameters
    ----------
    mol : Molecule object
        The protein to prepare. It is copied, never modified.
    guessBonds : bool
        Drops the bonds in the molecule and guesses them from scratch
    protonate : bool
        Protonates the protein for the given pH using `systemPrepare`
    pH : float
        The pH for protonation
    segment : bool
        Reassigns the segments and chains of the protein with `autoSegment2`
        and moves metals and waters into segments of their own
    verbose : bool
        Set to False to turn off the printing

    Returns
    -------
    mol : Molecule object
        The prepared Molecule

    Raises
    ------
    RuntimeError
        If the molecule has no protein atoms, or has atoms which are neither
        protein, metal nor water.
    AssertionError
        If the protein already uses the chain / segid reserved for the
        metals or the waters.
    """
    from moleculekit.tools.autosegment import autoSegment2
    from moleculekit.util import sequenceID

    mol = mol.copy()
    # Bonds are needed up front, both for the atom selections and for autoSegment
    if guessBonds:
        mol.bondtype = np.array([], dtype=object)
        mol.bonds = mol._guessBonds()

    protsel = mol.atomselect("protein")
    metalsel = mol.atomselect("element " + " ".join(metal_atypes))
    watersel = mol.atomselect("water")
    notallowed = ~protsel & ~metalsel & ~watersel

    if not np.any(protsel):
        raise RuntimeError("No protein atoms found in Molecule")

    if np.any(notallowed):
        resnames = np.unique(mol.resname[notallowed])
        raise RuntimeError(
            "Found atoms with resnames {} in the Molecule which can cause issues with the voxelization. Please make sure to only pass protein atoms and metals."
            .format(resnames))

    def _subset(selection):
        # Independent copy of the molecule reduced to one selection
        part = mol.copy()
        part.filter(selection, _logger=False)
        return part

    protmol = _subset(protsel)
    metalmol = _subset(metalsel)
    watermol = _subset(watersel)

    if protonate:
        from moleculekit.tools.preparation import systemPrepare

        # The preparation needs segments, so create them when none exist
        unsegmented = np.all(protmol.segid == "") and np.all(protmol.chain == "")
        if unsegmented:
            protmol = autoSegment2(protmol,
                                   fields=("segid", "chain"),
                                   basename="K",
                                   _logger=verbose)
        loglevel = "INFO" if verbose else "ERROR"
        protmol = systemPrepare(protmol,
                                pH=pH,
                                verbose=verbose,
                                _logger_level=loglevel)

    if guessBonds:
        protmol.bonds = protmol._guessBonds()
        # TODO: Should we remove bonds between metals and protein?

    if segment:
        # Segments are reassigned after the preparation
        protmol = autoSegment2(protmol,
                               fields=("segid", "chain"),
                               _logger=verbose)

        # Metals get a segment of their own just in case pybel takes that into account
        if np.any(protmol.chain == "Z") or np.any(protmol.segid == "ME"):
            raise AssertionError(
                "Report this issue on the moleculekit github issue tracker. Too many chains in the protein."
            )
        metalmol.segid[:] = "ME"
        metalmol.chain[:] = "Z"
        # Every second resid after the last protein resid: the gap keeps the
        # metals from being considered one chain
        metalmol.resid[:] = (
            np.arange(0, 2 * metalmol.numAtoms, 2) + protmol.resid.max() + 1)

        if watermol.numAtoms != 0:
            if np.any(protmol.chain == "W") or np.any(protmol.segid == "WX"):
                raise AssertionError(
                    "Report this issue on the moleculekit github issue tracker. Too many chains in the protein."
                )
            waterids = sequenceID(
                (watermol.resid, watermol.segid, watermol.chain), step=2)
            watermol.resid[:] = waterids
            watermol.segid[:] = "WX"
            watermol.chain[:] = "W"

    combined = protmol.copy()
    combined.append(metalmol)
    combined.append(watermol)
    return combined
示例#11
0
文件: ionize.py 项目: yulanl22/htmd
def ionizePlace(mol,
                anion_resname,
                cation_resname,
                anion_name,
                cation_name,
                nanion,
                ncation,
                dfrom=5,
                dbetween=5,
                segname=None):
    """Place a given number of negative and positive ions in the solvent.

    Replaces water molecules as long as they respect the given distance criteria.
    Candidate waters are drawn at random; a draw which is too close to an
    already chosen ion is discarded. If a whole pass over the candidates
    cannot place all ions, the placement is restarted, up to 10 times.

    Parameters
    ----------
    mol : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The Molecule object. It is copied, never modified.
    anion_resname : str
        Resname of the added anions
    cation_resname : str
        Resname of the added cations
    anion_name : str
        Name of the added anions
    cation_name : str
        Name of the added cations
    nanion : int
        Number of anions to add
    ncation : int
        Number of cations to add
    dfrom : float
        Min distance of ions from molecule
    dbetween : float
        Min distance between ions
    segname : str
        Currently not used by this function; the ions are always placed in
        chain and segid 'I'.

    Returns
    -------
    mol : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The molecule with the ions added

    Raises
    ------
    NameError
        If no water is far enough from the rest of the system, or if the
        ions could not be placed within the allowed number of attempts.
    """

    newmol = mol.copy()

    logger.debug('Min distance of ions from molecule: ' + str(dfrom) + 'A')
    logger.debug('Min distance between ions: ' + str(dbetween) + 'A')
    logger.debug('Placing {:d} anions and {:d} cations.'.format(
        nanion, ncation))

    if (nanion + ncation) == 0:
        return newmol

    nions = nanion + ncation

    # The beta field temporarily carries a unique number per residue so that
    # whole water molecules can be found from a single atom later on
    betabackup = newmol.beta.copy()
    newmol.set('beta',
               sequenceID((newmol.resid, newmol.insertion, newmol.segid)))

    # Find water oxygens to replace with ions
    ntries = 0
    maxtries = 10
    while True:
        ionlist = []
        watindex = newmol.atomselect('noh and water and not (within ' +
                                     str(dfrom) + ' of not water)',
                                     indexes=True)

        if len(watindex) == 0:
            raise NameError(
                'No waters could be found further than ' + str(dfrom) +
                ' from other molecules to be replaced by ions. You might need to solvate with a bigger box or disable the ionize property when building.'
            )

        while len(ionlist) < nions and len(watindex) != 0:
            randwat = np.random.randint(len(watindex))
            thision = watindex[randwat]
            # The drawn water leaves the pool whether it is accepted or not.
            # A rejected water can never become valid in this attempt (ions
            # are only added), and keeping it could make this loop spin
            # forever without ever reaching the retry limit.
            watindex = np.delete(watindex, randwat)

            if len(ionlist) != 0:  # Check for distance from previous ions
                ionspos = newmol.get('coords', sel=ionlist)
                thispos = newmol.get('coords', sel=thision)
                dists = distance.cdist(np.atleast_2d(ionspos),
                                       np.atleast_2d(thispos),
                                       metric='euclidean')
                if np.any(dists < dbetween):
                    continue
            ionlist.append(thision)

        if len(ionlist) == nions:
            break

        ntries += 1
        if ntries == maxtries:
            raise NameError(
                'Failed to add ions after ' + str(maxtries) +
                " attempts. Try decreasing the 'dfrom' and 'dbetween'"
                ' parameters, decreasing ion concentration or making a larger water box.'
            )

    # Delete waters but keep their coordinates
    waterpos = np.atleast_2d(newmol.get('coords', ionlist))
    betasel = np.zeros(newmol.numAtoms, dtype=bool)
    for b in newmol.beta[ionlist]:
        betasel |= newmol.beta == b
    sel = np.where(betasel)[0]
    newmol.remove(sel, _logger=False)
    betabackup = np.delete(betabackup, sel)

    # Add the ions at randomly assigned former water positions
    randidx = np.random.permutation(np.size(waterpos, 0))
    atom = Molecule()
    atom.empty(1)
    atom.set('chain', 'I')
    atom.set('segid', 'I')

    # Anions first, then cations, each appended as a new residue
    ionspecs = ([(anion_name, anion_resname)] * nanion +
                [(cation_name, cation_resname)] * ncation)
    for pos, (ionname, ionresname) in zip(randidx, ionspecs):
        atom.set('name', ionname)
        atom.set('resname', ionresname)
        atom.set('resid', newmol.resid[-1] + 1)
        atom.coords = waterpos[pos, :]
        newmol.insert(atom, len(newmol.name))

    # Restoring the original betas
    newmol.beta[:len(betabackup)] = betabackup
    return newmol
示例#12
0
def _charmmLipid2Amber(mol):
    """ Convert a CHARMM lipid membrane to AMBER format

    Atoms and residues are renamed following the rules read from
    ``builder/charmmlipid2amber.csv``, the atoms of every converted residue
    are reordered according to those rules, and the resulting lipids are
    renumbered and assigned alternating segids ``L0`` / ``L1``.

    Parameters
    ----------
    mol : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The Molecule object containing the membrane

    Returns
    -------
    newmol : :class:`Molecule <moleculekit.molecule.Molecule>` object
        A new Molecule object with the membrane converted to AMBER
    """
    rulesfile = os.path.join(home(shareDir=True), 'builder',
                             'charmmlipid2amber.csv')
    resdict = _readcsvdict(rulesfile)

    natoms = mol.numAtoms
    # Position of every atom after the renaming; starts as the identity
    neworder = np.array(list(range(natoms)))

    begs = np.zeros(natoms, dtype=bool)  # first atom of a translated unit
    begters = np.zeros(natoms, dtype=bool)  # first atom, rule flagged as ter
    finters = np.zeros(natoms, dtype=bool)  # last atom, rule flagged as ter

    mol = mol.copy()
    incrresids = sequenceID((mol.resid, mol.insertion, mol.segid))

    # Walk through the translation dictionary
    for res, atommap in resdict.items():
        molresidx = mol.resname == res
        if not np.any(molresidx):
            continue
        # Match against a snapshot of the names, otherwise atoms which were
        # already renamed could be modified a second time
        names = mol.name.copy()

        for atom, rule in atommap.items():
            molatomidx = np.zeros(len(names), dtype=bool)
            molatomidx[molresidx] = names[molresidx] == atom

            mol.set('resname', rule.replaceresname, sel=molatomidx)
            mol.set('name', rule.replaceatom, sel=molatomidx)
            neworder[molatomidx] = rule.order

            isfirst = rule.order == 0
            islast = rule.order == rule.natoms - 1
            if isfirst:
                begs[molatomidx] = True
                if rule.ter:
                    begters[molatomidx] = True
            if islast and rule.ter:
                finters[molatomidx] = True

    # Turn the per-residue orders into global positions by adding the index
    # of the first atom of the residue
    for resnum in np.unique(incrresids[begs]):
        members = np.where(incrresids == resnum)[0]
        beg = members[0]
        fin = members[-1] + 1
        neworder[beg:fin] = neworder[beg:fin] + beg
    idx = np.argsort(neworder)

    _reorderMol(mol, idx)

    # Locate the ter-flagged first and last atoms in the new ordering
    begters = np.where(begters[idx])[0]
    finters = np.where(finters[idx])[0]

    for i, first in enumerate(begters):
        lipidmask = np.zeros(len(mol.resid), dtype=bool)
        lipidmask[first:finters[i] + 1] = True
        mol.set('resid',
                sequenceID(mol.get('resname', sel=lipidmask)),
                sel=lipidmask)
        mol.set('segid', 'L{}'.format(i % 2), sel=lipidmask)

    return mol
示例#13
0
def build(mol,
          ff=None,
          topo=None,
          param=None,
          prefix='structure',
          outdir='./build',
          caps=None,
          ionize=True,
          saltconc=0,
          saltanion=None,
          saltcation=None,
          disulfide=None,
          teleap=None,
          teleapimports=None,
          execute=True,
          atomtypes=None,
          offlibraries=None,
          gbsa=False,
          igb=2):
    """ Builds a system for AMBER

    Uses tleap to build a system for AMBER. Additionally it allows the user to ionize and add disulfide bridges.

    Parameters
    ----------
    mol : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The Molecule object containing the system
    ff : list of str
        A list of leaprc forcefield files.
        Use :func:`amber.listFiles <htmd.builder.amber.listFiles>` to get a list of available forcefield files.
        Default: :func:`amber.defaultFf <htmd.builder.amber.defaultFf>`
    topo : list of str
        A list of topology `prepi/prep/in` files.
        Use :func:`amber.listFiles <htmd.builder.amber.listFiles>` to get a list of available topology files.
        Default: :func:`amber.defaultTopo <htmd.builder.amber.defaultTopo>`
    param : list of str
        A list of parameter `frcmod` files.
        Use :func:`amber.listFiles <htmd.builder.amber.listFiles>` to get a list of available parameter files.
        Default: :func:`amber.defaultParam <htmd.builder.amber.defaultParam>`
    prefix : str
        The prefix for the generated prmtop, crd and pdb files
    outdir : str
        The path to the output directory
        Default: './build'
    caps : dict
        A dictionary with keys segids and values lists of strings describing the caps for a particular protein segment.
        e.g. caps['P'] = ['ACE', 'NME'] or caps['P'] = ['none', 'none']. Default: will apply ACE and NME caps to every
        protein segment.
    ionize : bool
        Enable or disable ionization
    saltconc : float
        Salt concentration to add to the system after neutralization.
    saltanion : {'Cl-'}
        The anion type. Please use only AMBER ion atom names.
    saltcation : {'Na+', 'K+', 'Cs+'}
        The cation type. Please use only AMBER ion atom names.
    disulfide : list of pairs of atomselection strings
        If None it will guess disulfide bonds. Otherwise provide a list pairs of atomselection strings for each pair of
        residues forming the disulfide bridge.
    teleap : str
        Path to teLeap executable used to build the system for AMBER
    teleapimports : list
        A list of paths to pass to teLeap '-I' flag, i.e. directories to be searched
        Default: determined from :func:`amber.defaultAmberHome <htmd.builder.amber.defaultAmberHome>` and
        :func:`amber.htmdAmberHome <htmd.builder.amber.htmdAmberHome>`
    execute : bool
        Disable building. Will only write out the input script needed by tleap. Does not include ionization.
    atomtypes : list of triplets
        Custom atom types defined by the user as ('type', 'element', 'hybrid') triplets
        e.g. (('C1', 'C', 'sp2'), ('CI', 'C', 'sp3')). Check `addAtomTypes` in AmberTools docs.
    offlibraries : str or list
        A path or a list of paths to OFF library files. Check `loadOFF` in AmberTools docs.
    gbsa : bool
        Modify radii for GBSA implicit water model
    igb : int
        GB model. Select: 1 for mbondi, 2 and 5 for mbondi2, 7 for bondi and 8 for mbondi3.
        Check section 4. The Generalized Born/Surface Area Model of the AMBER manual.

    Returns
    -------
    molbuilt : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The built system in a Molecule object. None if `execute` is False, since nothing is built in that case.

    Raises
    ------
    NameError
        If the teLeap executable cannot be found or fails to execute, or if the input mol2/PDB files cannot be written.
    RuntimeError
        If an atom type definition is not a triplet, an OFF library is missing, or the built files cannot be read.
    BuildError
        If errors are found in the teLeap log or no output files were generated.

    Example
    -------
    >>> from htmd.ui import *  # doctest: +SKIP
    >>> mol = Molecule("3PTB")
    >>> molbuilt = amber.build(mol, outdir='/tmp/build')  # doctest: +SKIP
    >>> # More complex example
    >>> disu = [['segid P and resid 157', 'segid P and resid 13'], ['segid K and resid 1', 'segid K and resid 25']]
    >>> molbuilt = amber.build(mol, outdir='/tmp/build', saltconc=0.15, disulfide=disu)  # doctest: +SKIP
    """
    # Remove pdb protein bonds as they can be regenerated by tleap. Keep non-protein bonds i.e. for ligands
    mol = mol.copy()
    _removeProteinBonds(mol)

    if teleap is None:
        teleap = _findTeLeap()
    elif shutil.which(teleap) is None:
        raise NameError(
            'Could not find executable: `{}` in the PATH. Cannot build for AMBER.'
            .format(teleap))

    if not os.path.isdir(outdir):
        os.makedirs(outdir)
    _cleanOutDir(outdir)
    if ff is None:
        ff = defaultFf()
    if topo is None:
        topo = defaultTopo()
    if param is None:
        param = defaultParam()
    if caps is None:
        caps = _defaultProteinCaps(mol)

    _missingSegID(mol)
    _checkMixedSegment(mol)

    mol = _charmmLipid2Amber(mol)

    _applyProteinCaps(mol, caps)

    # The context manager guarantees that tleap.in is closed even if one of the checks below raises
    with open(os.path.join(outdir, 'tleap.in'), 'w') as f:
        f.write('# tleap file generated by amber.build\n')

        # Printing out the forcefields
        for i, force in enumerate(ensurelist(ff)):
            if not os.path.isfile(force):
                force = _locateFile(force, 'ff', teleap)
                if force is None:
                    continue
            newname = 'ff{}_{}'.format(i, os.path.basename(force))
            shutil.copy(force, os.path.join(outdir, newname))
            f.write('source {}\n'.format(newname))
        f.write('\n')

        if gbsa:
            gbmodels = {
                1: 'mbondi',
                2: 'mbondi2',
                5: 'mbondi2',
                7: 'bondi',
                8: 'mbondi3'
            }
            f.write('set default PBradii {}\n\n'.format(gbmodels[igb]))

        # Adding custom atom types
        if atomtypes is not None:
            atomtypes = ensurelist(tocheck=atomtypes[0], tomod=atomtypes)
            f.write('addAtomTypes {\n')
            for at in atomtypes:
                if len(at) != 3:
                    raise RuntimeError(
                        'Atom type definitions have to be triplets. Check the AMBER documentation.'
                    )
                f.write('    {{ "{}" "{}" "{}" }}\n'.format(at[0], at[1], at[2]))
            f.write('}\n\n')

        # Loading OFF libraries
        if offlibraries is not None:
            offlibraries = ensurelist(offlibraries)
            for i, off in enumerate(offlibraries):
                if not os.path.isfile(off):
                    raise RuntimeError(
                        'Could not find off-library in location {}'.format(off))
                newname = 'offlib{}_{}'.format(i, os.path.basename(off))
                shutil.copy(off, os.path.join(outdir, newname))
                f.write('loadoff {}\n'.format(newname))

        # Loading frcmod parameters
        f.write('# Loading parameter files\n')
        for i, p in enumerate(param):
            if not os.path.isfile(p):
                p = _locateFile(p, 'param', teleap)
                if p is None:
                    continue
            newname = 'param{}_{}'.format(i, os.path.basename(p))
            shutil.copy(p, os.path.join(outdir, newname))
            f.write('loadamberparams {}\n'.format(newname))
        f.write('\n')

        # Loading prepi topologies
        f.write('# Loading prepi topologies\n')
        for i, t in enumerate(topo):
            if not os.path.isfile(t):
                t = _locateFile(t, 'topo', teleap)
                if t is None:
                    continue
            newname = 'topo{}_{}'.format(i, os.path.basename(t))
            shutil.copy(t, os.path.join(outdir, newname))
            f.write('loadamberprep {}\n'.format(newname))
        f.write('\n')

        f.write('# Loading the system\n')
        f.write('mol = loadpdb input.pdb\n\n')

        if np.sum(mol.atomtype != '') != 0:
            logger.debug('Writing mol2 files for input to tleap.')
            segs = np.unique(mol.segid[mol.atomtype != ''])
            combstr = 'mol = combine {mol'
            for s in segs:
                name = 'segment{}'.format(s)
                mol2name = os.path.join(outdir, '{}.mol2'.format(name))
                mol.write(mol2name, (mol.atomtype != '') & (mol.segid == s))
                if not os.path.isfile(mol2name):
                    raise NameError(
                        'Could not write a mol2 file out of the given Molecule.')
                f.write('# Loading the rest of the system\n')
                f.write('{} = loadmol2 {}.mol2\n\n'.format(name, name))
                combstr += ' {}'.format(name)
            combstr += '}\n\n'
            f.write(combstr)

        # Write patches for disulfide bonds (only after ionizing)
        if not ionize:
            # TODO: Remove this once we deprecate the class
            from htmd.builder.builder import DisulfideBridge
            from moleculekit.molecule import UniqueResidueID
            if disulfide is not None and len(disulfide) != 0 and isinstance(
                    disulfide[0], DisulfideBridge):
                newdisu = []
                for d in disulfide:
                    r1 = UniqueResidueID.fromMolecule(
                        mol, 'resid {} and segname {}'.format(d.resid1, d.segid1))
                    r2 = UniqueResidueID.fromMolecule(
                        mol, 'resid {} and segname {}'.format(d.resid2, d.segid2))
                    newdisu.append([r1, r2])
                disulfide = newdisu
            # TODO: Remove up to here ----------------------

            if disulfide is not None and len(disulfide) != 0 and isinstance(
                    disulfide[0][0], str):
                disulfide = convertDisulfide(mol, disulfide)

            if disulfide is None:
                logger.info('Detecting disulfide bonds.')
                disulfide = detectDisulfideBonds(mol)

            # Fix structure to match the disulfide patching
            if len(disulfide) != 0:
                torem = np.zeros(mol.numAtoms, dtype=bool)
                f.write('# Adding disulfide bonds\n')
                # Convert to stupid amber residue numbering. Computed once: the loop below only
                # changes resnames, which are not part of the numbering.
                uqseqid = sequenceID(
                    (mol.resid, mol.insertion, mol.segid)) + mol.resid[0]
                for d in disulfide:
                    # Rename the residues to CYX if there is a disulfide bond
                    atoms1 = d[0].selectAtoms(mol, indexes=False)
                    atoms2 = d[1].selectAtoms(mol, indexes=False)
                    mol.resname[atoms1] = 'CYX'
                    mol.resname[atoms2] = 'CYX'
                    # Remove (eventual) HG hydrogens on these CYS (from proteinPrepare)
                    torem |= (atoms1 & (mol.name == 'HG')) | (atoms2 &
                                                              (mol.name == 'HG'))
                    uqres1 = int(np.unique(uqseqid[atoms1]))
                    uqres2 = int(np.unique(uqseqid[atoms2]))
                    f.write('bond mol.{}.SG mol.{}.SG\n'.format(uqres1, uqres2))
                f.write('\n')
                mol.remove(torem, _logger=False)

        # Calculate the bounding box and store it in the CRD file
        f.write('setBox mol "vdw"\n\n')

        f.write('# Writing out the results\n')
        f.write('saveamberparm mol ' + prefix + '.prmtop ' + prefix + '.crd\n')
        f.write('quit')

    # Printing and loading the PDB file. AMBER can work with a single PDB file if the segments are separate by TER
    logger.debug('Writing PDB file for input to tleap.')
    pdbname = os.path.join(outdir, 'input.pdb')

    # mol2 files have atomtype, here we only write parts not coming from mol2
    # We need to write the input.pdb at the end since we modify the resname for disulfide bridges in mol
    mol.write(pdbname, mol.atomtype == '')
    if not os.path.isfile(pdbname):
        raise NameError(
            'Could not write a PDB file out of the given Molecule.')

    if not execute:
        # Only the tleap input files were requested, so there is no built system to post-process
        return None

    if not teleapimports:
        teleapimports = []
        # Source default Amber (i.e. the same paths tleap imports)
        amberhome = defaultAmberHome(teleap=teleap)
        teleapimports += [
            os.path.join(amberhome, s)
            for s in _defaultAmberSearchPaths.values()
        ]
        if len(teleapimports) == 0:
            raise RuntimeWarning(
                'No default Amber force-field found. Check teLeap location: {}'
                .format(teleap))
        # Source HTMD Amber paths that contain ffs
        htmdamberdir = htmdAmberHome()
        teleapimports += [
            os.path.join(htmdamberdir, os.path.dirname(fname)) for fname in ff
            if os.path.isfile(os.path.join(htmdamberdir, fname))
        ]
        if len(teleapimports) == 0:
            raise RuntimeError(
                'No default Amber force-field imports found. Check '
                '`htmd.builder.amber.defaultAmberHome()` and `htmd.builder.amber.htmdAmberHome()`'
            )
    # Set import flags for teLeap
    teleapimportflags = []
    for p in teleapimports:
        teleapimportflags.append('-I')
        teleapimportflags.append(str(p))
    logpath = os.path.abspath(os.path.join(outdir, 'log.txt'))
    logger.info('Starting the build.')
    currdir = os.getcwd()
    os.chdir(outdir)
    try:
        with open(logpath, 'w') as logfile:
            try:
                cmd = [teleap, '-f', './tleap.in']
                cmd[1:1] = teleapimportflags
                logger.debug(cmd)
                call(cmd, stdout=logfile)
            except Exception as e:
                raise NameError('teLeap failed at execution') from e
        errors = _logParser(logpath)
    finally:
        # Always restore the working directory, even if tleap or the log parsing failed
        os.chdir(currdir)
    if errors:
        raise BuildError(errors + [
            'Check {} for further information on errors in building.'.
            format(logpath)
        ])
    logger.info('Finished building.')

    # tleap was instructed above to save its results as <prefix>.prmtop / <prefix>.crd
    crdpath = os.path.join(outdir, prefix + '.crd')
    prmtoppath = os.path.join(outdir, prefix + '.prmtop')
    built = (os.path.exists(crdpath) and os.path.getsize(crdpath) != 0
             and os.path.exists(prmtoppath)
             and os.path.getsize(prmtoppath) != 0)
    if not built:
        raise BuildError(
            'No structure pdb/prmtop file was generated. Check {} for errors in building.'
            .format(logpath))
    try:
        molbuilt = Molecule(prmtoppath)
        molbuilt.read(crdpath)
    except Exception as e:
        raise RuntimeError(
            'Failed at reading {0}.prmtop/{0}.crd due to error: {1}'.format(
                prefix, e)) from e

    if ionize:
        shutil.move(crdpath, os.path.join(outdir, prefix + '.noions.crd'))
        shutil.move(prmtoppath,
                    os.path.join(outdir, prefix + '.noions.prmtop'))
        totalcharge = np.sum(molbuilt.charge)
        nwater = np.sum(molbuilt.atomselect('water and noh'))
        anion, cation, anionatom, cationatom, nanion, ncation = ionizef(
            totalcharge,
            nwater,
            saltconc=saltconc,
            anion=saltanion,
            cation=saltcation)
        newmol = ionizePlace(mol, anion, cation, anionatom, cationatom,
                             nanion, ncation)
        # Redo the whole build but now with ions included. The teLeap import paths and the GBSA
        # settings are forwarded so that the final system is built with the same options.
        return build(newmol,
                     ff=ff,
                     topo=topo,
                     param=param,
                     prefix=prefix,
                     outdir=outdir,
                     caps={},
                     ionize=False,
                     execute=execute,
                     saltconc=saltconc,
                     disulfide=disulfide,
                     teleap=teleap,
                     teleapimports=teleapimports,
                     atomtypes=atomtypes,
                     offlibraries=offlibraries,
                     gbsa=gbsa,
                     igb=igb)

    tmpbonds = molbuilt.bonds
    molbuilt.bonds = []  # Removing the bonds to speed up writing
    molbuilt.write(os.path.join(outdir, prefix + '.pdb'))
    molbuilt.bonds = tmpbonds  # Restoring the bonds
    detectCisPeptideBonds(molbuilt)  # Warn in case of cis bonds
    return molbuilt
示例#14
0
def autoSegment(
        mol,
        sel="all",
        basename="P",
        spatial=True,
        spatialgap=4.0,
        fields=("segid", ),
        field=None,
        _logger=True,
):
    """Splits a selection at resid gaps and labels each fragment with its own segid and/or chain

    !!!WARNING!!! If you want to use atom selections like 'protein' or 'fragment',
    use this function on a Molecule containing only protein atoms, otherwise the protein selection can fail.

    Parameters
    ----------
    mol : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The Molecule object. It is not modified, the function works on a copy.
    sel : str
        Atom selection string on which to check for gaps.
        See more `here <http://www.ks.uiuc.edu/Research/vmd/vmd-1.9.2/ug/node89.html>`__
    basename : str
        The basename for segment ids. For example if given 'P' it will name the segments 'P0', 'P1', ...
        If it clashes with the first character of a segid used outside of the selection, the first free
        entry of the segid alphabet is used instead.
    spatial : bool
        Only considers a discontinuity in resid as a gap if the CA atoms have distance more than `spatialgap` Angstrom
    spatialgap : float
        The size of a spatial gap which validates a discontinuity (A)
    fields : list
        Fields in which to set the segments. Must be a combination of "chain", "segid" or only one of them.
    field : str
        Legacy alternative to `fields`. If given as a string it overrides `fields`: "both" selects
        ("chain", "segid"), any other value selects only that field.
    _logger : bool
        Set to False to disable the logging of the created segments and chains

    Returns
    -------
    newmol : :class:`Molecule <moleculekit.molecule.Molecule>` object
        A new Molecule object with modified segids

    Example
    -------
    >>> newmol = autoSegment(mol, "chain B", "P", fields=("chain", "segid"))
    """
    from moleculekit.util import sequenceID

    # A string passed through the legacy `field` argument takes precedence over `fields`
    if isinstance(field, str):
        fields = ("chain", "segid") if field == "both" else (field, )

    mol = mol.copy()

    selidx = mol.atomselect(sel, indexes=True)
    residsteps = np.diff(mol.resid[selidx].copy())
    # Positions (within the selection) of the last atom before each resid jump
    breaks = np.where((residsteps != 1) & (residsteps != 0))[0]

    seg_first = [selidx[0]] + selidx[breaks + 1].tolist()
    seg_last = selidx[breaks].tolist() + [selidx[-1]]

    # Only chain symbols and segid prefixes not used outside of the selection may be handed out
    outside = ~mol.atomselect(sel)
    taken_chains = set(mol.chain[outside])
    free_chains = [c for c in chain_alphabet if c not in taken_chains]
    taken_segids = {s[0] for s in mol.segid[outside] if s != ""}
    free_segids = [
        s for s in [basename] + segid_alphabet if s not in taken_segids
    ]
    basename = free_segids[0]

    # No gaps: the whole selection becomes a single segment / chain
    if len(breaks) == 0:
        if "chain" in fields:
            mol.set("chain", free_chains[0], sel)
        if "segid" in fields:
            mol.set("segid", basename + "0", sel)
        return mol

    if spatial:
        saved_resid = mol.resid.copy()
        # Temporarily give every residue a unique resid so that residues can be matched unambiguously
        mol.set("resid", sequenceID(mol.resid))
        is_ca = mol.name == "CA"

        fake_gaps = []
        for pos, (after, before) in enumerate(
                zip(seg_first[1:], seg_last[:-1])):
            # CA coordinates of the residues on each side of the gap
            ca_before = mol.coords[(mol.resid == mol.resid[before]) & is_ca]
            ca_after = mol.coords[(mol.resid == mol.resid[after]) & is_ca]
            if len(ca_before) and len(ca_after):
                dist = np.sqrt(
                    np.sum((ca_before.squeeze() - ca_after.squeeze())**2))
                if dist < spatialgap:
                    fake_gaps.append(pos)
        fake_gaps = np.array(fake_gaps, dtype=int)
        # Merge the fragments around gaps which are not real gaps in space
        seg_first = np.delete(seg_first, fake_gaps + 1)
        seg_last = np.delete(seg_last, fake_gaps)

        mol.set("resid", saved_resid)  # Put the original resids back

    for segnum, (first, last) in enumerate(zip(seg_first, seg_last)):
        span = slice(first, last + 1)
        if "chain" in fields:
            # Chain symbols are recycled if there are more fragments than free symbols
            newchainid = free_chains[segnum % len(free_chains)]
            if _logger:
                logger.info(
                    f"Set chain {newchainid} between resid {mol.resid[first]} and {mol.resid[last]}."
                )
            mol.chain[span] = newchainid
        if "segid" in fields:
            newsegid = basename + str(segnum)
            if _logger:
                logger.info(
                    f"Created segment {newsegid} between resid {mol.resid[first]} and {mol.resid[last]}."
                )
            mol.segid[span] = newsegid

    return mol
示例#15
0
def autoSegment(
    mol,
    sel="all",
    basename="P",
    spatial=True,
    spatialgap=4.0,
    field="segid",
    mode="alphanumeric",
    _logger=True,
):
    """Splits a selection at resid gaps and labels each fragment with its own segid and/or chain

    !!!WARNING!!! If you want to use atom selections like 'protein' or 'fragment',
    use this function on a Molecule containing only protein atoms, otherwise the protein selection can fail.

    Parameters
    ----------
    mol : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The Molecule object. It is not modified, the function works on a copy.
    sel : str
        Atom selection string on which to check for gaps.
        See more `here <http://www.ks.uiuc.edu/Research/vmd/vmd-1.9.2/ug/node89.html>`__
    basename : str
        The basename for segment ids. For example if given 'P' it will name the segments 'P0', 'P1', ...
    spatial : bool
        Only considers a discontinuity in resid as a gap if the CA atoms have distance more than `spatialgap` Angstrom
    spatialgap : float
        The size of a spatial gap which validates a discontinuity (A)
    field : str
        Field to fix. Can be "segid" (default), "chain", or "both"
    mode : str
        If set to 'numeric' it will use numbers for segment IDs.
        If set to 'alphabetic' it will use letters for segment IDs.
        If set to 'alphanumeric' it will use both numbers and letters for segment IDs.
    _logger : bool
        Set to False to disable the logging of the created segments and chains

    Returns
    -------
    newmol : :class:`Molecule <moleculekit.molecule.Molecule>` object
        A new Molecule object with modified segids

    Raises
    ------
    RuntimeError
        If a segid which should be created already exists in the molecule

    Notes
    -----
    The segid of the selection is always reset to `basename` first, whatever `field` is. If no gap is
    found, only the segid of the selection is set (to `basename` plus the first symbol of the alphabet).

    Example
    -------
    >>> newmol = autoSegment(mol,'chain B','P')
    """
    from moleculekit.util import sequenceID

    mol = mol.copy()

    selidx = mol.atomselect(sel, indexes=True)
    residsteps = np.diff(mol.resid[selidx].copy())
    # Positions (within the selection) of the last atom before each resid jump
    breaks = np.where((residsteps != 1) & (residsteps != 0))[0]

    # Chain symbols of the requested alphabet which are not yet used anywhere in the molecule
    taken_chains = set(mol.chain)
    chain_alphabet = _getChainAlphabet(mode)
    free_chains = [c for c in chain_alphabet if c not in taken_chains]

    seg_first = [selidx[0]] + selidx[breaks + 1].tolist()
    seg_last = selidx[breaks].tolist() + [selidx[-1]]

    mol.set("segid", basename, sel)

    # No gaps: the whole selection is one segment
    if len(breaks) == 0:
        mol.set("segid", basename + chain_alphabet[0], sel)
        return mol

    if spatial:
        saved_resid = mol.resid.copy()
        # Temporarily give every residue a unique resid so that residues can be matched unambiguously
        mol.set("resid", sequenceID(mol.resid))
        is_ca = mol.name == "CA"

        fake_gaps = []
        for pos, (after, before) in enumerate(zip(seg_first[1:], seg_last[:-1])):
            # CA coordinates of the residues on each side of the gap
            ca_before = mol.coords[(mol.resid == mol.resid[before]) & is_ca]
            ca_after = mol.coords[(mol.resid == mol.resid[after]) & is_ca]
            if len(ca_before) and len(ca_after):
                dist = np.sqrt(np.sum((ca_before.squeeze() - ca_after.squeeze()) ** 2))
                if dist < spatialgap:
                    fake_gaps.append(pos)
        fake_gaps = np.array(fake_gaps, dtype=int)
        # Merge the fragments around gaps which are not real gaps in space
        seg_first = np.delete(seg_first, fake_gaps + 1)
        seg_last = np.delete(seg_last, fake_gaps)

        mol.set("resid", saved_resid)  # Put the original resids back

    set_segid = field in ["segid", "both"]
    set_chain = field in ["chain", "both"]
    for segnum, (first, last) in enumerate(zip(seg_first, seg_last)):
        span = slice(first, last + 1)
        if set_segid:
            newsegid = basename + str(segnum)
            if np.any(mol.segid == newsegid):
                raise RuntimeError(
                    f"Segid {newsegid} already exists in the molecule. Please choose different prefix."
                )
            if _logger:
                logger.info(
                    f"Created segment {newsegid} between resid {mol.resid[first]} and {mol.resid[last]}."
                )
            mol.segid[span] = newsegid
        if set_chain:
            newchainid = free_chains[segnum]
            if _logger:
                logger.info(
                    f"Set chain {newchainid} between resid {mol.resid[first]} and {mol.resid[last]}."
                )
            mol.chain[span] = newchainid

    return mol
def sequenceStructureAlignment(mol,
                               ref,
                               molseg=None,
                               refseg=None,
                               maxalignments=10,
                               nalignfragment=1):
    """ Aligns two structures by their longest sequence alignment

    The sequences of the two molecules are globally aligned and the longest stretches without gaps of
    each sequence alignment are used to structurally align `mol` on `ref`.

    Parameters
    ----------
    mol : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The Molecule we want to align
    ref : :class:`Molecule <moleculekit.molecule.Molecule>` object
        The reference Molecule to which we want to align
    molseg : str
        The segment of `mol` we want to align. If None and `mol` has multiple segments the alignment
        is done on all of them. If None and `mol` has a single segment, that segment is used.
    refseg : str
        The segment of `ref` we want to align to. If None it is chosen in the same way as `molseg`.
    maxalignments : int
        The maximum number of alignments we want to produce
    nalignfragment : int
        The number of fragments used for the alignment.

    Returns
    -------
    mols : list
        A list of Molecules each containing a different alignment.

    Raises
    ------
    ImportError
        If the biopython package is not installed
    RuntimeError
        If `mol` or `ref` contain alternative atom locations
    """
    from moleculekit.util import ensurelist, sequenceID
    try:
        from Bio import pairwise2
    except ImportError as e:
        raise ImportError(
            'You need to install the biopython package to use this function. Try using `conda install biopython`.'
        ) from e
    from Bio.SubsMat import MatrixInfo as matlist

    if len([x for x in np.unique(mol.altloc) if len(x)]) > 1:
        raise RuntimeError(
            'Alternative atom locations detected in `mol`. Please remove these before calling this function.'
        )
    if len([x for x in np.unique(ref.altloc) if len(x)]) > 1:
        raise RuntimeError(
            'Alternative atom locations detected in `ref`. Please remove these before calling this function.'
        )

    seqmol = mol.sequence()
    seqref = ref.sequence()

    if molseg is None and len(seqmol) > 1:
        logger.info(
            'Multiple segments ({}) detected in `mol`. Alignment will be done on all. Otherwise please specify which segment to align.'
            .format(list(seqmol.keys())))
        seqmol = mol.sequence(noseg=True)
        molseg = list(seqmol.keys())[0]
    if refseg is None and len(seqref) > 1:
        logger.info(
            'Multiple segments ({}) detected in `ref`. Alignment will be done on all. Otherwise please specify which segment to align.'
            .format(list(seqref.keys())))
        seqref = ref.sequence(noseg=True)
        refseg = list(seqref.keys())[0]
    # With a single segment and no explicit choice use that segment. Otherwise the sequence lookup
    # further down would be done with None as the key.
    if molseg is None and len(seqmol) == 1:
        molseg = list(seqmol.keys())[0]
    if refseg is None and len(seqref) == 1:
        refseg = list(seqref.keys())[0]

    def getSegIdx(m, mseg):
        # Calculate the atoms which belong to the selected segments
        if isinstance(mseg, str) and mseg == 'protein':
            msegidx = m.atomselect('protein and name CA')
        else:
            msegidx = np.zeros(m.numAtoms, dtype=bool)
            for seg in ensurelist(mseg):
                msegidx |= (m.segid == seg) & (m.name == 'CA')
        return np.where(msegidx)[0]

    molsegidx = getSegIdx(mol, molseg)
    refsegidx = getSegIdx(ref, refseg)

    # Create fake residue numbers for the selected segment
    molfakeresid = sequenceID(
        (mol.resid[molsegidx], mol.insertion[molsegidx], mol.chain[molsegidx]))
    reffakeresid = sequenceID(
        (ref.resid[refsegidx], ref.insertion[refsegidx], ref.chain[refsegidx]))

    # TODO: Use BLOSUM62?
    alignments = pairwise2.align.globaldx(seqref[refseg], seqmol[molseg],
                                          matlist.blosum62)
    numaln = len(alignments)

    if numaln > maxalignments:
        logger.warning(
            '{} alignments found. Limiting to {} as specified in the `maxalignments` argument.'
            .format(numaln, maxalignments))

    alignedstructs = []
    for i in range(min(maxalignments, numaln)):
        refaln = np.array(list(alignments[i][0]))
        molaln = np.array(list(alignments[i][1]))

        # By doing cumsum we calculate how many letters were before the current letter (i.e. residues before current)
        residref = np.cumsum(refaln != '-') - 1  # Start them from 0
        residmol = np.cumsum(molaln != '-') - 1  # Start them from 0

        # Find the region of maximum alignment between the molecules
        dsig = np.hstack(
            ([False], (refaln != '-') & (molaln != '-'), [False])).astype(int)
        dsigdiff = np.diff(dsig)
        startIndex = np.where(dsigdiff > 0)[0]
        endIndex = np.where(dsigdiff < 0)[0]
        duration = endIndex - startIndex

        # Pick the `nalignfragment` longest fragments. The stable sort keeps equally long fragments
        # apart (earlier ones first) instead of picking the first of them multiple times.
        longest = np.argsort(-duration,
                             kind='stable')[:max(nalignfragment, 0)]
        _list_starts = [startIndex[k] for k in longest]
        _list_finish = [endIndex[k] for k in longest]

        # Get the "resids" of the aligned residues only
        refalnresid = np.concatenate([
            residref[start:finish]
            for start, finish in zip(_list_starts, _list_finish)
        ])
        molalnresid = np.concatenate([
            residmol[start:finish]
            for start, finish in zip(_list_starts, _list_finish)
        ])
        refidx = []
        for r in refalnresid:
            refidx += list(refsegidx[reffakeresid == r])
        molidx = []
        for r in molalnresid:
            molidx += list(molsegidx[molfakeresid == r])

        molboolidx = np.zeros(mol.numAtoms, dtype=bool)
        molboolidx[molidx] = True
        refboolidx = np.zeros(ref.numAtoms, dtype=bool)
        refboolidx[refidx] = True

        start_residues = np.concatenate([
            mol.resid[molsegidx[molfakeresid == residmol[r]]]
            for r in _list_starts
        ])
        finish_residues = np.concatenate([
            mol.resid[molsegidx[molfakeresid == residmol[r - 1]]]
            for r in _list_finish
        ])
        logger.info(
            'Alignment #{} was done on {} residues: mol segid {} resid {}'.
            format(
                i, len(refalnresid),
                np.unique(mol.segid[molidx])[0], ', '.join([
                    '{}-{}'.format(s, f)
                    for s, f in zip(start_residues, finish_residues)
                ])))

        alignedmol = mol.copy()
        alignedmol.align(molboolidx, ref, refboolidx)
        alignedstructs.append(alignedmol)

    return alignedstructs