def calculateVariables(currmol): res = sequenceID((currmol.resid, currmol.insertion, currmol.segid, currmol.chain)) caidx = currmol.name == 'CA' res = np.unique(res) reslen = len(res) # Calculate the protein sequence seq = ''.join([_residueNameTable[x] for x in currmol.resname[caidx]]) seq = ct.c_char_p(seq.encode('utf-8')) # Keep only CA coordinates coords = currmol.coords[caidx, :, :].copy() return reslen, res.astype(np.int32), seq, coords
def _viewStatesNGL(self, states, statetype, protein, ligand, mols, numsamples, gui=False): from htmd.builder.builder import sequenceID if states is None: states = range(self.macronum) if isinstance(states, int): states = [states] if mols is None: mols = self.getStates(states, statetype, numsamples=min(numsamples, 15)) colors = [0, 1, 3, 4, 5, 6, 7, 9] hexcolors = {0: '#0000ff', 1: '#ff0000', 2: '#333333', 3: '#ff6600', 4: '#ffff00', 5: '#4c4d00', 6: '#b2b2cc', 7: '#33cc33', 8: '#ffffff', 9: '#ff3399', 10: '#33ccff'} if protein is None and ligand is None: raise NameError('Please provide either the "protein" or "ligand" parameter for viewStates.') k = 0 from nglview import NGLWidget, HTMDTrajectory view = NGLWidget(gui=gui) ref = mols[0].copy() for i, s in enumerate(states): if protein: mol = Molecule() if ligand: mol = ref.copy() mol.remove(ligand, _logger=False) mol.coords = np.atleast_3d(mol.coords[:, :, 0]) mols[i].filter(ligand, _logger=False) mols[i].set('chain', '{}'.format(s)) tmpcoo = mols[i].coords for j in range(mols[i].numFrames): mols[i].coords = np.atleast_3d(tmpcoo[:, :, j]) if ligand: mols[i].set('segid', sequenceID(mols[i].resid)+k) k = int(mols[i].segid[-1]) mol.append(mols[i]) view.add_trajectory(HTMDTrajectory(mol)) # Setting up representations if ligand: view[i].add_cartoon('protein', color='sstruc') view[i].add_hyperball(':{}'.format(s), color=hexcolors[np.mod(i, len(hexcolors))]) if protein: view[i].add_cartoon('protein', color='residueindex') self._nglButtons(view, statetype, states) return view
def sequenceStructureAlignment(mol, ref, molseg=None, refseg=None, maxalignments=10, nalignfragment=1): """ Aligns two structures by their longests sequences alignment Parameters ---------- mol : :class:`Molecule <htmd.molecule.molecule.Molecule>` object The Molecule we want to align ref : :class:`Molecule <htmd.molecule.molecule.Molecule>` object The reference Molecule to which we want to align molseg : str The segment of `mol` we want to align refseg : str The segment of `ref` we want to align to maxalignments : int The maximum number of alignments we want to produce nalignfragment : int The number of fragments used for the alignment. Returns ------- mols : list A list of Molecules each containing a different alignment. """ from htmd.util import ensurelist try: from Bio import pairwise2 except ImportError as e: raise ImportError( 'You need to install the biopython package to use this function. Try using `conda install biopython`.' ) from Bio.SubsMat import MatrixInfo as matlist if len([x for x in np.unique(mol.altloc) if len(x)]) > 1: raise RuntimeError( 'Alternative atom locations detected in `mol`. Please remove these before calling this function.' ) if len([x for x in np.unique(ref.altloc) if len(x)]) > 1: raise RuntimeError( 'Alternative atom locations detected in `ref`. Please remove these before calling this function.' ) seqmol = mol.sequence() seqref = ref.sequence() if len(seqmol) > 1: logger.info( 'Multiple segments ({}) detected in `mol`. Alignment will be done on all. Otherwise please specify which segment to align.' .format(list(seqmol.keys()))) seqmol = mol.sequence(noseg=True) if len(seqref) > 1: logger.info( 'Multiple segments ({}) detected in `ref`. Alignment will be done on all. Otherwise please specify which segment to align.' .format(list(seqref.keys()))) seqref = ref.sequence(noseg=True) if molseg is None: molseg = list(seqmol.keys())[0] if refseg is None: refseg = list(seqref.keys())[0] def getSegIdx(m, mseg): # Calculate the atoms which belong to the selected segments if isinstance(mseg, str) and mseg == 'protein': msegidx = m.atomselect('protein and name CA') else: msegidx = np.zeros(m.numAtoms, dtype=bool) for seg in ensurelist(mseg): msegidx |= (m.segid == seg) & (m.name == 'CA') return np.where(msegidx)[0] molsegidx = getSegIdx(mol, molseg) refsegidx = getSegIdx(ref, refseg) # Create fake residue numbers for the selected segment molfakeresid = sequenceID( (mol.resid[molsegidx], mol.insertion[molsegidx], mol.chain[molsegidx])) reffakeresid = sequenceID( (ref.resid[refsegidx], ref.insertion[refsegidx], ref.chain[refsegidx])) # TODO: Use BLOSUM62? alignments = pairwise2.align.globaldx(seqref[refseg], seqmol[molseg], matlist.blosum62) numaln = len(alignments) if numaln > maxalignments: logger.warning( '{} alignments found. Limiting to {} as specified in the `maxalignments` argument.' .format(numaln, maxalignments)) alignedstructs = [] for i in range(min(maxalignments, numaln)): refaln = np.array(list(alignments[i][0])) molaln = np.array(list(alignments[i][1])) # By doing cumsum we calculate how many letters were before the current letter (i.e. residues before current) residref = np.cumsum(refaln != '-') - 1 # Start them from 0 residmol = np.cumsum(molaln != '-') - 1 # Start them from 0 # Find the region of maximum alignment between the molecules dsig = np.hstack( ([False], (refaln != '-') & (molaln != '-'), [False])).astype(int) dsigdiff = np.diff(dsig) startIndex = np.where(dsigdiff > 0)[0] endIndex = np.where(dsigdiff < 0)[0] duration = endIndex - startIndex duration_sorted = np.sort(duration)[::-1] _list_starts = [] _list_finish = [] for n in range(nalignfragment): if n == len(duration): break idx = np.where(duration == duration_sorted[n])[0] start = startIndex[idx][0] finish = endIndex[idx][0] _list_starts.append(start) _list_finish.append(finish) # Get the "resids" of the aligned residues only refalnresid = np.concatenate([ residref[start:finish] for start, finish in zip(_list_starts, _list_finish) ]) molalnresid = np.concatenate([ residmol[start:finish] for start, finish in zip(_list_starts, _list_finish) ]) refidx = [] for r in refalnresid: refidx += list(refsegidx[reffakeresid == r]) molidx = [] for r in molalnresid: molidx += list(molsegidx[molfakeresid == r]) molboolidx = np.zeros(mol.numAtoms, dtype=bool) molboolidx[molidx] = True refboolidx = np.zeros(ref.numAtoms, dtype=bool) refboolidx[refidx] = True start_residues = np.concatenate([ mol.resid[molsegidx[molfakeresid == residmol[r]]] for r in _list_starts ]) finish_residues = np.concatenate([ mol.resid[molsegidx[molfakeresid == residmol[r - 1]]] for r in _list_finish ]) logger.info( 'Alignment #{} was done on {} residues: mol segid {} resid {}'. format( i, len(refalnresid), np.unique(mol.segid[molidx])[0], ', '.join([ '{}-{}'.format(s, f) for s, f in zip(start_residues, finish_residues) ]))) alignedmol = mol.copy() alignedmol.align(molboolidx, ref, refboolidx) alignedstructs.append(alignedmol) return alignedstructs
def sequenceStructureAlignment(mol, ref, molseg=None, refseg=None, maxalignments=10, nalignfragment=1): """ Aligns two structures by their longests sequences alignment Parameters ---------- mol : :class:`Molecule <htmd.molecule.molecule.Molecule>` object The Molecule we want to align ref : :class:`Molecule <htmd.molecule.molecule.Molecule>` object The reference Molecule to which we want to align molseg : str The segment of `mol` we want to align refseg : str The segment of `ref` we want to align to maxalignments : int The maximum number of alignments we want to produce nalignfragment : int The number of fragments used for the alignment. Returns ------- mols : list A list of Molecules each containing a different alignment. """ from htmd.util import ensurelist try: from Bio import pairwise2 except ImportError as e: raise ImportError('You need to install the biopython package to use this function. Try using `conda install biopython`.') from Bio.SubsMat import MatrixInfo as matlist if len([x for x in np.unique(mol.altloc) if len(x)]) > 1: raise RuntimeError('Alternative atom locations detected in `mol`. Please remove these before calling this function.') if len([x for x in np.unique(ref.altloc) if len(x)]) > 1: raise RuntimeError('Alternative atom locations detected in `ref`. Please remove these before calling this function.') seqmol = mol.sequence() seqref = ref.sequence() if len(seqmol) > 1: logger.info('Multiple segments ({}) detected in `mol`. Alignment will be done on all. Otherwise please specify which segment to align.'.format(list(seqmol.keys()))) seqmol = mol.sequence(noseg=True) if len(seqref) > 1: logger.info('Multiple segments ({}) detected in `ref`. Alignment will be done on all. Otherwise please specify which segment to align.'.format(list(seqref.keys()))) seqref = ref.sequence(noseg=True) if molseg is None: molseg = list(seqmol.keys())[0] if refseg is None: refseg = list(seqref.keys())[0] def getSegIdx(m, mseg): # Calculate the atoms which belong to the selected segments if isinstance(mseg, str) and mseg == 'protein': msegidx = m.atomselect('protein and name CA') else: msegidx = np.zeros(m.numAtoms, dtype=bool) for seg in ensurelist(mseg): msegidx |= (m.segid == seg) & (m.name == 'CA') return np.where(msegidx)[0] molsegidx = getSegIdx(mol, molseg) refsegidx = getSegIdx(ref, refseg) # Create fake residue numbers for the selected segment molfakeresid = sequenceID((mol.resid[molsegidx], mol.insertion[molsegidx], mol.chain[molsegidx])) reffakeresid = sequenceID((ref.resid[refsegidx], ref.insertion[refsegidx], ref.chain[refsegidx])) # TODO: Use BLOSUM62? alignments = pairwise2.align.globaldx(seqref[refseg], seqmol[molseg], matlist.blosum62) numaln = len(alignments) if numaln > maxalignments: logger.warning('{} alignments found. Limiting to {} as specified in the `maxalignments` argument.'.format(numaln, maxalignments)) alignedstructs = [] for i in range(min(maxalignments, numaln)): refaln = np.array(list(alignments[i][0])) molaln = np.array(list(alignments[i][1])) # By doing cumsum we calculate how many letters were before the current letter (i.e. residues before current) residref = np.cumsum(refaln != '-') - 1 # Start them from 0 residmol = np.cumsum(molaln != '-') - 1 # Start them from 0 # Find the region of maximum alignment between the molecules dsig = np.hstack(([False], (refaln != '-') & (molaln != '-'), [False])).astype(int) dsigdiff = np.diff(dsig) startIndex = np.where(dsigdiff > 0)[0] endIndex = np.where(dsigdiff < 0)[0] duration = endIndex - startIndex duration_sorted = np.sort(duration)[::-1] _list_starts = [] _list_finish = [] for n in range(nalignfragment): if n == len(duration): break idx = np.where(duration == duration_sorted[n])[0] start = startIndex[idx][0] finish = endIndex[idx][0] _list_starts.append(start) _list_finish.append(finish) # Get the "resids" of the aligned residues only refalnresid = np.concatenate([ residref[start:finish] for start, finish in zip(_list_starts,_list_finish)]) molalnresid = np.concatenate([ residmol[start:finish] for start, finish in zip(_list_starts, _list_finish) ]) refidx = [] for r in refalnresid: refidx += list(refsegidx[reffakeresid == r]) molidx = [] for r in molalnresid: molidx += list(molsegidx[molfakeresid == r]) molboolidx = np.zeros(mol.numAtoms, dtype=bool) molboolidx[molidx] = True refboolidx = np.zeros(ref.numAtoms, dtype=bool) refboolidx[refidx] = True start_residues = np.concatenate([ mol.resid[molsegidx[molfakeresid == residmol[r]]] for r in _list_starts]) finish_residues = np.concatenate([ mol.resid[molsegidx[molfakeresid == residmol[r-1]]] for r in _list_finish]) logger.info('Alignment #{} was done on {} residues: mol segid {} resid {}'.format( i, len(refalnresid), np.unique(mol.segid[molidx])[0], ', '.join(['{}-{}'.format(s,f) for s, f in zip(start_residues,finish_residues)]) )) alignedmol = mol.copy() alignedmol.align(molboolidx, ref, refboolidx) alignedstructs.append(alignedmol) return alignedstructs