def getLabel(self, full=False):
    """Return the label of this sequence with surrounding whitespace removed.

    The label held by the sequence itself (``self._label``) is used when it
    is set; when it is **None** the label is read from the labels of the
    alignment the sequence belongs to (``self._msa._labels``) at
    ``self._index``.  Unless *full* is true, only the first item returned by
    :func:`splitSeqLabel` is kept."""

    text = self._label
    if text is None:
        text = self._msa._labels[self._index]
    if not full:
        text = splitSeqLabel(text)[0]
    return text.strip()
def _map(self, mapping=None):
    """Rebuild ``self._mapping`` from the sequence labels and return it.

    When *mapping* is given it must support item lookup and ``items()``.
    Each key is written over the label at every index listed in its value
    (a scalar or an iterable of indices), unless the key already occurs in
    that label.

    The resulting lookup maps the first item of ``splitSeqLabel(label)`` to
    the row index of that label, or to a list of row indices when several
    rows share the same key.

    :raises TypeError: when looking up an item in *mapping* fails with
        anything other than :exc:`KeyError`"""

    labels = self._labels
    if mapping is not None:
        # Probe with an item lookup: a dictionary either succeeds or raises
        # KeyError, any other failure means the argument is not usable.
        try:
            mapping['isdict']
        except KeyError:
            pass
        except Exception:
            raise TypeError('mapping must be a dictionary')

        for key, value in mapping.items():
            rows = [value] if isscalar(value) else value
            for row in rows:
                if key not in labels[row]:
                    labels[row] = key

    lookup = {}
    self._mapping = lookup
    for row, label in enumerate(labels):
        name = splitSeqLabel(label)[0]
        if name not in lookup:
            lookup[name] = row
            continue
        seen = lookup[name]
        try:
            # already a list of rows for this name
            seen.append(row)
        except AttributeError:
            # second occurrence: promote the single row index to a list
            lookup[name] = [seen, row]
    return lookup
def getLabel(self, index, full=False):
    """Return the label of the sequence at the given *index*.

    *index* is first translated through ``self._mapping``; a value that is
    not a key of the mapping is used as is.  Residue numbers are removed
    from the sequence label unless *full* is **True**."""

    row = self._mapping.get(index, index)
    label = self._labels[row]
    return label if full else splitSeqLabel(label)[0]
def iterLabels(self, full=False):
    """Yield sequence labels in row order.

    By default only the part of each label that is used for indexing
    sequences (the first item returned by :func:`splitSeqLabel`) is yielded;
    pass a true *full* to get the labels exactly as stored."""

    for label in self._labels:
        yield label if full else splitSeqLabel(label)[0]
def getResnums(self, gaps=False, report_match=False):
    """Return the list of residue numbers of this sequence.

    Residue numbers are inferred from the start and end values in the full
    label.  They are used only when both parse as integers and the range
    they span has exactly ``self.numResidues()`` entries; otherwise a range
    of numbers starting from 1 is returned.  The decision is logged.

    :arg gaps: when **True**, the list has one entry per element of
        ``self._array`` and every non-alphabetic position is **None**;
        by default only the residue numbers are returned
    :type gaps: bool

    :arg report_match: when **True**, return a ``(match, resnums)`` tuple in
        which *match* tells whether the numbers came from the label
    :type report_match: bool"""

    title, start, end = splitSeqLabel(self.getLabel(True))
    nres = self.numResidues()
    match = False
    try:
        start, end = int(start), int(end)
    except (TypeError, ValueError):
        # Missing (None) or non-numeric start/end in the label.  Only the
        # conversion errors are caught so that interrupts still propagate.
        LOGGER.info(
            'Cannot parse start and end values from sequence label {0}. Setting '
            'resnums 1 to {1:d}'.format(title, nres))
        start, end = 1, nres
    else:
        if (end - start + 1) != nres:
            LOGGER.info('Label {0} start-end entry does not match '
                        'length of ungapped sequence. Setting '
                        'resnums 1 to {1:d}'.format(title, nres))
            start, end = 1, nres
        else:
            LOGGER.info('Label {0} start-end entry matches '
                        'length of ungapped sequence. Setting '
                        'resnums {1:d} to {2:d}'.format(title, start, end))
            match = True

    resnums = iter(range(start, end + 1))
    if gaps:
        # consume one residue number per alphabetic position only
        result = [next(resnums) if torf else None
                  for torf in char.isalpha(self._array)]
    else:
        result = list(resnums)

    if report_match:
        return match, result
    return result
def evol_rankorder(mutinfo, **kwargs):
    """Rank residue pairs of a mutual information matrix and write them out.

    The matrix stored in the file *mutinfo* is loaded with
    :func:`numpy.loadtxt`.  The entries below its diagonal are sorted in
    descending order and the leading pairs are written as tab-separated
    rows to an output file.

    Residue numbers reported for each pair come, in order of preference,
    from:

      * a chain of the *pdb* structure whose ``'protein and name CA'``
        selection (as is, or after :func:`trimAtomsUsingMSA`) has as many
        atoms as the matrix has rows,
      * the start-end range found in the label of a sequence in *msa*,
      * a serial index starting from 1.

    Keyword arguments read by this function:

    :arg delimiter: passed to :func:`numpy.loadtxt`
    :arg msa: alignment passed to :func:`parseMSA`; it is ignored when its
        number of residues differs from the matrix size
    :arg label: label of the reference sequence in *msa*; when it is not
        found the sequence with the highest row occupancy is used
    :arg pdb: structure passed to :func:`parsePDB`
    :arg outname: output filename; by default it is derived from *mutinfo*.
        ``'_rankorder'`` is inserted before the extension in either case
    :arg zscore: when true, each column is normalized to zero mean and unit
        standard deviation before ranking
    :arg numpairs: maximum number of pairs to write
    :arg seqsep: a pair is reported only when the row index exceeds the
        column index by more than this value (sequence separation mode)
    :arg usedist: when true and a matching PDB chain was found, filter pairs
        by distance instead of sequence separation
    :arg dist: a pair is reported only when its distance is greater than
        this value (distance mode)

    :raises ValueError: when the loaded matrix is not a 2D square matrix"""

    from prody import parseMSA, LOGGER, PY3K
    from prody import parsePDB, calcMSAOccupancy, trimAtomsUsingMSA
    from prody.utilities import openFile, splitSeqLabel
    from os.path import splitext

    delimiter = kwargs.get('delimiter')
    mi = np.loadtxt(str(mutinfo), delimiter=delimiter)
    ndim, shape = mi.ndim, mi.shape
    if ndim != 2 or shape[0] != shape[1]:
        raise ValueError('mutinfo must contain a square matrix')

    msa, label, msaflag = kwargs.get('msa'), kwargs.get('label'), False
    pdb, pdbflag = kwargs.get('pdb'), False

    resnum = None

    if msa is not None:
        msa = parseMSA(msa)
        if msa.numResidues() != shape[0]:
            LOGGER.info('Input MSA and mutinfo do not have similar no '
                        'of residues, ignoring MSA')
        else:
            index = msa.getIndex(label)
            try:
                if index is None:
                    if label is not None:
                        LOGGER.info('Could not find given label in MSA, '
                                    'using complete sequence from MSA')
                    # fall back to the sequence with the fewest gaps
                    occ = calcMSAOccupancy(msa._msa, 'row')
                    index = np.where(occ == occ.max())[0][0]
                label, start, end = splitSeqLabel(
                    msa[index].getLabel(True))
            except Exception:
                LOGGER.info('Could not extract resnums from MSA')
            else:
                msaflag = True

    if pdb is not None:
        try:
            pdb = parsePDB(pdb)
        except Exception:
            LOGGER.info('Could not parse PDB, ignoring PDB input')
        else:
            chains = list(pdb.iterChains())
            for chain in chains:
                sel = chain.select('protein and name CA')
                if sel.numAtoms() == shape[0]:
                    resnum = sel.getResnums()
                    coordset = sel.getCoordsets()
                    distance = calcAllDist(coordset)
                    pdbflag = True
                    label = pdb.getTitle()
                    LOGGER.info('Residue numbers will be based on pdb: '
                                '{0}'.format(pdb.getTitle()))
                    break
                else:
                    # sizes differ: try to trim the chain to the MSA columns
                    try:
                        sel = trimAtomsUsingMSA(sel, msa,
                                                chain=chain.getChid())
                        if sel.numAtoms() == shape[0]:
                            resnum = sel.getResnums()
                            coordset = sel.getCoordsets()
                            distance = calcAllDist(coordset)
                            pdbflag = True
                            label = pdb.getTitle()
                            LOGGER.info(
                                'Residue numbers will be based on pdb: '
                                '{0}'.format(pdb.getTitle()))
                            break
                    except Exception:
                        LOGGER.info(
                            'Number of residues in PDB does not match '
                            'mutinfo matrix and no MSA was provided to '
                            'align the PDB against, so ignoring PDB input')

    if not pdbflag:
        if msaflag:
            # compare against None explicitly so that a start of 0 is valid
            if start is not None and end is not None and start < end:
                resnum = np.arange(start, end + 1)
                if len(resnum) != shape[0]:
                    LOGGER.info('Label: {0}/{1}-{2} and mutinfo do '
                                'not have similar no of residues, using '
                                'serial indexing'.format(label, start, end))
                    label = 'Serial Index'
                    resnum = np.arange(1, shape[0] + 1)
                else:
                    LOGGER.info(
                        'Residue numbers will be based on MSA and label: '
                        '{0}'.format(label))
            else:
                LOGGER.info('Could not identify residue indexes from MSA'
                            ' using serial indexing')
                label = 'Serial Index'
                resnum = np.arange(1, shape[0] + 1)
        else:
            LOGGER.info('MSA or PDB not given or does not match mutinfo, '
                        'using serial indexing')
            resnum = np.arange(1, shape[0] + 1)
    LOGGER.info('Residue numbers start and end with {0}-{1}'.format(
        str(resnum[0]), str(resnum[-1])))

    outname = kwargs.get('outname')
    if outname is None:
        outname, ext = splitext(str(mutinfo))
        if ext.lower() == '.gz':
            # also drop the extension hidden behind ".gz"; ".gz" itself is
            # kept as the output extension
            outname, _ = splitext(outname)
    else:
        outname, ext = splitext(str(outname))
        if not ext:
            # splitext returns an empty string, never None
            ext = '.txt'

    outname += '_rankorder' + ext

    zscore = kwargs.get('zscore')
    if zscore:
        LOGGER.info('zscore normalization applied such that each column '
                    'has 0 mean and standard deviation 1')
        header = 'Serial\tRow\tColumn\tZscore'
        mi = (mi - mi.mean(0)) / mi.std(0)
    else:
        header = 'Serial\tRow\tColumn\tMI'

    # rank the strictly lower-triangular entries, largest first
    mi_ind_start, mi_ind_end = np.tril_indices(shape[0], k=-1)
    mi_matrix = mi[mi_ind_start, mi_ind_end]
    sorted_index = mi_matrix.argsort(axis=None)[::-1]
    row = mi_ind_start[sorted_index]
    column = mi_ind_end[sorted_index]
    count = 1
    i = 0

    if PY3K:
        mode = 'w'
    else:
        mode = 'wb'

    if label is None:
        label = 'Serial Index'

    numpairs = kwargs.get('numpairs')
    size = len(row)
    seqsep = kwargs.get('seqsep')
    usedist = kwargs.get('usedist')

    f = openFile(outname, mode)
    try:
        if not usedist or not pdbflag:
            if usedist:
                LOGGER.info('use-struct-sep set to true, but PDB not given or '
                            'incorrect residue number. Using sequence separation')
            else:
                if pdbflag:
                    LOGGER.info('use-dist not set, using sequence separation'
                                ' to report coevolving pairs')
            f.write(('Label: ' + label + '\t' + 'Residue Numbers: ' +
                     str(resnum[0]) + '-' + str(resnum[-1]) +
                     '\tSequence Separation:' + str(seqsep) + '\n'))
            if pdbflag:
                f.write((header + '\tDistance\n'))
                while count <= numpairs and i < size:
                    if row[i] > (column[i] + seqsep):
                        f.write('{0}\t{1}\t{2}\t{3:.3f}\t{4:.2f}\n'.format(
                            count, resnum[row[i]], resnum[column[i]],
                            mi[row[i], column[i]],
                            distance[row[i], column[i]]))
                        count += 1
                    i += 1
            else:
                f.write((header + '\n'))
                while count <= numpairs and i < size:
                    if row[i] > (column[i] + seqsep):
                        f.write('{0}\t{1}\t{2}\t{3:.3f}\n'.format(
                            count, resnum[row[i]], resnum[column[i]],
                            mi[row[i], column[i]]))
                        count += 1
                    i += 1
        else:
            structsep = kwargs.get('dist')
            # tab before the cutoff field, matching the sequence separation
            # header written in the other branch
            f.write(('Label: ' + label + '\t' + 'Residue Numbers: ' +
                     str(resnum[0]) + '-' + str(resnum[-1]) +
                     '\tDistance Cutoff:' + str(structsep) + '\n'))
            f.write((header + '\tDistance\n'))
            while count <= numpairs and i < size:
                if distance[row[i], column[i]] > structsep:
                    f.write('{0}\t{1}\t{2}\t{3:.3f}\t{4:.2f}\n'.format(
                        count, resnum[row[i]], resnum[column[i]],
                        mi[row[i], column[i]],
                        distance[row[i], column[i]]))
                    count += 1
                i += 1
    finally:
        # release the handle even when formatting or writing fails
        f.close()
def refineMSA(msa, index=None, label=None, rowocc=None, seqid=None,
              colocc=None, **kwargs):
    """Refine *msa* by removing sequences (rows) and residues (columns) that
    contain gaps.

    :arg msa: multiple sequence alignment
    :type msa: :class:`.MSA`

    :arg index: remove columns that are gaps in the sequence with that index
    :type index: int

    :arg label: remove columns that are gaps in the sequence matching label,
        ``msa.getIndex(label)`` must return a sequence index, a PDB identifier
        is also acceptable; ignored when *index* is given
    :type label: str

    :arg rowocc: row occupancy, sequences with less occupancy will be
        removed after *label* refinement is applied
    :type rowocc: float

    :arg seqid: keep unique sequences at specified sequence identity level,
        unique sequences are identified using :func:`.uniqueSequences`
    :type seqid: float

    :arg colocc: column occupancy, residue positions with less occupancy
        will be removed after other refinements are applied
    :type colocc: float

    :arg keep: keep columns corresponding to residues not resolved in the PDB
        structure, default is **False**, applies when *label* is a PDB
        identifier
    :type keep: bool

    For Pfam MSA data, *label* is UniProt entry name for the protein.  You may
    also use PDB structure and chain identifiers, e.g. ``'1p38'`` or
    ``'1p38A'``, for *label* argument and UniProt entry names will be parsed
    using :func:`.parsePDBHeader` function (see also :class:`.Polymer` and
    :class:`.DBRef`).

    The order of refinements are applied in the order of arguments.  If
    *label* and *seqid* are specified, sequence matching *label* will be kept
    in the refined :class:`.MSA` although it may be similar to some other
    sequence.

    A character array is returned when *msa* is a character array, otherwise
    a new :class:`.MSA` is returned.  The labels of the input *msa* are not
    modified.

    :raises TypeError: when *msa* is neither a character array nor an MSA
        instance, when *label* is not a string or is used with a character
        array, or when *rowocc*/*colocc* cannot be converted to float
    :raises ValueError: when *msa* is not a 2D ``'|S1'`` array, when *label*
        cannot be resolved to a single sequence, or when no refinement
        argument is given
    :raises IOError: when the PDB header for *label* cannot be parsed"""

    # if msa is a char array, it will be refined but label won't work
    try:
        ndim, dtype_ = msa.ndim, msa.dtype
    except AttributeError:
        try:
            arr = msa._getArray()
        except AttributeError:
            raise TypeError('msa must be a character array or an MSA instance')
        ndim, dtype_ = arr.ndim, arr.dtype
    else:
        arr, msa = msa, None

    if dtype('|S1') != dtype_:
        raise ValueError('msa must be a character array or an MSA instance')
    if ndim != 2:
        raise ValueError('msa must be a 2D array or an MSA instance')

    title = []
    cols = None
    # copy of the labels carrying the residue range of the reference
    # sequence; stays None unless structure refinement runs
    relabeled = None

    if index is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        cols = char.isalpha(arr[index]).nonzero()[0]
        arr = arr.take(cols, 1)
        title.append('index=' + str(index))
        LOGGER.report(
            'Index refinement reduced number of columns from {0} to '
            '{1} in %.2fs.'.format(before, arr.shape[1]), '_refine')

    if label is not None:
        if index is not None:
            LOGGER.info('An index was provided so the label will be ignored.')

        else:
            before = arr.shape[1]
            LOGGER.timeit('_refine')
            try:
                upper, lower = label.upper(), label.lower()
            except AttributeError:
                raise TypeError('label must be a string')

            if msa is None:
                raise TypeError('msa must be an MSA instance, '
                                'label cannot be used')

            index = msa.getIndex(label)
            if index is None:
                index = msa.getIndex(upper)
            if index is None:
                index = msa.getIndex(lower)

            chain = None
            if index is None and (len(label) == 4 or len(label) == 5):
                # treat the label as a PDB identifier with optional chain id
                from prody import parsePDB
                try:
                    structure, header = parsePDB(label[:4], header=True)
                except Exception as err:
                    raise IOError(
                        'failed to parse header for {0} ({1})'.format(
                            label[:4], str(err)))

                chid = label[4:].upper()
                for poly in header['polymers']:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        index = msa.getIndex(dbref.idcode)
                        if index is not None:
                            LOGGER.info('{0} idcode {1} for {2}{3} '
                                        'is found in chain {4}.'.format(
                                            dbref.database, dbref.idcode,
                                            label[:4], poly.chid, str(msa)))
                            break
                        index = msa.getIndex(dbref.accession)
                        if index is not None:
                            LOGGER.info('{0} accession {1} for {2}{3} '
                                        'is found in chain {4}.'.format(
                                            dbref.database, dbref.accession,
                                            label[:4], poly.chid, str(msa)))
                            break
                    if index is not None:
                        # stop at the polymer whose DBREF matched so that
                        # the chain belongs to that polymer and not to
                        # whichever one the loop would end on
                        chain = structure[poly.chid]
                        resnums = chain.ca.getResnums()
                        break

            if index is None:
                raise ValueError('label is not in msa, or msa is not indexed')
            try:
                len(index)
            except TypeError:
                pass
            else:
                raise ValueError(
                    'label {0} maps onto multiple sequences, '
                    'so cannot be used for refinement'.format(label))

            title.append('label=' + label)
            cols = char.isalpha(arr[index]).nonzero()[0]
            arr = arr.take(cols, 1)
            LOGGER.report(
                'Label refinement reduced number of columns from {0} to '
                '{1} in %.2fs.'.format(before, arr.shape[1]), '_refine')

            if chain is not None and not kwargs.get('keep', False):
                before = arr.shape[1]
                LOGGER.timeit('_refine')

                from Bio import pairwise2
                from prody.utilities import MATCH_SCORE, MISMATCH_SCORE
                from prody.utilities import GAP_PENALTY, GAP_EXT_PENALTY, ALIGNMENT_METHOD

                chseq = chain.getSequence()
                # tobytes() replaces tostring(), which NumPy 2.0 removed
                algn = pairwise2.align.localms(
                    pystr(arr[index].tobytes().upper()), pystr(chseq),
                    MATCH_SCORE, MISMATCH_SCORE,
                    GAP_PENALTY, GAP_EXT_PENALTY,
                    one_alignment_only=1)
                # one flag per residue of the MSA sequence: True when it is
                # aligned to a residue of the chain
                torf = []
                for s, c in zip(*algn[0][:2]):
                    if s == '-':
                        continue
                    elif c != '-':
                        torf.append(True)
                    else:
                        torf.append(False)
                torf = array(torf)
                tsum = torf.sum()
                assert tsum <= before, 'problem in mapping sequence to structure'
                if tsum < before:
                    arr = arr.take(torf.nonzero()[0], 1)
                    resnums = resnums.take(
                        torf.nonzero()[0] - torf.nonzero()[0][0] + 1)
                    LOGGER.report(
                        'Structure refinement reduced number of '
                        'columns from {0} to {1} in %.2fs.'.format(
                            before, arr.shape[1]), '_refine')
                else:
                    LOGGER.debug(
                        'All residues in the sequence are contained in '
                        'PDB structure {0}.'.format(label))

                # work on a copy so the caller's MSA keeps its own labels
                relabeled = list(msa._labels)
                relabeled[index] = (splitSeqLabel(relabeled[index])[0] + '/' +
                                    str(resnums[0]) + '-' + str(resnums[-1]))

    from .analysis import calcMSAOccupancy, uniqueSequences

    rows = None
    if rowocc is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        try:
            rowocc = float(rowocc)
        except Exception as err:
            raise TypeError('rowocc must be a float ({0})'.format(str(err)))
        assert 0. <= rowocc <= 1., 'rowocc must be between 0 and 1'

        rows = calcMSAOccupancy(arr, 'row') >= rowocc
        if index is not None:
            # position of the reference sequence among the rows that are kept
            index = rows[:index].sum()
        rows = (rows).nonzero()[0]
        arr = arr[rows]
        title.append('rowocc>=' + str(rowocc))
        LOGGER.report(
            'Row occupancy refinement reduced number of rows from '
            '{0} to {1} in %.2fs.'.format(before, arr.shape[0]), '_refine')

    if seqid is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        unique = uniqueSequences(arr, seqid)
        if index is not None:
            # always keep the reference sequence
            unique[index] = True
        unique = unique.nonzero()[0]
        arr = arr[unique]
        title.append('seqid>=' + str(seqid))
        if rows is not None:
            rows = rows[unique]
        else:
            rows = unique
        LOGGER.report(
            'Sequence identity refinement reduced number of rows '
            'from {0} to {1} in %.2fs.'.format(before, arr.shape[0]),
            '_refine')

    if colocc is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        try:
            colocc = float(colocc)
        except Exception as err:
            raise TypeError('colocc must be a float ({0})'.format(str(err)))
        assert 0. <= colocc <= 1., 'colocc must be between 0 and 1'

        cols = (calcMSAOccupancy(arr, 'col') >= colocc).nonzero()[0]
        arr = arr.take(cols, 1)
        title.append('colocc>=' + str(colocc))
        LOGGER.report(
            'Column occupancy refinement reduced number of columns '
            'from {0} to {1} in %.2fs.'.format(before, arr.shape[1]),
            '_refine')

    if not title:
        raise ValueError(
            'label, index, seqid, rowocc, colocc all cannot be None')

    # depending on slicing of rows, arr may not have its own memory
    if arr.base is not None:
        arr = arr.copy()

    if msa is None:
        return arr
    else:
        source = msa._labels if relabeled is None else relabeled
        if rows is None:
            from copy import copy
            labels = copy(source)
        else:
            labels = [source[i] for i in rows]
        return MSA(arr,
                   title=msa.getTitle() +
                   ' refined ({0})'.format(', '.join(title)),
                   labels=labels)
def getResnums(self, index):
    """Return starting and ending residue numbers (:term:`resnum`) for the
    sequence at given *index*.

    *index* is first translated through ``self._mapping``; a value that is
    not a key of the mapping is used as is.  The result is everything
    :func:`splitSeqLabel` returns for the label after its first item."""

    row = self._mapping.get(index, index)
    parts = splitSeqLabel(self._labels[row])
    return parts[1:]