def __or__(self, other):

    if self is other:
        return self

    try:
        ag = other.getAtomGroup()
    except AttributeError:
        raise TypeError('other must be an AtomPointer')

    if self._ag != ag:
        raise ValueError('both selections must be from the same AtomGroup')

    acsi = self.getACSIndex()
    if acsi != other.getACSIndex():
        LOGGER.warn('Active coordinate set indices do not match, it will '
                    'be set to zero.')
        acsi = 0

    indices = unique(concatenate((self._getIndices(),
                                  other._getIndices())))
    if indices[-1] == atommap.DUMMY:
        indices = indices[:-1]
    return Selection(self._ag, indices,
                     '({0}) or ({1})'.format(self.getSelstr(),
                                             other.getSelstr()),
                     acsi, unique=True)
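# A minimal usage sketch for the union operator above, assuming an AtomGroup
# parsed elsewhere with ProDy's parsePDB; the identifier is illustrative:
#
#     ag = parsePDB('1ubi')
#     merged = ag.select('backbone') | ag.select('water')  # calls __or__
#     print(merged.getSelstr())       # "(backbone) or (water)"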
def calcTransformation(mobile, target, weights=None):
    """Returns a :class:`Transformation` instance which, when applied to the
    atoms in *mobile*, minimizes the weighted RMSD between *mobile* and
    *target*.  *mobile* and *target* may be NumPy coordinate arrays, or
    :class:`.Atomic` instances, e.g. :class:`.AtomGroup`, :class:`.Chain`,
    or :class:`.Selection`."""

    if not isinstance(mobile, np.ndarray):
        try:
            mob = mobile._getCoords()
        except AttributeError:
            raise TypeError('mobile must be a numpy array or an object '
                            'with getCoords method')
    else:
        mob = mobile
    if not isinstance(target, np.ndarray):
        try:
            tar = target._getCoords()
        except AttributeError:
            raise TypeError('target must be a numpy array or an object '
                            'with getCoords method')
    else:
        tar = target

    if mob.shape != tar.shape:
        raise ValueError('reference and target coordinate arrays '
                         'must have same number of atoms')

    if mob.shape[1] != 3:
        raise ValueError('reference and target must be coordinate arrays')

    if weights is None:
        if isinstance(mobile, AtomMap):
            LOGGER.warn('mobile is an AtomMap instance; consider assigning '
                        'weights=mobile.getFlags("mapped") if there are '
                        'dummy atoms in mobile')
        if isinstance(target, AtomMap):
            LOGGER.warn('target is an AtomMap instance; consider assigning '
                        'weights=target.getFlags("mapped") if there are '
                        'dummy atoms in target')
    else:
        weights = checkWeights(weights, mob.shape[0])

    return Transformation(*getTransformation(mob, tar, weights))
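# Usage sketch, assuming two structures with matching C-alpha atom counts;
# the PDB identifiers are illustrative only:
#
#     mobile = parsePDB('1p38')
#     target = parsePDB('1zz2')
#     t = calcTransformation(mobile.calpha, target.calpha)
#     t.apply(mobile)                 # superposes mobile onto target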
def pathPDBFolder(folder=None, divided=False):
    """Returns or specifies the local PDB folder for storing PDB files
    downloaded from `wwPDB <http://www.wwpdb.org/>`_ servers.  Files stored
    in this folder can be accessed via :func:`.fetchPDB` from any working
    directory.  To release the current folder, pass an invalid path, e.g.
    ``folder=''``.

    If *divided* is **True**, the divided folder structure of wwPDB servers
    will be assumed when reading from and writing to the local folder.  For
    example, a structure with identifier **1XYZ** will be present as
    :file:`pdblocalfolder/yz/pdb1xyz.pdb.gz`.

    If *divided* is **False**, a plain folder structure will be expected and
    adopted when saving files.  For example, the same structure will be
    present as :file:`pdblocalfolder/1xyz.pdb.gz`.

    Finally, in either case, lower case letters will be used and compressed
    files will be stored."""

    if folder is None:
        folder = SETTINGS.get('pdb_local_folder')
        if folder:
            if isdir(folder):
                return folder, SETTINGS.get('pdb_local_divided', True)
            else:
                LOGGER.warn('PDB local folder {0} is not accessible.'
                            .format(repr(folder)))
    else:
        if isdir(folder):
            folder = abspath(folder)
            LOGGER.info('Local PDB folder is set: {0}'.format(repr(folder)))
            if divided:
                LOGGER.info('wwPDB divided folder structure will be '
                            'assumed.')
            else:
                LOGGER.info('A plain folder structure will be assumed.')
            SETTINGS['pdb_local_folder'] = folder
            SETTINGS['pdb_local_divided'] = bool(divided)
            SETTINGS.save()
        else:
            current = SETTINGS.pop('pdb_local_folder')
            if current:
                LOGGER.info('PDB folder {0} is released.'
                            .format(repr(current)))
                SETTINGS.pop('pdb_local_divided')
                SETTINGS.save()
            else:
                raise IOError('{0} is not a valid path.'
                              .format(repr(folder)))
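# Usage sketch following the docstring above; the path is an assumption for
# illustration:
#
#     pathPDBFolder('/home/user/pdb')      # set a plain local folder
#     folder, divided = pathPDBFolder()    # query the current setting
#     pathPDBFolder('')                    # invalid path releases the folder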
def checkIdentifiers(*pdb):
    """Check whether *pdb* identifiers are valid, and replace invalid ones
    with **None**.  A list of identifiers is returned."""

    identifiers = []
    append = identifiers.append
    for pid in pdb:
        try:
            pid = pid.strip().lower()
        except AttributeError:
            LOGGER.warn('{0} is not a valid identifier.'.format(repr(pid)))
            append(None)
        else:
            if not (len(pid) == 4 and pid.isalnum()):
                LOGGER.warn('{0} is not a valid identifier.'
                            .format(repr(pid)))
                append(None)
            else:
                append(pid)
    return identifiers
def addNonstdAminoacid(resname, *properties):
    """Add non-standard amino acid *resname* with *properties* selected
    from:

      * {props}

    .. ipython:: python

       addNonstdAminoacid('PTR', 'acidic', 'aromatic', 'cyclic', 'large',
           'polar', 'surface')

    Default set of non-standard amino acids can be restored as follows:

    .. ipython:: python

       flagDefinition(reset='nonstdaa')"""

    resname = str(resname)
    if len(resname) > 4:
        LOGGER.warn('Residue name {0} is unusually long.'
                    .format(repr(resname)))
    propset = set(properties)
    for cat, val in CATEGORIES.items():
        intersection = val.intersection(propset)
        if intersection:
            if len(intersection) > 1:
                raise ValueError('amino acid properties {0} cannot be '
                                 'present together'.format(', '.join(
                                     [repr(prp) for prp in intersection])))
            for prop in intersection:
                propset.remove(prop)
    if propset:
        raise ValueError('amino acid property {0} is not valid'
                         .format(repr(propset.pop())))

    nonstd = SETTINGS.get(NONSTANDARD_KEY, NONSTANDARD)
    nonstd[resname] = set(properties)
    updateNonstandard(nonstd)
def fetchPDBviaFTP(*pdb, **kwargs):
    """Retrieve PDB (default), PDBML, mmCIF, or EMD file(s) for specified
    *pdb* identifier(s) and return path(s).  Downloaded files will be stored
    in local PDB folder, if one is set using :meth:`.pathPDBFolder`, and
    copied into *folder*, if specified by the user.  If no destination
    folder is specified, files will be saved in the current working
    directory.  If *compressed* is **False**, decompressed files will be
    copied into *folder*.  *format* keyword argument can be used to retrieve
    `PDBML <http://pdbml.pdb.org/>`_, `mmCIF <http://mmcif.pdb.org/>`_ and
    `EMD <ftp://ftp.wwpdb.org/pub/emdb/doc/Map-format/current/EMDB_map_format.pdf>`_
    files: ``format='cif'`` will fetch an mmCIF file, ``format='emd'`` will
    fetch an EMD file, and ``format='xml'`` will fetch a PDBML file.  If a
    PDBML header file is desired, ``noatom=True`` argument will do the
    job."""

    if kwargs.get('check', True):
        identifiers = checkIdentifiers(*pdb)
    else:
        identifiers = list(pdb)

    output_folder = kwargs.pop('folder', None)
    compressed = bool(kwargs.pop('compressed', True))
    format = str(kwargs.pop('format', 'pdb')).lower()
    noatom = bool(kwargs.pop('noatom', False))

    if format == 'pdb':
        ftp_divided = 'pdb/data/structures/divided/pdb'
        ftp_pdbext = '.ent.gz'
        ftp_prefix = 'pdb'
        extension = '.pdb'
    elif format == 'xml':
        if noatom:
            ftp_divided = 'pdb/data/structures/divided/XML-noatom'
            ftp_pdbext = '-noatom.xml.gz'
            extension = '-noatom.xml'
        else:
            ftp_divided = 'pdb/data/structures/divided/XML'
            ftp_pdbext = '.xml.gz'
            extension = '.xml'
        ftp_prefix = ''
    elif format == 'cif':
        ftp_divided = 'pdb/data/structures/divided/mmCIF'
        ftp_pdbext = '.cif.gz'
        ftp_prefix = ''
        extension = '.cif'
    elif format == 'emd' or format == 'map':
        ftp_divided = 'emdb/structures'
        ftp_pdbext = '.map.gz'
        ftp_prefix = 'emd_'
        extension = '.map'
    else:
        raise ValueError(repr(format) + ' is not a valid format')

    local_folder = pathPDBFolder()
    if format == 'pdb' and local_folder:
        local_folder, is_divided = local_folder
        if is_divided:
            getPath = lambda pdb: join(makePath(join(local_folder,
                                                     pdb[1:3])),
                                       'pdb' + pdb + '.pdb.gz')
        else:
            getPath = lambda pdb: join(local_folder, pdb + '.pdb.gz')
        if output_folder is None:
            second = lambda filename, pdb: filename
        else:
            if compressed:
                second = lambda filename, pdb: (copyFile(filename,
                                                         join(output_folder,
                                                              pdb + extension +
                                                              '.gz')))
            else:
                second = lambda filename, pdb: gunzip(filename,
                                                      join(output_folder,
                                                           pdb + extension))
    else:
        if output_folder is None:
            output_folder = getcwd()
        if compressed:
            getPath = lambda pdb: join(output_folder,
                                       pdb + extension + '.gz')
            second = lambda filename, pdb: filename
        else:
            getPath = lambda pdb: join(output_folder, pdb + extension)
            second = lambda filename, pdb: gunzip(getPath(pdb), getPath(pdb))

    ftp_name, ftp_host, ftp_path = WWPDB_FTP_SERVERS[wwPDBServer() or 'us']
    LOGGER.debug('Connecting wwPDB FTP server {0}.'.format(ftp_name))

    from ftplib import FTP
    try:
        ftp = FTP(ftp_host)
    except Exception as error:
        raise type(error)('FTP connection problem, potential reason: '
                          'no internet connectivity')
    else:
        success = 0
        failure = 0
        filenames = []
        ftp.login('')
        for pdb in identifiers:
            if pdb is None:
                filenames.append(None)
                continue
            data = []
            ftp_fn = ftp_prefix + pdb + ftp_pdbext
            try:
                ftp.cwd(ftp_path)
                ftp.cwd(ftp_divided)
                if format == 'emd':
                    ftp.cwd('EMD-{0}/map'.format(pdb))
                else:
                    ftp.cwd(pdb[1:3])
                ftp.retrbinary('RETR ' + ftp_fn, data.append)
            except Exception as error:
                if ftp_fn in ftp.nlst():
                    LOGGER.warn('{0} download failed ({1}). It is '
                                'possible that you do not have rights to '
                                'download .gz files in the current network.'
                                .format(pdb, str(error)))
                else:
                    LOGGER.info('{0} download failed. {1} does not exist '
                                'on {2}.'.format(ftp_fn, pdb, ftp_host))
                failure += 1
                filenames.append(None)
            else:
                if len(data):
                    filename = getPath(pdb)

                    with open(filename, 'w+b') as pdbfile:
                        write = pdbfile.write
                        [write(block) for block in data]

                    filename = normpath(relpath(second(filename, pdb)))
                    LOGGER.debug('{0} downloaded ({1})'
                                 .format(pdb, sympath(filename)))
                    success += 1
                    filenames.append(filename)
                else:
                    LOGGER.warn('{0} download failed, reason unknown.'
                                .format(pdb))
                    failure += 1
                    filenames.append(None)

        ftp.quit()

    LOGGER.debug('PDB download via FTP completed ({0} downloaded, '
                 '{1} failed).'.format(success, failure))
    if len(identifiers) == 1:
        return filenames[0]
    else:
        return filenames
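# Usage sketch (network access assumed); the identifiers are illustrative:
#
#     fn = fetchPDBviaFTP('1ubi')                       # compressed PDB
#     fns = fetchPDBviaFTP('1ubi', '2k39', format='cif',
#                          compressed=False)            # decompressed mmCIF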
def parseImagesFromSTAR(particlesSTAR, **kwargs):
    """Parses particle images using data from a STAR file containing
    information about them.

    :arg particlesSTAR: a filename for a STAR file
    :type particlesSTAR: str

    :arg block_indices: indices for data blocks containing rows
        corresponding to images of interest.  The indexing scheme is
        similar to that for numpy arrays.  Default behavior is to use all
        data blocks about images.
    :type block_indices: list, :class:`~numpy.ndarray`

    :arg row_indices: indices for rows corresponding to images of interest.
        The indexing scheme is similar to that for numpy arrays.
        row_indices should be a 1D or 2D array-like.  2D row_indices should
        contain an entry for each relevant loop.  If a 1D array-like is
        given, the same row indices will be applied to all loops.  Default
        behavior is to use all rows about images.
    :type row_indices: list, :class:`~numpy.ndarray`

    :arg particle_indices: indices for particles regardless of STAR
        structure.  Default is to take all particles.
        Please note: this acts after block_indices and row_indices.
    :type particle_indices: list, :class:`~numpy.ndarray`

    :arg saveImageArrays: whether to save the numpy array for each image
        to file.  Default is False.
    :type saveImageArrays: bool

    :arg saveDirectory: directory where numpy image arrays are saved.
        Default is None, which means save to the current working directory.
    :type saveDirectory: str, None

    :arg rotateImages: whether to apply in-plane translations and rotations
        using provided psi and origin data.  Default is True.
    :type rotateImages: bool
    """
    try:
        from skimage.transform import rotate
    except ImportError:
        raise ImportError('This function requires scikit-image.')

    block_indices = kwargs.get('block_indices', None)
    # No loop_indices because data blocks about particle images contain
    # one loop
    row_indices = kwargs.get('row_indices', None)
    particle_indices = kwargs.get('particle_indices', None)

    saveImageArrays = kwargs.get('saveImageArrays', False)
    saveDirectory = kwargs.get('saveDirectory', None)
    rotateImages = kwargs.get('rotateImages', True)

    try:
        particlesSTAR = parseSTAR(particlesSTAR)
    except:
        raise ValueError('particlesSTAR should be a filename for a STAR '
                         'file')

    # Check dimensions/contents of particlesSTAR and generate full indices
    dataBlocks = []
    loops = []
    maxLoops = 0
    maxRows = 0
    dataBlock_goodness = []
    for dataBlock in particlesSTAR:
        foundImageField = False
        for loop in dataBlock:
            if ('_image' in loop.fields) or ('_rlnImageName' in loop.fields):
                foundImageField = True
                loops.append(loop)
                if loop.numRows > maxRows:
                    maxRows = loop.numRows
            else:
                dataBlock.pop(int(loop.getTitle().split(' ')[-1]))

        if dataBlock.numLoops > maxLoops:
            maxLoops = dataBlock.numLoops

        if foundImageField:
            dataBlocks.append(dataBlock)
            dataBlock_goodness.append(True)
        else:
            dataBlock_goodness.append(False)

    indices = np.zeros((len(dataBlocks), maxLoops, maxRows, 3), dtype=int)
    i = -1
    for n, dataBlock in enumerate(particlesSTAR):
        if dataBlock_goodness[n]:
            i += 1
            for j, loop in enumerate(dataBlock):
                for k in range(maxRows):
                    if k < loop.numRows:
                        indices[i, j, k] = np.array([n, j, k])
                    else:
                        indices[i, j, k] = np.array([0, 0, 0])

    dataBlocks = np.array(dataBlocks)
    loops = np.array(loops)

    # Convert keyword indices to valid indices if possible
    if block_indices is not None:
        if np.array_equal(dataBlocks, np.array([])):
            raise TypeError('particlesSTAR must have data blocks to use '
                            'block_indices')

        try:
            block_indices = np.array(block_indices)
        except:
            raise TypeError('block_indices should be array-like')

        if block_indices.ndim != 1:
            raise ValueError('block_indices should be a 1-dimensional '
                             'array-like')

        for i, index in enumerate(list(reversed(block_indices))):
            try:
                block = particlesSTAR[index]
                if not isinstance(block, StarDataBlock):
                    LOGGER.warn('There is no block corresponding to '
                                'block_index {0}. This index has been '
                                'removed.'.format(
                                    block_indices.shape[0] - i - 1))
                    block_indices = np.delete(block_indices, i, 0)
            except:
                LOGGER.warn('There is no block corresponding to '
                            'block_index {0}. This index has been '
                            'removed.'.format(
                                block_indices.shape[0] - i - 1))
                block_indices = np.delete(block_indices, i, 0)

        if not np.array_equal(block_indices, np.array([])):
            indices = np.concatenate(([indices[np.where(
                indices[:, 0, 0, 0] == item)] for item in block_indices]),
                axis=0)
        else:
            LOGGER.warn('None of the block_indices corresponded to '
                        'dataBlocks. Default block indices corresponding '
                        'to all dataBlocks will be used instead.')

        dataBlocks = particlesSTAR[block_indices]

    if row_indices is not None:
        try:
            row_indices = np.array(row_indices)
        except:
            raise TypeError('row_indices should be array-like')

        if row_indices.ndim == 1:
            if isinstance(row_indices[0], int):
                # row_indices provided was truly 1D so
                # we will use same row indices for all data blocks
                # and warn the user we are doing so
                if len(dataBlocks) != 1:
                    LOGGER.warn('row_indices is 1D but there are multiple '
                                'data blocks so the same row indices will '
                                'be used for each')
                row_indices = np.array([row_indices
                                        for i in range(len(dataBlocks))])
                # This also works if len(dataBlocks) == 1
            elif isinstance(row_indices[0], (list, tuple)):
                # A list-like of list-likes of different sizes was provided.
                # We turn it into a proper 2D array by filling the short
                # list-likes with zeros
                if len(row_indices) != len(dataBlocks):
                    raise ValueError('There should be an entry in row '
                                     'indices for each data block')

                max_len = 0
                for entry in row_indices:
                    if not np.isscalar(entry):
                        if len(entry) > max_len:
                            max_len = len(entry)

                row_indices_list_entries = []
                for entry in row_indices:
                    if isinstance(entry, int):
                        list_entry = [entry]
                    else:
                        list_entry = list(entry)
                    while len(list_entry) < max_len:
                        list_entry.append(0)
                    row_indices_list_entries.append(list_entry)
                row_indices = np.array(row_indices_list_entries)

        elif row_indices.ndim == 2:
            # A list-like of list-likes of the same size was provided
            if row_indices.shape[0] != len(dataBlocks):
                if len(row_indices) == 1:
                    # we will use same row indices for all data blocks
                    # and warn the user we are doing so
                    if len(dataBlocks) != 1:
                        LOGGER.warn('row_indices has one entry but there '
                                    'are multiple data blocks so the same '
                                    'row indices will be used for each')
                    row_indices = np.array([row_indices[0]
                                            for i in range(len(dataBlocks))])
                    # This also works if len(dataBlocks) == 1
                else:
                    raise ValueError('There should be an entry in row '
                                     'indices for each data block')
        else:
            raise ValueError('row_indices should be 1D or 2D array-like '
                             'objects')

        # indices need updating
        good_indices_list = []
        for i, index_i in enumerate(indices):
            good_indices_list.append([])
            for j, index_j in enumerate(index_i):
                good_indices_list[i].append([])
                for r, index_r in enumerate(row_indices[i]):
                    for k, index_k in enumerate(index_j):
                        if k == index_r:
                            if not (r != 0 and index_r == 0):
                                good_indices_list[i][j].append(index_k)
                        else:
                            good_indices_list[i][j].append(
                                np.array([0, 0, 0]))

        indices = np.array(good_indices_list)

    if np.array_equal(indices, np.array([])):
        raise ValueError('selection does not contain any rows with image '
                         'fields')

    # Use indices to collect particle data dictionaries
    particles = []
    for i, index_i in enumerate(indices):
        for j, index_j in enumerate(index_i):
            for k, index_k in enumerate(index_j):
                if not (np.array_equal(index_k, np.array([0, 0, 0]))
                        and not (i == 0 and j == 0 and k == 0)):
                    particles.append(
                        particlesSTAR[index_k[0]][index_k[1]][index_k[2]])

    if particle_indices is None:
        particle_indices = list(range(len(particles)))

    # Parse images using particle dictionaries
    image_stacks = dict()
    images = []
    parsed_images_data = []
    stk_images = []

    if particlesSTAR._prog == 'XMIPP':
        imageFieldKey = '_image'
    else:
        imageFieldKey = '_rlnImageName'

    for i in particle_indices:
        particle = particles[i]

        try:
            image_field = particle[imageFieldKey]
            image_index = int(image_field.split('@')[0]) - 1
            filename = image_field.split('@')[1]
        except:
            raise ValueError('particlesSTAR does not contain data about '
                             'particle image {0} location in either RELION '
                             'or XMIPP format'.format(i))

        if filename.endswith('.stk'):
            stk_images.append(str(i))
            continue

        if not filename in list(image_stacks.keys()):
            image_stacks[filename] = parseEMD(filename).density

        image = image_stacks[filename][image_index]
        parsed_images_data.append(image_field)

        if saveImageArrays:
            if saveDirectory is not None:
                np.save('{0}/{1}'.format(saveDirectory, i), image)
            else:
                np.save('{0}'.format(i), image)

        if rotateImages:
            if particlesSTAR._prog == 'RELION':
                anglePsi = float(particle['_rlnAnglePsi'])
                originX = float(particle['_rlnOriginX'])
                originY = float(particle['_rlnOriginY'])
            elif particlesSTAR._prog == 'XMIPP':
                anglePsi = float(particle['_anglePsi'])
                originX = float(particle['_shiftX'])
                originY = float(particle['_shiftY'])
            images.append(rotate(image, anglePsi,
                                 center=(float(image.shape[0]) - originX,
                                         float(image.shape[1]) - originY)))
        else:
            images.append(image)

    if len(stk_images) > 0:
        LOGGER.warn('ProDy currently cannot parse images from XMIPP .stk '
                    'files. Please be aware that images {0} and {1} will '
                    'be missing from the final array.'
                    .format(', '.join(stk_images[:-1]), stk_images[-1]))

    return np.array(images), parsed_images_data
def parseMMCIFStream(stream, **kwargs):
    """Returns an :class:`.AtomGroup` and/or a :class:`.StarDict`
    containing header data parsed from a stream of CIF lines.

    :arg stream: Anything that implements the method ``readlines``
        (e.g. :class:`file`, buffer, stdin)"""

    model = kwargs.get('model')
    subset = kwargs.get('subset')
    chain = kwargs.get('chain')
    altloc = kwargs.get('altloc', 'A')
    header = kwargs.get('header', False)

    if model is not None:
        if isinstance(model, int):
            if model < 0:
                raise ValueError('model must be greater than 0')
        else:
            raise TypeError('model must be an integer, {0} is invalid'
                            .format(str(model)))
    title_suffix = ''

    if subset:
        try:
            subset = _PDBSubsets[subset.lower()]
        except AttributeError:
            raise TypeError('subset must be a string')
        except KeyError:
            raise ValueError('{0} is not a valid subset'
                             .format(repr(subset)))
        title_suffix = '_' + subset

    if chain is not None:
        if not isinstance(chain, str):
            raise TypeError('chain must be a string')
        elif len(chain) == 0:
            raise ValueError('chain must not be an empty string')
        title_suffix = '_' + chain + title_suffix

    ag = None
    if 'ag' in kwargs:
        ag = kwargs['ag']
        if not isinstance(ag, AtomGroup):
            raise TypeError('ag must be an AtomGroup instance')
        n_csets = ag.numCoordsets()
    elif model != 0:
        ag = AtomGroup(str(kwargs.get('title', 'Unknown')) + title_suffix)
        n_csets = 0

    if model != 0:
        LOGGER.timeit()
        try:
            lines = stream.readlines()
        except AttributeError as err:
            try:
                lines = stream.read().split('\n')
            except AttributeError:
                raise err
        if not len(lines):
            raise ValueError('empty PDB file or stream')
        if header:
            ag, header = _parseMMCIFLines(ag, lines, model, chain, subset,
                                          altloc, header)
        else:
            ag = _parseMMCIFLines(ag, lines, model, chain, subset,
                                  altloc, header)
        if ag.numAtoms() > 0:
            LOGGER.report('{0} atoms and {1} coordinate set(s) were '
                          'parsed in %.2fs.'.format(
                              ag.numAtoms(),
                              ag.numCoordsets() - n_csets))
        else:
            ag = None
            LOGGER.warn('Atomic data could not be parsed, please '
                        'check the input file.')
    if header:
        return ag, StarDict(*header,
                            title=str(kwargs.get('title', 'Unknown')))
    return ag
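# Usage sketch, assuming a local mmCIF file; the filename is illustrative:
#
#     with open('1ubi.cif') as stream:
#         ag = parseMMCIFStream(stream, chain='A')         # AtomGroup only
#     with open('1ubi.cif') as stream:
#         ag, hdr = parseMMCIFStream(stream, header=True)  # plus StarDict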
def refineEnsemble(ensemble, lower=.5, upper=10., **kwargs):
    """Refine a :class:`.PDBEnsemble` based on RMSD criteria.

    :arg ensemble: the ensemble to be refined
    :type ensemble: :class:`.Ensemble`, :class:`.PDBEnsemble`

    :arg lower: the smallest allowed RMSD between two conformations
        with the exception of **protected**
    :type lower: float

    :arg upper: the highest allowed RMSD between two conformations
        with the exception of **protected**
    :type upper: float

    :keyword protected: a list of either the indices or labels of the
        conformations needed to be kept in the refined ensemble
    :type protected: list

    :arg ref: the index or label of the reference conformation which will
        also be kept. Default is 0
    :type ref: int or str
    """

    protected = kwargs.pop('protected', [])

    P = []
    if len(protected):
        labels = ensemble.getLabels()
        for p in protected:
            if isinstance(p, Integral):
                i = p
            else:
                if p in labels:
                    i = labels.index(p)
                else:
                    LOGGER.warn('could not find any conformation with the '
                                'label %s in the ensemble' % str(p))
                    continue
            P.append(i)

    LOGGER.timeit('_prody_refineEnsemble')

    from numpy import argsort

    ### obtain reference index
    # rmsd = ensemble.getRMSDs()
    # ref_i = np.argmin(rmsd)
    ref_i = kwargs.pop('ref', 0)
    if isinstance(ref_i, Integral):
        pass
    elif isinstance(ref_i, str):
        labels = ensemble.getLabels()
        ref_i = labels.index(ref_i)
    else:
        LOGGER.warn('could not find any conformation with the label %s in '
                    'the ensemble' % str(ref_i))
    if not ref_i in P:
        P = [ref_i] + P

    ### calculate pairwise RMSDs ###
    RMSDs = ensemble.getRMSDs(pairwise=True)

    def getRefinedIndices(A):
        deg = A.sum(axis=0)
        sorted_indices = list(argsort(deg))
        # sorted_indices = P + [x for x in sorted_indices if x not in P]
        sorted_indices.remove(ref_i)
        sorted_indices.insert(0, ref_i)

        n_confs = ensemble.numConfs()
        isdel_temp = np.zeros(n_confs)
        for a in range(n_confs):
            i = sorted_indices[a]
            for b in range(n_confs):
                if a >= b:
                    continue
                j = sorted_indices[b]
                if isdel_temp[i] or isdel_temp[j]:
                    continue
                else:
                    if A[i, j]:
                        # isdel_temp[j] = 1
                        if not j in P:
                            isdel_temp[j] = 1
                        elif not i in P:
                            isdel_temp[i] = 1
        temp_list = isdel_temp.tolist()
        ind_list = []
        for i in range(n_confs):
            if not temp_list[i]:
                ind_list.append(i)
        return ind_list

    L = list(range(len(ensemble)))
    U = list(range(len(ensemble)))
    if lower is not None:
        A = RMSDs < lower
        L = getRefinedIndices(A)

    if upper is not None:
        B = RMSDs > upper
        U = getRefinedIndices(B)

    # find common indices from L and U
    I = list(set(L) - (set(L) - set(U)))

    # for p in P:
    #     if p not in I:
    #         I.append(p)
    I.sort()
    reens = ensemble[I]

    LOGGER.report('Ensemble was refined in %.2fs.',
                  '_prody_refineEnsemble')
    LOGGER.info('%d conformations were removed from ensemble.'
                % (len(ensemble) - len(I)))

    return reens
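# Usage sketch, assuming `ens` is a PDBEnsemble built elsewhere; the
# protected label is illustrative:
#
#     refined = refineEnsemble(ens, lower=0.5, upper=10.0,
#                              protected=['1ubi_A'], ref=0)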
def buildBiomolecules(header, atoms, biomol=None):
    """Returns *atoms* after applying biomolecular transformations from
    *header* dictionary.  Biomolecular transformations are applied to all
    coordinate sets in the molecule.

    Some PDB files contain transformations for more than one biomolecule.
    A specific set of transformations can be chosen using *biomol*
    argument.  Transformation sets are identified by numbers, e.g. ``"1"``,
    ``"2"``, ...

    If multiple biomolecular transformations are provided in the *header*
    dictionary, biomolecules will be returned as :class:`.AtomGroup`
    instances in a :func:`list`.

    If the resulting biomolecule has more than 26 chains, the molecular
    assembly will be split into multiple :class:`.AtomGroup` instances each
    containing at most 26 chains.  These :class:`.AtomGroup` instances will
    be returned in a tuple.

    Note that atoms in biomolecules are ordered according to chain
    identifiers."""

    if not isinstance(header, dict):
        raise TypeError('header must be a dictionary')

    if not isinstance(atoms, Atomic):
        raise TypeError('atoms must be an Atomic instance')

    biomt = header.get('biomoltrans')
    if not isinstance(biomt, dict) or len(biomt) == 0:
        LOGGER.warn('no biomolecular transformations found so original '
                    'structure was used')
        return atoms

    if not isinstance(atoms, AtomGroup):
        atoms = atoms.copy()

    biomols = []
    if biomol is None:
        keys = list(biomt)
    else:
        biomol = str(biomol)
        if biomol in biomt:
            keys = [biomol]
        else:
            LOGGER.warn('Transformations for biomolecule {0} were not '
                        'found in the header dictionary.'.format(biomol))
            return None

    keys.sort()
    for i in keys:
        segnm = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ' * 20)
        ags = []
        mt = biomt[i]
        # mt is a list in which each transformation takes four items: the
        # first is the list of chain identifiers, and the next three are
        # the rows of the transformation matrix, so mt must have 4n items
        if (len(mt)) % 4 != 0:
            LOGGER.warn('Biomolecular transformations {0} were not '
                        'applied'.format(i))
            continue

        for times in range(int((len(mt)) / 4)):
            rotation = np.zeros((3, 3))
            translation = np.zeros(3)
            line0 = np.fromstring(mt[times * 4 + 1], sep=' ')
            rotation[0, :] = line0[:3]
            translation[0] = line0[3]
            line1 = np.fromstring(mt[times * 4 + 2], sep=' ')
            rotation[1, :] = line1[:3]
            translation[1] = line1[3]
            line2 = np.fromstring(mt[times * 4 + 3], sep=' ')
            rotation[2, :] = line2[:3]
            translation[2] = line2[3]
            t = Transformation(rotation, translation)

            newag = atoms.select('chain ' +
                                 ' '.join(mt[times * 4 + 0])).copy()
            if newag is None:
                continue
            newag.all.setSegnames(segnm.pop(0))
            for acsi in range(newag.numCoordsets()):
                newag.setACSIndex(acsi)
                newag = t.apply(newag)
            newag.setACSIndex(0)
            ags.append(newag)

        if ags:
            newag = ags.pop(0)
            while ags:
                newag += ags.pop(0)
            newag.setTitle('{0} biomolecule {1}'
                           .format(atoms.getTitle(), i))
            biomols.append(newag)

    if biomols:
        if len(biomols) == 1:
            return biomols[0]
        else:
            return biomols
    else:
        return None
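# Usage sketch: parsePDB with header=True supplies the dictionary carrying
# the 'biomoltrans' entry consumed above; the identifier is illustrative:
#
#     atoms, header = parsePDB('1out', header=True)
#     biomol = buildBiomolecules(header, atoms)   # AtomGroup or list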
def fetchPDB(*pdb, **kwargs):
    """Returns path(s) to PDB file(s) for specified *pdb* identifier(s).
    Files will be sought in user specified *folder* or current working
    directory, and then in local PDB folder and mirror, if they are
    available.  If *copy* is set **True**, files will be copied into
    *folder*.  If *compressed* is **False**, all files will be decompressed
    into *folder*.  See :func:`pathPDBFolder` and :func:`pathPDBMirror` for
    managing local resources, :func:`.fetchPDBviaFTP` and
    :func:`.fetchPDBviaHTTP` for downloading files from PDB servers."""

    if len(pdb) == 1 and isinstance(pdb[0], list):
        pdb = pdb[0]

    if 'format' in kwargs and kwargs.get('format') != 'pdb':
        return fetchPDBviaFTP(*pdb, **kwargs)

    identifiers = checkIdentifiers(*pdb)

    folder = kwargs.get('folder', '.')
    compressed = kwargs.get('compressed')

    # check *folder* specified by the user, usually pwd ('.')
    filedict = findPDBFiles(folder, compressed=compressed)

    filenames = []
    not_found = []
    exists = 0
    for i, pdb in enumerate(identifiers):
        if pdb is None:
            filenames.append(None)
        elif pdb in filedict:
            filenames.append(filedict[pdb])
            exists += 1
        else:
            filenames.append(None)
            not_found.append((i, pdb))

    if not not_found:
        if len(filenames) == 1:
            filenames = filenames[0]
            if exists:
                LOGGER.debug('PDB file is found in working directory '
                             '({0}).'.format(sympath(filenames)))
        return filenames

    if not isWritable(folder):
        raise IOError('permission to write in {0} is denied, please '
                      'specify another folder'.format(folder))

    if compressed is not None and not compressed:
        filedict = findPDBFiles(folder, compressed=True)
        not_found, decompress = [], not_found
        for i, pdb in decompress:
            if pdb in filedict:
                fn = filedict[pdb]
                filenames[i] = gunzip(fn, splitext(fn)[0])
            else:
                not_found.append((i, pdb))

        if not not_found:
            return filenames[0] if len(identifiers) == 1 else filenames

    local_folder = pathPDBFolder()
    copy = kwargs.setdefault('copy', False)
    if local_folder:
        local_folder, is_divided = local_folder
        temp, not_found = not_found, []
        for i, pdb in temp:
            if is_divided:
                fn = join(local_folder, pdb[1:3], 'pdb' + pdb + '.pdb.gz')
            else:
                fn = join(local_folder, pdb + '.pdb.gz')

            if isfile(fn):
                if copy or not compressed and compressed is not None:
                    if compressed:
                        fn = copyFile(fn, join(folder, pdb + '.pdb.gz'))
                    else:
                        fn = gunzip(fn, join(folder, pdb + '.pdb'))
                filenames[i] = normpath(fn)
            else:
                not_found.append((i, pdb))

        if not not_found:
            if len(identifiers) == 1:
                fn = filenames[0]
                items = fn.split(pathsep)
                if len(items) > 5:
                    fndisp = pathsep.join(items[:3] + ['...'] + items[-1:])
                else:
                    fndisp = relpath(fn)
                LOGGER.debug('PDB file is found in the local folder '
                             '({0}).'.format(fndisp))
                return fn
            else:
                return filenames

    if kwargs['copy'] or (compressed is not None and not compressed):
        kwargs['folder'] = folder

    downloads = [pdb for i, pdb in not_found]
    fns = None

    try:
        fns = fetchPDBfromMirror(*downloads, **kwargs)
    except IOError:
        pass
    else:
        if len(downloads) == 1:
            fns = [fns]
        temp, not_found = not_found, []
        for i, fn in enumerate(fns):
            if fn is None:
                not_found.append(temp[i])
            else:
                i, _ = temp[i]
                filenames[i] = fn

    if not not_found:
        return filenames[0] if len(identifiers) == 1 else filenames

    if fns:
        downloads = [pdb for i, pdb in not_found]
    fns = None
    tp = kwargs.pop('tp', None)
    if tp is not None:
        tp = tp.lower()

    if tp == 'http':
        try:
            fns = fetchPDBviaHTTP(*downloads, check=False, **kwargs)
        except Exception as err:
            LOGGER.warn('Downloading PDB files via HTTP failed '
                        '({0}).'.format(str(err)))
    elif tp == 'ftp':
        try:
            fns = fetchPDBviaFTP(*downloads, check=False, **kwargs)
        except Exception as err:
            LOGGER.warn('Downloading PDB files via FTP failed '
                        '({0}).'.format(str(err)))
    else:
        tryHTTP = False
        try:
            fns = fetchPDBviaFTP(*downloads, check=False, **kwargs)
        except Exception as err:
            tryHTTP = True
        if fns is None or isinstance(fns, list) and None in fns:
            tryHTTP = True
        elif isinstance(fns, list):
            downloads = [not_found[i][1] for i in range(len(fns))
                         if fns[i] is None]
            if len(downloads) > 0:
                tryHTTP = True
        if tryHTTP:
            LOGGER.info('Downloading PDB files via FTP failed, '
                        'trying HTTP.')
            try:
                fns = fetchPDBviaHTTP(*downloads, check=False, **kwargs)
            except Exception as err:
                LOGGER.warn('Downloading PDB files via HTTP also failed '
                            '({0}).'.format(str(err)))

    if len(downloads) == 1:
        fns = [fns]
    if fns:
        for i, fn in zip([i for i, pdb in not_found], fns):
            filenames[i] = fn

    return filenames[0] if len(identifiers) == 1 else filenames
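# Usage sketch; identifiers are illustrative and network access is assumed
# when no local copy exists:
#
#     fn = fetchPDB('1ubi')                              # single path
#     fns = fetchPDB('1ubi', '2k39', folder='pdbs', compressed=False)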
def copy(self):
    """Returns a copy of atoms (and atomic data) in an :class:`.AtomGroup`
    instance."""

    dummies = None
    indices = None
    readonly = False
    try:
        ag = self.getAtomGroup()
    except AttributeError:
        ag = self
        readonly = True
        new = AtomGroup(ag.getTitle())
    else:
        indices = self.getIndices()
        new = AtomGroup(ag.getTitle() + ' ' + str(self))
        try:
            dummies = self.numDummies()
        except AttributeError:
            pass
        else:
            if dummies:
                dummy = self.getFlags('dummy')
                mapped = self.getFlags('mapped')

    try:
        self.getIndex()
    except AttributeError:
        this = self
    else:
        this = self.all

    if self.numCoordsets():
        new.setCoords(this.getCoordsets(), label=ag.getCSLabels())

    for label in ag.getDataLabels():
        if label in READONLY:
            if readonly:
                new._data[label] = this.getData(label)
        else:
            new.setData(label, this.getData(label))

    #if readonly:
    #    for label in READONLY:
    #        data = this.getData(label)
    #        if data is not None:
    #            new._data[label] = data

    skip_flags = set()
    for label in ag.getFlagLabels():
        if label in skip_flags:
            continue
        else:
            new._setFlags(label, this.getFlags(label))
            skip_flags.update(flags.ALIASES.get(label, [label]))
    if dummies:
        new._setFlags('dummy', dummy)
        new._setFlags('mapped', mapped)

    bonds = ag._bonds
    bmap = ag._bmap
    if bonds is not None and bmap is not None:
        if indices is None:
            new._bonds = bonds.copy()
            new._bmap = bmap.copy()
            new._data['numbonds'] = ag._data['numbonds'].copy()
        elif dummies:
            indices = indices[self._getMapping()]
            if len(set(indices)) == len(indices):
                new.setBonds(trimBonds(bonds, indices))
            else:
                LOGGER.warn('Duplicate atoms in mapping, bonds are '
                            'not copied.')
        else:
            bonds = trimBonds(bonds, indices)
            if bonds is not None:
                new.setBonds(bonds)
    return new
def writeDCD(filename, trajectory, start=None, stop=None, step=None,
             align=False):
    """Write 32-bit CHARMM format DCD file (also NAMD 2.1 and later).
    *trajectory* can be an :class:`Trajectory`, :class:`DCDFile`, or
    :class:`Ensemble` instance.  *filename* is returned upon successful
    output of file."""

    if not filename.lower().endswith('.dcd'):
        filename += '.dcd'

    if not isinstance(trajectory, (TrajBase, Ensemble, Atomic)):
        raise TypeError('{0} is not a valid type for trajectory'
                        .format(type(trajectory)))

    irange = list(range(*slice(start, stop, step)
                        .indices(trajectory.numCoordsets())))
    n_csets = len(irange)
    if n_csets == 0:
        raise ValueError('trajectory does not have any coordinate sets, '
                         'or no coordinate sets are selected')

    if isinstance(trajectory, Atomic):
        isEnsemble = False
        isAtomic = True
        n_atoms = trajectory.numAtoms()
    else:
        isEnsemble = True
        isAtomic = False
        n_atoms = trajectory.numSelected()
    if n_atoms == 0:
        raise ValueError('no atoms are selected in the trajectory')
    if isinstance(trajectory, TrajBase):
        isTrajectory = True
        unitcell = trajectory.hasUnitcell()
        nfi = trajectory.nextIndex()
        trajectory.reset()
        pack_i_48 = pack('i', 48)
        if isinstance(trajectory, Trajectory):
            timestep = trajectory.getTimestep()[0]
            first_ts = trajectory.getFirstTimestep()[0]
            framefreq = trajectory.getFrameFreq()[0]
            n_fixed = trajectory.numFixed()[0]
        else:
            timestep = trajectory.getTimestep()
            first_ts = trajectory.getFirstTimestep()
            framefreq = trajectory.getFrameFreq()
            n_fixed = trajectory.numFixed()
    else:
        isTrajectory = False
        unitcell = False
        if isinstance(trajectory, Ensemble):
            frame = trajectory[0]
        else:
            frame = trajectory
            acsi = trajectory.getACSIndex()
        timestep = 1
        first_ts = 0
        framefreq = 1
        n_fixed = 0

    dcd = DCDFile(filename, mode='w')
    LOGGER.progress('Writing DCD', len(irange), '_prody_writeDCD')
    prev = -1
    uc = None
    time_ = time()
    for j, i in enumerate(irange):
        diff = i - prev
        prev = i
        if isTrajectory:
            if diff > 1:
                trajectory.skip(diff - 1)
            frame = next(trajectory)
            if frame is None:
                break
            if unitcell:
                uc = frame._getUnitcell()
                uc[3:] = np.sin((PISQUARE / 90) * (90 - uc[3:]))
                uc = uc[[0, 3, 1, 4, 5, 2]]
        elif isEnsemble:
            frame._index = i
        else:
            frame.setACSIndex(i)
        if align:
            frame.superpose()
        if j == 0:
            dcd.write(frame._getCoords(), uc, timestep=timestep,
                      firsttimestep=first_ts, framefreq=framefreq)
        else:
            dcd.write(frame._getCoords(), uc)
        LOGGER.update(i, label='_prody_writeDCD')
    if isAtomic:
        trajectory.setACSIndex(acsi)
    j += 1
    LOGGER.finish()
    dcd.close()
    time_ = time() - time_ or 0.01
    dcd_size = 1.0 * (56 + (n_atoms * 3 + 6) * 4) * n_csets / (1024 * 1024)
    LOGGER.info('DCD file was written in {0:.2f} seconds.'.format(time_))
    LOGGER.info('{0:.2f} MB written at input rate {1:.2f} MB/s.'
                .format(dcd_size, dcd_size / time_))
    LOGGER.info('{0} coordinate sets written at output rate {1} frame/s.'
                .format(n_csets, int(n_csets / time_)))
    if j != n_csets:
        LOGGER.warn('Warning: {0} frames expected, {1} written.'
                    .format(n_csets, j))
    if isTrajectory:
        trajectory.goto(nfi)
    return filename
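# Usage sketch, assuming a trajectory or ensemble loaded elsewhere; the
# filenames are illustrative:
#
#     ens = parseDCD('mdm2.dcd')       # or any Trajectory/Ensemble/Atomic
#     writeDCD('subset.dcd', ens, start=0, stop=100, step=2, align=True)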
def fetchPDBLigand(cci, filename=None):
    """Fetch PDB ligand data from PDB_ for chemical component *cci*.
    *cci* may be a 3-letter chemical component identifier or a valid XML
    filename.  If *filename* is given, XML file will be saved with that
    name.

    If you query ligand data frequently, you may configure ProDy to save
    XML files in your computer.  Set ``ligand_xml_save`` option **True**,
    i.e. ``confProDy(ligand_xml_save=True)``.  Compressed XML files will be
    saved to ProDy package folder, e.g.
    :file:`/home/user/.prody/pdbligands`.  Each file is around 5Kb when
    compressed.

    This function is compatible with PDBx/PDBML v 4.0.

    Ligand data is returned in a dictionary.  Ligand coordinate atom data
    with *model* and *ideal* coordinate sets are also stored in this
    dictionary.  Note that this dictionary will contain data that is
    present in the XML file and all Ligand Expo XML files do not contain
    every possible data field.  So, it may be better if you use
    :meth:`dict.get` instead of indexing the dictionary, e.g. to retrieve
    formula weight (or relative molar mass) of the chemical component use
    ``data.get('formula_weight')`` instead of ``data['formula_weight']``
    to avoid exceptions when this data field is not found in the XML file.
    URL and/or path of the XML file are returned in the dictionary with
    keys ``url`` and ``path``, respectively.

    Following example downloads data for ligand STI (a.k.a. Gleevec and
    Imatinib) and calculates RMSD between model (X-ray structure 1IEP) and
    ideal (energy minimized) coordinate sets:

    .. ipython:: python

       from caviar.prody_parser import *
       ligand_data = fetchPDBLigand('STI')
       ligand_data['model_coordinates_db_code']
       ligand_model = ligand_data['model']
       ligand_ideal = ligand_data['ideal']
       transformation = superpose(ligand_ideal.noh, ligand_model.noh)
       calcRMSD(ligand_ideal.noh, ligand_model.noh)"""

    if not isinstance(cci, str):
        raise TypeError('cci must be a string')
    if isfile(cci):
        inp = openFile(cci)
        xml = inp.read()
        inp.close()
        url = None
        path = cci
        cci = splitext(splitext(split(cci)[1])[0])[0].upper()
    elif len(cci) > 4 or not cci.isalnum():
        raise ValueError('cci must be 3-letters long and alphanumeric or '
                         'a valid filename')
    else:
        xml = None
        cci = cci.upper()
        if SETTINGS.get('ligand_xml_save'):
            folder = join(getPackagePath(), 'pdbligands')
            if not isdir(folder):
                makePath(folder)
            xmlgz = path = join(folder, cci + '.xml.gz')
            if isfile(xmlgz):
                with openFile(xmlgz) as inp:
                    xml = inp.read()
        else:
            folder = None
            path = None
        url = ('http://files.rcsb.org/ligands/download/{0}'
               '.xml'.format(cci.upper()))
        if not xml:
            try:
                inp = openURL(url)
            except IOError:
                raise IOError('XML file for ligand {0} is not found '
                              'online'.format(cci))
            else:
                xml = inp.read()
                if PY3K:
                    xml = xml.decode()
                inp.close()
            if filename:
                out = openFile(filename, mode='w', folder=folder)
                out.write(xml)
                out.close()
            if SETTINGS.get('ligand_xml_save'):
                with openFile(xmlgz, 'w') as out:
                    out.write(xml)

    import xml.etree.cElementTree as ET

    root = ET.XML(xml)
    if (root.get('{http://www.w3.org/2001/XMLSchema-instance}'
                 'schemaLocation') !=
            'http://pdbml.pdb.org/schema/pdbx-v40.xsd pdbx-v40.xsd'):
        LOGGER.warn('XML is not in PDBx/PDBML v 4.0 format, resulting '
                    'dictionary may not contain all data fields')
    ns = root.tag[:root.tag.rfind('}') + 1]
    len_ns = len(ns)
    dict_ = {'url': url, 'path': path}

    for child in list(root.find(ns + 'chem_compCategory')[0]):
        tag = child.tag[len_ns:]
        if tag.startswith('pdbx_'):
            tag = tag[5:]
        dict_[tag] = child.text
    dict_['formula_weight'] = float(dict_.get('formula_weight'))

    identifiers_and_descriptors = []
    results = root.find(ns + 'pdbx_chem_comp_identifierCategory')
    if results:
        identifiers_and_descriptors.extend(results)
    results = root.find(ns + 'pdbx_chem_comp_descriptorCategory')
    if results:
        identifiers_and_descriptors.extend(results)
    for child in identifiers_and_descriptors:
        program = child.get('program').replace(' ', '_')
        type_ = child.get('type').replace(' ', '_')
        dict_[program + '_' + type_] = child[0].text
        dict_[program + '_version'] = child.get('program_version')

    dict_['audits'] = [(audit.get('action_type'), audit.get('date'))
                       for audit in
                       list(root.find(ns + 'pdbx_chem_comp_auditCategory'))]

    atoms = list(root.find(ns + 'chem_comp_atomCategory'))
    n_atoms = len(atoms)
    ideal_coords = np.zeros((n_atoms, 3))
    model_coords = np.zeros((n_atoms, 3))

    atomnames = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['name'].dtype)
    elements = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['element'].dtype)
    resnames = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['resname'].dtype)
    charges = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['charge'].dtype)

    resnums = np.ones(n_atoms, dtype=ATOMIC_FIELDS['resnum'].dtype)

    alternate_atomnames = np.zeros(n_atoms,
                                   dtype=ATOMIC_FIELDS['name'].dtype)
    leaving_atom_flags = np.zeros(n_atoms, bool)
    aromatic_flags = np.zeros(n_atoms, bool)
    stereo_configs = np.zeros(n_atoms, bool)
    ordinals = np.zeros(n_atoms, int)

    name2index = {}

    for i, atom in enumerate(atoms):
        data = dict([(child.tag[len_ns:], child.text)
                     for child in list(atom)])

        name = data.get('pdbx_component_atom_id', 'X')
        name2index[name] = i
        atomnames[i] = name
        elements[i] = data.get('type_symbol', 'X')
        resnames[i] = data.get('pdbx_component_comp_id', 'UNK')
        charges[i] = float(data.get('charge', 0))
        alternate_atomnames[i] = data.get('alt_atom_id', 'X')
        leaving_atom_flags[i] = data.get('pdbx_leaving_atom_flag') == 'Y'
        aromatic_flags[i] = data.get('pdbx_aromatic_flag') == 'Y'
        stereo_configs[i] = data.get('pdbx_stereo_config') == 'Y'
        ordinals[i] = int(data.get('pdbx_ordinal', 0))

        model_coords[i, 0] = float(data.get('model_Cartn_x', 0))
        model_coords[i, 1] = float(data.get('model_Cartn_y', 0))
        model_coords[i, 2] = float(data.get('model_Cartn_z', 0))
        ideal_coords[i, 0] = float(data.get('pdbx_model_Cartn_x_ideal', 0))
        ideal_coords[i, 1] = float(data.get('pdbx_model_Cartn_y_ideal', 0))
        ideal_coords[i, 2] = float(data.get('pdbx_model_Cartn_z_ideal', 0))

    pdbid = dict_.get('model_coordinates_db_code')
    if pdbid:
        model = AtomGroup(cci + ' model ({0})'.format(pdbid))
    else:
        model = AtomGroup(cci + ' model')
    model.setCoords(model_coords)
    model.setNames(atomnames)
    model.setResnames(resnames)
    model.setResnums(resnums)
    model.setElements(elements)
    model.setCharges(charges)
    model.setFlags('leaving_atom_flags', leaving_atom_flags)
    model.setFlags('aromatic_flags', aromatic_flags)
    model.setFlags('stereo_configs', stereo_configs)
    model.setData('ordinals', ordinals)
    model.setData('alternate_atomnames', alternate_atomnames)
    dict_['model'] = model
    ideal = model.copy()
    ideal.setTitle(cci + ' ideal')
    ideal.setCoords(ideal_coords)
    dict_['ideal'] = ideal

    bonds = []
    warned = set()
    for bond in list(root.find(ns + 'chem_comp_bondCategory') or bonds):
        name_1 = bond.get('atom_id_1')
        name_2 = bond.get('atom_id_2')
        try:
            bonds.append((name2index[name_1], name2index[name_2]))
        except KeyError:
            if name_1 not in warned and name_1 not in name2index:
                warned.add(name_1)
                LOGGER.warn('{0} specified in {1} bond category is not '
                            'a valid atom name.'.format(repr(name_1), cci))
            if name_2 not in warned and name_2 not in name2index:
                warned.add(name_2)
                LOGGER.warn('{0} specified in {1} bond category is not '
                            'a valid atom name.'.format(repr(name_2), cci))
    if bonds:
        bonds = np.array(bonds, int)
        model.setBonds(bonds)
        ideal.setBonds(bonds)
    return dict_
def _getPolymers(lines):
    """Returns list of polymers (macromolecules)."""

    pdbid = lines['pdbid']
    polymers = dict()
    for i, line in lines['SEQRES']:
        ch = line[11]
        poly = polymers.get(ch, Polymer(ch))
        polymers[ch] = poly
        poly.sequence += ''.join(getSequence(line[19:].split()))

    for i, line in lines['DBREF ']:
        i += 1
        ch = line[12]
        if ch == ' ':
            if not len(polymers) == 1:
                LOGGER.warn('DBREF chain identifier is not specified '
                            '({0}:{1})'.format(pdbid, i))
                continue
            else:
                ch = list(polymers)[0]

        dbabbr = line[26:32].strip()
        dbref = DBRef()
        dbref.dbabbr = dbabbr
        dbref.database = _PDB_DBREF.get(dbabbr, 'Unknown')
        dbref.accession = line[33:41].strip()
        dbref.idcode = line[42:54].strip()

        try:
            first = int(line[14:18])
        except:
            LOGGER.warn('DBREF for chain {2}: failed to parse '
                        'initial sequence number of the PDB sequence '
                        '({0}:{1})'.format(pdbid, i, ch))
        try:
            last = int(line[20:24])
        except:
            LOGGER.warn('DBREF for chain {2}: failed to parse '
                        'ending sequence number of the PDB sequence '
                        '({0}:{1})'.format(pdbid, i, ch))
        try:
            dbref.first = (first, line[18], int(line[56:60]))
        except:
            LOGGER.warn('DBREF for chain {2}: failed to parse '
                        'initial sequence number of the database sequence '
                        '({0}:{1})'.format(pdbid, i, ch))
        try:
            dbref.last = (last, line[24].strip(), int(line[62:67]))
        except:
            LOGGER.warn('DBREF for chain {2}: failed to parse '
                        'ending sequence number of the database sequence '
                        '({0}:{1})'.format(pdbid, i, ch))

        poly = polymers.get(ch, Polymer(ch))
        polymers[ch] = poly
        poly.dbrefs.append(dbref)

    dbref1 = lines['DBREF1']
    dbref2 = lines['DBREF2']
    if len(dbref1) != len(dbref2):
        LOGGER.warn('DBREF1 and DBREF2 records are not complete')
        dbref12 = []
    else:
        dbref12 = zip(dbref1, dbref2)  # PY3K: OK

    for dbref1, dbref2 in dbref12:
        i, line = dbref1
        i += 1
        ch = line[12]

        dbabbr = line[26:32].strip()
        dbref = DBRef()
        dbref.dbabbr = dbabbr
        dbref.database = _PDB_DBREF.get(dbabbr, 'Unknown')
        dbref.idcode = line[47:67].strip()

        try:
            first = int(line[14:18])
        except:
            LOGGER.warn('DBREF1 for chain {2}: failed to parse '
                        'initial sequence number of the PDB sequence '
                        '({0}:{1})'.format(pdbid, i, ch))
        try:
            last = int(line[20:24])
        except:
            LOGGER.warn('DBREF1 for chain {2}: failed to parse '
                        'ending sequence number of the PDB sequence '
                        '({0}:{1})'.format(pdbid, i, ch))

        i, line = dbref2
        i += 1
        if line[12] == ' ':
            LOGGER.warn('DBREF2 chain identifier is not specified '
                        '({0}:{1})'.format(pdbid, ch))
        elif line[12] != ch:
            LOGGER.warn('DBREF1 and DBREF2 chain id mismatch'
                        '({0}:{1})'.format(pdbid, ch))

        dbref.accession = line[18:40].strip()
        try:
            dbref.first = (first, line[18].strip(), int(line[45:55]))
        except:
            LOGGER.warn('DBREF2 for chain {2}: failed to parse '
                        'initial sequence number of the database sequence '
                        '({0}:{1})'.format(pdbid, i, ch))
        try:
            dbref.last = (last, line[24].strip(), int(line[57:67]))
        except:
            LOGGER.warn('DBREF2 for chain {2}: failed to parse '
                        'ending sequence number of the database sequence '
                        '({0}:{1})'.format(pdbid, i, ch))

        poly = polymers.get(ch, Polymer(ch))
        polymers[ch] = poly
        poly.dbrefs.append(dbref)

    for poly in polymers.values():  # PY3K: OK
        resnum = []
        for dbref in poly.dbrefs:
            dbabbr = dbref.dbabbr
            if dbabbr == 'PDB':
                if not (pdbid == dbref.accession == dbref.idcode):
                    LOGGER.warn('DBREF for chain {2} refers to PDB '
                                'entry {3} ({0}:{1})'
                                .format(pdbid, i, ch, dbref.accession))
            else:
                if pdbid == dbref.accession or pdbid == dbref.idcode:
                    LOGGER.warn('DBREF for chain {2} is {3}, '
                                'expected PDB ({0}:{1})'
                                .format(pdbid, i, ch, dbabbr))
                    dbref.database = 'PDB'
            resnum.append((dbref.first[0], dbref.last[0]))
        resnum.sort()
        last = -10000
        for first, temp in resnum:
            if first <= last:
                LOGGER.warn('DBREF records overlap for chain {0} ({1})'
                            .format(poly.chid, pdbid))
            last = temp

    for i, line in lines['MODRES']:
        ch = line[16]
        if ch == ' ':
            if not len(polymers) == 1:
                LOGGER.warn('MODRES chain identifier is not specified '
                            '({0}:{1})'.format(pdbid, i))
                continue
            else:
                ch = list(polymers)[0]
        poly = polymers.get(ch, Polymer(ch))
        polymers[ch] = poly
        if poly.modified is None:
            poly.modified = []
        poly.modified.append((line[12:15].strip(),
                              line[18:22].strip() + line[22].strip(),
                              line[24:27].strip(), line[29:70].strip()))

    for i, line in lines['SEQADV']:
        i += 1
        ch = line[16]
        if ch == ' ':
            if not len(polymers) == 1:
                LOGGER.warn('SEQADV chain identifier is not specified '
                            '({0}:{1})'.format(pdbid, i))
                continue
            else:
                ch = list(polymers)[0]
        poly = polymers.get(ch, Polymer(ch))
        polymers[ch] = poly
        dbabbr = line[24:28].strip()
        resname = line[12:15].strip()
        try:
            resnum = int(line[18:22].strip())
        except:
            #LOGGER.warn('SEQADV for chain {2}: failed to parse PDB '
            #            'sequence number ({0}:{1})'.format(pdbid, i, ch))
            continue
        icode = line[22].strip()
        try:
            dbnum = int(line[43:48].strip())
        except:
            #LOGGER.warn('SEQADV for chain {2}: failed to parse database '
            #            'sequence number ({0}:{1})'.format(pdbid, i, ch))
            continue

        comment = line[49:70].strip()
        match = False
        for dbref in poly.dbrefs:
            if not dbref.first[0] <= resnum <= dbref.last[0]:
                continue
            match = True
            if dbref.dbabbr != dbabbr:
                LOGGER.warn('SEQADV for chain {2}: reference database '
                            'mismatch, expected {3} parsed {4} '
                            '({0}:{1})'.format(pdbid, i, ch,
                                               repr(dbref.dbabbr),
                                               repr(dbabbr)))
                continue
            dbacc = line[29:38].strip()
            if dbref.accession[:9] != dbacc[:9]:
                LOGGER.warn('SEQADV for chain {2}: accession code '
                            'mismatch, expected {3} parsed {4} '
                            '({0}:{1})'.format(pdbid, i, ch,
                                               repr(dbref.accession),
                                               repr(dbacc)))
                continue
            dbref.diff.append((resname, resnum, icode, dbnum, dbnum,
                               comment))
        if not match:
            LOGGER.warn('SEQADV for chain {2}: database sequence '
                        'reference not found ({0}:{1})'
                        .format(pdbid, i, ch))
            continue

    string = ' '.join([line[10:].strip() for i, line in lines['COMPND']])
    if string.startswith('MOL_ID'):
        dict_ = {}
        for molecule in string[6:].split('MOL_ID'):
            dict_.clear()
            for token in molecule.split(';'):
                token = token.strip()
                if not token:
                    continue
                items = token.split(':', 1)
                if len(items) == 2:
                    key, value = items
                    dict_[key.strip()] = value.strip()

            chains = dict_.pop('CHAIN', '').strip()

            if not chains:
                continue
            for ch in chains.split(','):
                ch = ch.strip()
                poly = polymers.get(ch, Polymer(ch))
                polymers[ch] = poly
                poly.name = dict_.get('MOLECULE', '')
                poly.fragment = dict_.get('FRAGMENT', '')
                poly.comments = dict_.get('OTHER_DETAILS', '')
                val = dict_.get('SYNONYM', '')
                poly.synonyms = [s.strip()
                                 for s in val.split(',')] if val else []
                val = dict_.get('EC', '')
                poly.ec = [s.strip()
                           for s in val.split(',')] if val else []
                poly.engineered = dict_.get('ENGINEERED', '') == 'YES'
                poly.mutation = dict_.get('MUTATION', '') == 'YES'

    return list(polymers.values())
def fetchPDBviaHTTP(*pdb, **kwargs):
    """Retrieve PDB file(s) for specified *pdb* identifier(s) and return
    path(s).  Downloaded files will be stored in local PDB folder, if one
    is set using :meth:`.pathPDBFolder`, and copied into *folder*, if
    specified by the user.  If no destination folder is specified, files
    will be saved in the current working directory.  If *compressed* is
    **False**, decompressed files will be copied into *folder*."""

    if kwargs.get('check', True):
        identifiers = checkIdentifiers(*pdb)
    else:
        identifiers = list(pdb)

    output_folder = kwargs.pop('folder', None)
    compressed = bool(kwargs.pop('compressed', True))

    extension = '.pdb'
    local_folder = pathPDBFolder()
    if local_folder:
        local_folder, is_divided = local_folder
        if is_divided:
            getPath = lambda pdb: join(makePath(join(local_folder,
                                                     pdb[1:3])),
                                       'pdb' + pdb + '.pdb.gz')
        else:
            getPath = lambda pdb: join(local_folder, pdb + '.pdb.gz')
        if output_folder is None:
            second = lambda filename, pdb: filename
        else:
            if compressed:
                second = lambda filename, pdb: (copyFile(filename,
                                                         join(output_folder,
                                                              pdb + extension +
                                                              '.gz')))
            else:
                second = lambda filename, pdb: gunzip(filename,
                                                      join(output_folder,
                                                           pdb + extension))
    else:
        if output_folder is None:
            output_folder = getcwd()
        if compressed:
            getPath = lambda pdb: join(output_folder,
                                       pdb + extension + '.gz')
            second = lambda filename, pdb: filename
        else:
            getPath = lambda pdb: join(output_folder, pdb + extension)
            second = lambda filename, pdb: gunzip(getPath(pdb), getPath(pdb))

    getURL = WWPDB_HTTP_URL[wwPDBServer() or 'us']

    success = 0
    failure = 0
    filenames = []
    for pdb in identifiers:
        if pdb is None:
            filenames.append(None)
            continue
        try:
            handle = openURL(getURL(pdb))
        except Exception as err:
            LOGGER.warn('{0} download failed ({1}).'.format(pdb, str(err)))
            failure += 1
            filenames.append(None)
        else:
            data = handle.read()
            if len(data):
                filename = getPath(pdb)

                with open(filename, 'w+b') as pdbfile:
                    pdbfile.write(data)

                filename = normpath(relpath(second(filename, pdb)))
                LOGGER.debug('{0} downloaded ({1})'
                             .format(pdb, sympath(filename)))
                success += 1
                filenames.append(filename)
            else:
                LOGGER.warn('{0} download failed, reason unknown.'
                            .format(pdb))
                failure += 1
                filenames.append(None)
    LOGGER.debug('PDB download via HTTP completed ({0} downloaded, '
                 '{1} failed).'.format(success, failure))
    if len(identifiers) == 1:
        return filenames[0]
    else:
        return filenames
def buildPDBEnsemble(atomics, ref=None, title='Unknown', labels=None,
                     unmapped=None, **kwargs):
    """Builds a :class:`.PDBEnsemble` from a given reference structure and
    a list of structures (:class:`.Atomic` instances).  Note that the
    reference should be included in the list as well.

    :arg atomics: a list of :class:`.Atomic` instances
    :type atomics: list

    :arg ref: reference structure or the index to the reference in
        *atomics*.  If **None**, then the first item in *atomics* will be
        considered as the reference.  If it is a :class:`.PDBEnsemble`
        instance, then *atomics* will be appended to the existing ensemble.
        Default is **None**
    :type ref: int, :class:`.Chain`, :class:`.Selection`, or
        :class:`.AtomGroup`

    :arg title: the title of the ensemble
    :type title: str

    :arg labels: labels of the conformations
    :type labels: list

    :arg degeneracy: whether only the active coordinate set (**True**) or
        all the coordinate sets (**False**) of each structure should be
        added to the ensemble. Default is **True**
    :type degeneracy: bool

    :arg occupancy: minimal occupancy of columns (range from 0 to 1).
        Columns whose occupancy is below this value will be trimmed
    :type occupancy: float

    :arg unmapped: labels of *atomics* that cannot be included in the
        ensemble. This is an output argument
    :type unmapped: list

    :arg subset: a subset for selecting particular atoms from the input
        structures. Default is ``"all"``
    :type subset: str

    :arg superpose: if set to ``'iter'``, :func:`.PDBEnsemble.iterpose`
        will be used to superpose the structures, otherwise conformations
        will be superposed with respect to the reference specified by
        *ref* unless set to ``False``. Default is ``'iter'``
    :type superpose: str, bool
    """

    occupancy = kwargs.pop('occupancy', None)
    degeneracy = kwargs.pop('degeneracy', True)
    subset = str(kwargs.get('subset', 'all')).lower()
    superpose = kwargs.pop('superpose', 'iter')
    superpose = kwargs.pop('iterpose', superpose)
    debug = kwargs.pop('debug', {})

    if 'mapping_func' in kwargs:
        raise DeprecationWarning('mapping_func is deprecated. Please see '
                                 'release notes for more details: '
                                 'http://prody.csb.pitt.edu/manual/release/'
                                 'v1.11_series.html')
    start = time.time()

    if not isListLike(atomics):
        raise TypeError('atomics should be list-like')

    if len(atomics) == 1 and degeneracy is True:
        raise ValueError('atomics should have at least two items')

    if labels is not None:
        if len(labels) != len(atomics):
            raise TypeError('labels and atomics must have the same '
                            'lengths')
    else:
        labels = []
        for atoms in atomics:
            if atoms is None:
                labels.append(None)
            else:
                labels.append(atoms.getTitle())

    if ref is None:
        target = atomics[0]
    elif isinstance(ref, Integral):
        target = atomics[ref]
    elif isinstance(ref, PDBEnsemble):
        target = ref._atoms
    else:
        target = ref

    # initialize a PDBEnsemble with reference atoms and coordinates
    isrefset = False
    if isinstance(ref, PDBEnsemble):
        ensemble = ref
    else:
        # select the subset of the reference beforehand for the sake of
        # efficiency
        if subset != 'all':
            target = target.select(subset)
        ensemble = PDBEnsemble(title)
        if isinstance(target, Atomic):
            ensemble.setAtoms(target)
            ensemble.setCoords(target.getCoords())
            isrefset = True
        else:
            ensemble._n_atoms = len(target)
            isrefset = False

    # build the ensemble
    if unmapped is None:
        unmapped = []

    LOGGER.progress('Building the ensemble...', len(atomics),
                    '_prody_buildPDBEnsemble')
    for i, atoms in enumerate(atomics):
        if atoms is None:
            unmapped.append(labels[i])
            continue

        LOGGER.update(i, 'Mapping %s to the reference...'
                      % atoms.getTitle(),
                      label='_prody_buildPDBEnsemble')
        try:
            atoms.getHierView()
        except AttributeError:
            raise TypeError('atomics must be a list of instances having '
                            'the access to getHierView')

        if subset != 'all':
            atoms = atoms.select(subset)

        # find the mapping of chains of atoms to those of target
        debug[labels[i]] = {}
        atommaps = alignChains(atoms, target, debug=debug[labels[i]],
                               **kwargs)

        if len(atommaps) == 0:
            unmapped.append(labels[i])
            continue

        # add the atommaps to the ensemble
        for atommap in atommaps:
            lbl = pystr(labels[i])
            if len(atommaps) > 1:
                chids = np.unique(atommap.getChids())
                strchids = ''.join(chids)
                lbl += '_%s' % strchids
            ensemble.addCoordset(atommap,
                                 weights=atommap.getFlags('mapped'),
                                 label=lbl, degeneracy=degeneracy)

            if not isrefset:
                ensemble.setCoords(atommap.getCoords())
                isrefset = True

    LOGGER.finish()

    if occupancy is not None:
        ensemble = trimPDBEnsemble(ensemble, occupancy=occupancy)

    if superpose == 'iter':
        ensemble.iterpose()
    elif superpose is not False:
        ensemble.superpose()

    LOGGER.info('Ensemble ({0} conformations) was built in {1:.2f}s.'
                .format(ensemble.numConfs(), time.time() - start))

    if unmapped:
        LOGGER.warn('{0} structures cannot be mapped.'
                    .format(len(unmapped)))
    return ensemble
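# Usage sketch; the structures below are illustrative:
#
#     structures = parsePDB(['1p38', '1zz2', '1r39'])   # list of AtomGroups
#     ens = buildPDBEnsemble(structures, title='p38', subset='ca')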
def assignSecstr(header, atoms, coil=False):
    """Assign secondary structure from *header* dictionary to *atoms*.
    *header* must be a dictionary parsed using the :func:`.parsePDB`.
    *atoms* may be an instance of :class:`.AtomGroup`, :class:`.Selection`,
    :class:`.Chain` or :class:`.Residue`.  ProDy can be configured to
    automatically parse and assign secondary structure information using
    ``confProDy(auto_secondary=True)`` command.  See also :func:`.confProDy`
    function.

    The Dictionary of Protein Secondary Structure, in short DSSP, type
    single letter code assignments are used:

      * **G** = 3-turn helix (3-10 helix). Min length 3 residues.
      * **H** = 4-turn helix (alpha helix). Min length 4 residues.
      * **I** = 5-turn helix (pi helix). Min length 5 residues.
      * **T** = hydrogen bonded turn (3, 4 or 5 turn)
      * **E** = extended strand in parallel and/or anti-parallel beta-sheet
        conformation. Min length 2 residues.
      * **B** = residue in isolated beta-bridge (single pair beta-sheet
        hydrogen bond formation)
      * **S** = bend (the only non-hydrogen-bond based assignment).
      * **C** = residues not in one of above conformations.

    See http://en.wikipedia.org/wiki/Protein_secondary_structure#The_DSSP_code
    for more details.

    Following PDB helix classes are omitted:

      * Right-handed omega (class number 2)
      * Right-handed gamma (4)
      * Left-handed alpha (6)
      * Left-handed omega (7)
      * Left-handed gamma (8)
      * 2 - 7 ribbon/helix (9)
      * Polyproline (10)

    Secondary structures are assigned to all atoms in a residue.  Amino
    acid residues without any secondary structure assignments in the
    header section will be assigned coil (C) conformation.  This can be
    prevented by passing ``coil=False`` argument."""

    if not isinstance(header, dict):
        raise TypeError('header must be a dictionary')
    helix = header.get('helix', {})
    sheet = header.get('sheet', {})
    if len(helix) == 0 and len(sheet) == 0:
        LOGGER.warn('header does not contain secondary structure data')
        return atoms

    ssa = atoms.getSecstrs()
    if ssa is None:
        if isinstance(atoms, AtomGroup):
            ag = atoms
        else:
            ag = atoms.getAtomGroup()
        ag.setSecstrs(np.zeros(ag.numAtoms(),
                               ATOMIC_FIELDS['secondary'].dtype))
        ag.setSecids(np.zeros(ag.numAtoms(),
                              ATOMIC_FIELDS['secid'].dtype))
        ag.setSecclasses(np.zeros(ag.numAtoms(),
                                  ATOMIC_FIELDS['secclass'].dtype))
        ag.setSecindices(np.zeros(ag.numAtoms(),
                                  ATOMIC_FIELDS['secindex'].dtype))

    prot = atoms.select('protein')
    if prot is not None:
        prot.setSecstrs('C')

    hierview = atoms.getHierView()
    count = 0
    getResidue = hierview.getResidue
    for key, value in helix.items():  # PY3K: OK
        res = getResidue(*key)
        if res is None:
            continue
        res.setSecids(value[2])
        res.setSecclasses(value[0])
        res.setSecindices(value[1])
        res.setSecstrs(mapHelix[value[0]])
        count += 1
    for key, value in sheet.items():  # PY3K: OK
        res = getResidue(*key)
        if res is None:
            continue
        res.setSecids(value[2])
        res.setSecclasses(value[0])
        res.setSecindices(value[1])
        res.setSecstrs('E')
        count += 1

    LOGGER.info('Secondary structures were assigned to {0} residues.'
                .format(count))
    return atoms
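# Usage sketch: the header from parsePDB(..., header=True) carries the
# 'helix' and 'sheet' dictionaries consumed above; the ID is illustrative:
#
#     atoms, header = parsePDB('1ubi', header=True)
#     atoms = assignSecstr(header, atoms)
#     atoms.ca.getSecstrs()       # per-residue DSSP-style codes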