Example #1
    def __or__(self, other):

        if self is other:
            return self

        try:
            ag = other.getAtomGroup()
        except AttributeError:
            raise TypeError('other must be an AtomPointer')

        if self._ag != ag:
            raise ValueError('both selections must be from the same AtomGroup')

        acsi = self.getACSIndex()
        if acsi != other.getACSIndex():
            LOGGER.warn('Active coordinate set indices do not match; '
                        'index 0 will be used for the result.')
            acsi = 0

        indices = unique(concatenate(
            (self._getIndices(), other._getIndices())))
        if indices[-1] == atommap.DUMMY:
            indices = indices[:-1]
        return Selection(self._ag,
                         indices,
                         '({0}) or ({1})'.format(self.getSelstr(),
                                                 other.getSelstr()),
                         acsi,
                         unique=True)
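
A minimal usage sketch of this operator (added for illustration, not part of the original source), assuming a ProDy-style ``parsePDB`` and a hypothetical local file ``1xyz.pdb``; adjust the import to the enclosing package:

from prody import parsePDB   # assumption: ProDy-style package layout

structure = parsePDB('1xyz.pdb')            # hypothetical file
backbone = structure.select('backbone')
hetero = structure.select('hetero and not water')
merged = backbone | hetero                  # invokes the __or__ shown above
print(merged.getSelstr())                   # '(backbone) or (hetero and not water)'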
Example #2
def calcTransformation(mobile, target, weights=None):
    """Returns a :class:`Transformation` instance which, when applied to the
    atoms in *mobile*, minimizes the weighted RMSD between *mobile* and
    *target*.  *mobile* and *target* may be NumPy coordinate arrays, or
    :class:`.Atomic` instances, e.g. :class:`.AtomGroup`, :class:`.Chain`,
    or :class:`.Selection`."""

    if not isinstance(mobile, np.ndarray):
        try:
            mob = mobile._getCoords()
        except AttributeError:
            raise TypeError('mobile must be a numpy array or an object '
                            'with getCoords method')
    else:
        mob = mobile
    if not isinstance(target, np.ndarray):
        try:
            tar = target._getCoords()
        except AttributeError:
            raise TypeError('target must be a numpy array or an object '
                            'with getCoords method')
    else:
        tar = target

    if mob.shape != tar.shape:
        raise ValueError('mobile and target coordinate arrays '
                         'must have the same shape')

    if mob.shape[1] != 3:
        raise ValueError('mobile and target must be coordinate arrays '
                         'with shape (n_atoms, 3)')

    if weights is None:
        if isinstance(mobile, AtomMap):
            LOGGER.warn(
                'mobile is an AtomMap instance; consider assigning '
                'weights=mobile.getFlags("mapped") if there are dummy atoms in mobile')

        if isinstance(target, AtomMap):
            LOGGER.warn(
                'target is an AtomMap instance; consider assigning '
                'weights=target.getFlags("mapped") if there are dummy atoms in target')

    if weights is not None:
        weights = checkWeights(weights, mob.shape[0])

    return Transformation(*getTransformation(mob, tar, weights))
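
A hedged usage sketch (illustrative only, not from the source), assuming ProDy-style imports and two hypothetical PDB files whose alpha-carbon selections match one to one:

from prody import parsePDB, calcTransformation, calcRMSD   # assumed imports

mobile = parsePDB('mobile.pdb')    # hypothetical file names
target = parsePDB('target.pdb')
t = calcTransformation(mobile.calpha, target.calpha)   # fit on CA atoms
t.apply(mobile)                                        # move mobile onto target
print(calcRMSD(mobile.calpha, target.calpha))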
Example #3
def pathPDBFolder(folder=None, divided=False):
    """Returns or specify local PDB folder for storing PDB files downloaded from
    `wwPDB <http://www.wwpdb.org/>`_ servers.  Files stored in this folder can
    be accessed via :func:`.fetchPDB` from any working directory.  To release
    the current folder, pass an invalid path, e.g. ``folder=''``.

    If *divided* is **True**, the divided folder structure of wwPDB servers
    will be assumed when reading from and writing to the local folder.  For
    example, a structure with identifier **1XYZ** will be present as
    :file:`pdblocalfolder/xy/pdb1xyz.pdb.gz`.

    If *divided* is **False**, a plain folder structure will be expected and
    adopted when saving files.  For example, the same structure will be
    present as :file:`pdblocalfolder/1xyz.pdb.gz`.

    Finally, in either case, lower case letters will be used and compressed
    files will be stored."""

    if folder is None:
        folder = SETTINGS.get('pdb_local_folder')
        if folder:
            if isdir(folder):
                return folder, SETTINGS.get('pdb_local_divided', True)
            else:
                LOGGER.warn('PDB local folder {0} is not accessible.'.format(
                    repr(folder)))
    else:
        if isdir(folder):
            folder = abspath(folder)
            LOGGER.info('Local PDB folder is set: {0}'.format(repr(folder)))
            if divided:
                LOGGER.info('wwPDB divided folder structure will be assumed.')
            else:
                LOGGER.info('A plain folder structure will be assumed.')
            SETTINGS['pdb_local_folder'] = folder
            SETTINGS['pdb_local_divided'] = bool(divided)
            SETTINGS.save()
        else:
            current = SETTINGS.pop('pdb_local_folder')
            if current:
                LOGGER.info('PDB folder {0} is released.'.format(
                    repr(current)))
                SETTINGS.pop('pdb_local_divided')
                SETTINGS.save()
            else:
                raise IOError('{0} is not a valid path.'.format(repr(folder)))
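
A short usage sketch (not part of the original source); the cache path below is an assumption:

pathPDBFolder('/tmp/pdb_cache')       # adopt a plain local folder
folder, divided = pathPDBFolder()     # query the current setting (a (folder, divided) tuple when one is set)
pathPDBFolder('')                     # release the folder again (invalid path)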
Example #4
def checkIdentifiers(*pdb):
    """Check whether *pdb* identifiers are valid, and replace invalid ones
    with **None** in place."""

    identifiers = []
    append = identifiers.append
    for pid in pdb:
        try:
            pid = pid.strip().lower()
        except AttributeError:
            LOGGER.warn('{0} is not a valid identifier.'.format(repr(pid)))
            append(None)
        else:
            if not (len(pid) == 4 and pid.isalnum()):
                LOGGER.warn('{0} is not a valid identifier.'
                            .format(repr(pid)))
                append(None)
            else:
                append(pid)
    return identifiers
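
A small sketch of the expected behavior (illustrative, based only on the code above):

print(checkIdentifiers('1XYZ', 'not-an-id', 42))
# expected: ['1xyz', None, None], with a warning logged for each invalid entry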
Example #5
def addNonstdAminoacid(resname, *properties):
    """Add non-standard amino acid *resname* with *properties* selected from:

      * {props}

    .. ipython:: python

       addNonstdAminoacid('PTR', 'acidic', 'aromatic', 'cyclic', 'large',
       'polar', 'surface')

    The default set of non-standard amino acids can be restored as follows:

    .. ipython:: python

       flagDefinition(reset='nonstdaa')"""

    resname = str(resname)
    if len(resname) > 4:
        LOGGER.warn('Residue name {0} is unusually long.'.format(
            repr(resname)))
    propset = set(properties)
    for cat, val in CATEGORIES.items():
        intersection = val.intersection(propset)
        if intersection:
            if len(intersection) > 1:
                raise ValueError('amino acid properties {0} cannot be '
                                 'present together'.format(', '.join(
                                     [repr(prp) for prp in intersection])))
            for prop in intersection:
                propset.remove(prop)
    if propset:
        raise ValueError('amino acid property {0} is not valid'.format(
            repr(propset.pop())))

    nonstd = SETTINGS.get(NONSTANDARD_KEY, NONSTANDARD)
    nonstd[resname] = set(properties)
    updateNonstandard(nonstd)
Example #6
def fetchPDBviaFTP(*pdb, **kwargs):
    """Retrieve PDB (default), PDBML, mmCIF, or EMD file(s) for specified *pdb*
    identifier(s) and return path(s).  Downloaded files will be stored in
    local PDB folder, if one is set using :meth:`.pathPDBFolder`, and copied
    into *folder*, if specified by the user.  If no destination folder is
    specified, files will be saved in the current working directory.  If
    *compressed* is **False**, decompressed files will be copied into
    *folder*.  The *format* keyword argument can be used to retrieve
    `PDBML <http://pdbml.pdb.org/>`_, `mmCIF <http://mmcif.pdb.org/>`_,
    and `EMD <ftp://ftp.wwpdb.org/pub/emdb/doc/Map-format/current/EMDB_map_format.pdf>`_
    files: ``format='cif'`` will fetch an mmCIF file, ``format='emd'`` will
    fetch an EMD file, and ``format='xml'`` will fetch a PDBML file.  If only
    the PDBML header file is desired, passing ``noatom=True`` will do the job."""

    if kwargs.get('check', True):
        identifiers = checkIdentifiers(*pdb)
    else:
        identifiers = list(pdb)

    output_folder = kwargs.pop('folder', None)
    compressed = bool(kwargs.pop('compressed', True))
    format = str(kwargs.pop('format', 'pdb')).lower()
    noatom = bool(kwargs.pop('noatom', False))

    if format == 'pdb':
        ftp_divided = 'pdb/data/structures/divided/pdb'
        ftp_pdbext = '.ent.gz'
        ftp_prefix = 'pdb'
        extension = '.pdb'
    elif format == 'xml':
        if noatom:
            ftp_divided = 'pdb/data/structures/divided/XML-noatom'
            ftp_pdbext = '-noatom.xml.gz'
            extension = '-noatom.xml'
        else:
            ftp_divided = 'pdb/data/structures/divided/XML'
            ftp_pdbext = '.xml.gz'
            extension = '.xml'
        ftp_prefix = ''
    elif format == 'cif':
        ftp_divided = 'pdb/data/structures/divided/mmCIF'
        ftp_pdbext = '.cif.gz'
        ftp_prefix = ''
        extension = '.cif'
    elif format == 'emd' or format == 'map':
        ftp_divided = 'emdb/structures'
        ftp_pdbext = '.map.gz'
        ftp_prefix = 'emd_'
        extension = '.map'
    else:
        raise ValueError(repr(format) + ' is not a valid format')

    local_folder = pathPDBFolder()

    if format == 'pdb' and local_folder:
        local_folder, is_divided = local_folder
        if is_divided:
            getPath = lambda pdb: join(makePath(join(local_folder, pdb[1:3])),
                                       'pdb' + pdb + '.pdb.gz')
        else:
            getPath = lambda pdb: join(local_folder, pdb + '.pdb.gz')
        if output_folder is None:
            second = lambda filename, pdb: filename
        else:
            if compressed:
                second = lambda filename, pdb: (copyFile(filename,
                            join(output_folder, pdb + extension + '.gz')))
            else:
                second = lambda filename, pdb: gunzip(filename,
                            join(output_folder, pdb + extension))

    else:
        if output_folder is None:
            output_folder = getcwd()
        if compressed:
            getPath = lambda pdb: join(output_folder, pdb + extension + '.gz')
            second = lambda filename, pdb: filename
        else:
            getPath = lambda pdb: join(output_folder, pdb + extension)
            second = lambda filename, pdb: gunzip(getPath(pdb), getPath(pdb))


    ftp_name, ftp_host, ftp_path = WWPDB_FTP_SERVERS[wwPDBServer() or 'us']
    LOGGER.debug('Connecting to wwPDB FTP server {0}.'.format(ftp_name))

    from ftplib import FTP
    try:
        ftp = FTP(ftp_host)
    except Exception as error:
        raise type(error)('FTP connection problem, potential reason: '
                          'no internet connectivity')
    else:
        success = 0
        failure = 0
        filenames = []
        ftp.login('')
        for pdb in identifiers:
            if pdb is None:
                filenames.append(None)
                continue
            data = []
            ftp_fn = ftp_prefix + pdb + ftp_pdbext
            try:
                ftp.cwd(ftp_path)
                ftp.cwd(ftp_divided)
                if format == 'emd':
                    ftp.cwd('EMD-{0}/map'.format(pdb))
                else:
                    ftp.cwd(pdb[1:3])
                ftp.retrbinary('RETR ' + ftp_fn, data.append)
            except Exception as error:
                if ftp_fn in ftp.nlst():
                    LOGGER.warn('{0} download failed ({1}). It is '
                                'possible that you do not have rights to '
                                'download .gz files in the current network.'
                                .format(pdb, str(error)))
                else:
                    LOGGER.info('{0} download failed. {1} does not exist '
                                'on {2}.'.format(ftp_fn, pdb, ftp_host))
                failure += 1
                filenames.append(None)
            else:
                if len(data):
                    filename = getPath(pdb)

                    with open(filename, 'w+b') as pdbfile:
                        write = pdbfile.write
                        for block in data:
                            write(block)

                    filename = normpath(relpath(second(filename, pdb)))
                    LOGGER.debug('{0} downloaded ({1})'
                                 .format(pdb, sympath(filename)))
                    success += 1
                    filenames.append(filename)
                else:
                    LOGGER.warn('{0} download failed, reason unknown.'
                                .format(pdb))
                    failure += 1
                    filenames.append(None)

        ftp.quit()

    LOGGER.debug('PDB download via FTP completed ({0} downloaded, '
                 '{1} failed).'.format(success, failure))
    if len(identifiers) == 1:
        return filenames[0]
    else:
        return filenames
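
A hedged usage sketch (not from the source); the identifiers and destination folder are placeholders, and a working FTP connection is assumed:

paths = fetchPDBviaFTP('1ubi', '2k39', folder='.', compressed=False)   # PDB files
cif_path = fetchPDBviaFTP('1ubi', format='cif')                        # mmCIF instead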
Example #7
def parseImagesFromSTAR(particlesSTAR, **kwargs):
    """
    Parses particle images using data from a STAR file 
    containing information about them.

    :arg particlesSTAR: a filename for a STAR file.
    :type particlesSTAR: str

    :arg block_indices: indices for data blocks containing rows
        corresponding to images of interest.
        The indexing scheme is similar to that for numpy arrays.
        Default behavior is to use all data blocks containing image data.
    :type block_indices: list, :class:`~numpy.ndarray`

    :arg row_indices: indices for rows corresponding to images of interest.
        The indexing scheme is similar to that for numpy arrays.
        row_indices should be a 1D or 2D array-like.
        2D row_indices should contain an entry for each relevant loop.
        If a 1D array-like is given, the same row indices
        will be applied to all loops.
        Default behavior is to use all rows containing image data.
    :type row_indices: list, :class:`~numpy.ndarray`

    :arg particle_indices: indices for particles regardless of STAR structure.
        Default is to take all particles.
        Please note: this acts after block_indices and row_indices.
    :type particle_indices: list, :class:`~numpy.ndarray`

    :arg saveImageArrays: whether to save the numpy array for each image to file.
        Default is False.
    :type saveImageArrays: bool

    :arg saveDirectory: directory where numpy image arrays are saved.
        Default is None, which means save to the current working directory.
    :type saveDirectory: str, None

    :arg rotateImages: whether to apply in-plane translations and rotations using
        provided psi and origin data. Default is True.
    :type rotateImages: bool
    
    """

    try:
        from skimage.transform import rotate
    except ImportError:
        raise ImportError('This function requires scikit-image.')

    block_indices = kwargs.get('block_indices', None)
    # No loop_indices because data blocks about particle images contain 1 loop
    row_indices = kwargs.get('row_indices', None)
    particle_indices = kwargs.get('particle_indices', None)

    saveImageArrays = kwargs.get('saveImageArrays', False)
    saveDirectory = kwargs.get('saveDirectory', None)
    rotateImages = kwargs.get('rotateImages', True)

    try:
        particlesSTAR = parseSTAR(particlesSTAR)
    except:
        raise ValueError('particlesSTAR should be a filename for a STAR file')

    # Check dimensions/contents of particlesSTAR and generate full indices
    dataBlocks = []
    loops = []
    maxLoops = 0
    maxRows = 0
    dataBlock_goodness = []
    for dataBlock in particlesSTAR:

        foundImageField = False
        for loop in dataBlock:
            if ('_image' in loop.fields) or ('_rlnImageName' in loop.fields):
                foundImageField = True
                loops.append(loop)
                if loop.numRows > maxRows:
                    maxRows = loop.numRows
            else:
                dataBlock.pop(int(loop.getTitle().split(' ')[-1]))

        if dataBlock.numLoops > maxLoops:
            maxLoops = dataBlock.numLoops

        if foundImageField:
            dataBlocks.append(dataBlock)
            dataBlock_goodness.append(True)
        else:
            dataBlock_goodness.append(False)

    indices = np.zeros((len(dataBlocks), maxLoops, maxRows, 3), dtype=int)
    i = -1
    for n, dataBlock in enumerate(particlesSTAR):
        if dataBlock_goodness[n]:
            i += 1
            for j, loop in enumerate(dataBlock):
                for k in range(maxRows):
                    if k < loop.numRows:
                        indices[i, j, k] = np.array([n, j, k])
                    else:
                        indices[i, j, k] = np.array([0, 0, 0])

    dataBlocks = np.array(dataBlocks)
    loops = np.array(loops)

    # Convert keyword indices to valid indices if possible
    if block_indices is not None:
        if np.array_equal(dataBlocks, np.array([])):
            raise TypeError(
                'particlesSTAR must have data blocks to use block_indices')

        try:
            block_indices = np.array(block_indices)
        except:
            raise TypeError('block_indices should be array-like')

        if block_indices.ndim != 1:
            raise ValueError(
                'block_indices should be a 1-dimensional array-like')

        for i, index in enumerate(list(reversed(block_indices))):
            try:
                block = particlesSTAR[index]
                if not isinstance(block, StarDataBlock):
                    LOGGER.warn(
                        'There is no block corresponding to block_index {0}. '
                        'This index has been removed.'.format(
                            block_indices.shape[0] - i - 1))
                    block_indices = np.delete(block_indices, i, 0)
            except:
                LOGGER.warn(
                    'There is no block corresponding to block_index {0}. '
                    'This index has been removed.'.format(
                        block_indices.shape[0] - i - 1))
                block_indices = np.delete(block_indices, i, 0)

        if not np.array_equal(block_indices, np.array([])):
            indices = np.concatenate(([
                indices[np.where(indices[:, 0, 0, 0] == item)]
                for item in block_indices
            ]),
                                     axis=0)
        else:
            LOGGER.warn(
                'None of the block_indices corresponded to dataBlocks. '
                'Default block indices corresponding to all dataBlocks '
                'will be used instead.')

    dataBlocks = particlesSTAR[block_indices]

    if row_indices is not None:
        try:
            row_indices = np.array(row_indices)
        except:
            raise TypeError('row_indices should be array-like')

        if row_indices.ndim == 1:
            if isinstance(row_indices[0], (int, np.integer)):
                # row_indices provided was truly 1D so
                # we will use same row indices for all data blocks
                # and warn the user we are doing so
                if len(dataBlocks) != 1:
                    LOGGER.warn(
                        'row_indices is 1D but there are multiple data blocks '
                        'so the same row indices will be used for each')

                row_indices = np.array(
                    [row_indices for i in range(len(dataBlocks))])
                # This also works if len(dataBlocks) == 1

            elif isinstance(row_indices[0], (list, tuple)):
                # A list-like of list-likes of different sizes was provided
                # We turn it into a proper 2D array by filling the short
                # list likes with zeros

                if len(row_indices) != len(dataBlocks):
                    raise ValueError(
                        'There should be an entry in row indices for '
                        'each data block')

                max_len = 0
                for entry in row_indices:
                    if not np.isscalar(entry):
                        if len(entry) > max_len:
                            max_len = len(entry)

                row_indices_list_entries = []
                for entry in row_indices:
                    if isinstance(entry, int):
                        list_entry = [entry]
                    else:
                        list_entry = list(entry)

                    while len(list_entry) < max_len:
                        list_entry.append(0)

                    row_indices_list_entries.append(list_entry)

                row_indices = np.array(row_indices_list_entries)

        elif row_indices.ndim == 2:
            # A list-like of list-likes of the same size was provided
            if row_indices.shape[0] != len(dataBlocks):
                if len(row_indices) == 1:
                    # we will use same row indices for all data blocks
                    # and warn the user we are doing so
                    if len(dataBlocks) != 1:
                        LOGGER.warn(
                            'row_indices has one entry but there are multiple data blocks '
                            'so the same row indices will be used for each')

                    row_indices = np.array(
                        [row_indices[0] for i in range(len(dataBlocks))])
                    # This also works if len(dataBlocks) == 1
                else:
                    raise ValueError(
                        'There should be an entry in row indices for '
                        'each data block')

        else:
            raise ValueError(
                'row_indices should be 1D or 2D array-like objects')

        # indices need updating
        good_indices_list = []
        for i, index_i in enumerate(indices):
            good_indices_list.append([])
            for j, index_j in enumerate(index_i):
                good_indices_list[i].append([])
                for r, index_r in enumerate(row_indices[i]):
                    for k, index_k in enumerate(index_j):
                        if k == index_r:
                            if not (r != 0 and index_r == 0):
                                good_indices_list[i][j].append(index_k)
                            else:
                                good_indices_list[i][j].append(
                                    np.array([0, 0, 0]))

        indices = np.array(good_indices_list)

    if indices.size == 0:
        raise ValueError(
            'selection does not contain any rows with image fields')

    # Use indices to collect particle data dictionaries
    particles = []

    for i, index_i in enumerate(indices):
        for j, index_j in enumerate(index_i):
            for k, index_k in enumerate(index_j):
                if not (np.array_equal(index_k, np.array([0, 0, 0]))
                        and not (i == 0 and j == 0 and k == 0)):
                    particles.append(
                        particlesSTAR[index_k[0]][index_k[1]][index_k[2]])

    if particle_indices is None:
        particle_indices = list(range(len(particles)))

    # Parse images using particle dictionaries
    image_stacks = dict()
    images = []
    parsed_images_data = []
    stk_images = []
    if particlesSTAR._prog == 'XMIPP':
        imageFieldKey = '_image'
    else:
        imageFieldKey = '_rlnImageName'

    for i in particle_indices:
        particle = particles[i]

        try:
            image_field = particle[imageFieldKey]
            image_index = int(image_field.split('@')[0]) - 1
            filename = image_field.split('@')[1]
        except:
            raise ValueError(
                'particlesSTAR does not contain data about particle image '
                '{0} location in either RELION or XMIPP format'.format(i))

        if filename.endswith('.stk'):
            stk_images.append(str(i))
            continue

        if not filename in list(image_stacks.keys()):
            image_stacks[filename] = parseEMD(filename).density

        image = image_stacks[filename][image_index]
        parsed_images_data.append(image_field)

        if saveImageArrays:
            if saveDirectory is not None:
                np.save('{0}/{1}'.format(saveDirectory, i), image)
            else:
                np.save('{0}'.format(i), image)

        if rotateImages:
            if particlesSTAR._prog == 'RELION':
                anglePsi = float(particle['_rlnAnglePsi'])
                originX = float(particle['_rlnOriginX'])
                originY = float(particle['_rlnOriginY'])
            elif particlesSTAR._prog == 'XMIPP':
                anglePsi = float(particle['_anglePsi'])
                originX = float(particle['_shiftX'])
                originY = float(particle['_shiftY'])
            images.append(
                rotate(image,
                       anglePsi,
                       center=(float(image.shape[0]) - originX,
                               float(image.shape[1]) - originY)))
        else:
            images.append(image)

    if len(stk_images) > 0:
        LOGGER.warn(
            'ProDy currently cannot parse images from XMIPP .stk files. '
            'Please be aware that images {0} and {1} will be missing '
            'from the final array.'.format(', '.join(stk_images[:-1]),
                                           stk_images[-1]))

    return np.array(images), parsed_images_data
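
An illustrative call (not part of the source), assuming a hypothetical RELION- or XMIPP-style STAR file and that scikit-image is installed:

images, image_fields = parseImagesFromSTAR('particles.star',   # hypothetical file
                                            block_indices=[0],
                                            rotateImages=True)
print(images.shape)   # (n_particles, height, width) for square image stacks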
Example #8
def parseMMCIFStream(stream, **kwargs):
    """Returns an :class:`.AtomGroup` and/or a class:`.StarDict` 
    containing header data parsed from a stream of CIF lines.
    :arg stream: Anything that implements the method ``readlines``
        (e.g. :class:`file`, buffer, stdin)"""

    model = kwargs.get('model')
    subset = kwargs.get('subset')
    chain = kwargs.get('chain')
    altloc = kwargs.get('altloc', 'A')
    header = kwargs.get('header', False)

    if model is not None:
        if isinstance(model, int):
            if model < 0:
                raise ValueError('model must be a non-negative integer')
        else:
            raise TypeError('model must be an integer, {0} is invalid'
                            .format(str(model)))
    title_suffix = ''
    if subset:
        try:
            subset = _PDBSubsets[subset.lower()]
        except AttributeError:
            raise TypeError('subset must be a string')
        except KeyError:
            raise ValueError('{0} is not a valid subset'
                             .format(repr(subset)))
        title_suffix = '_' + subset
    if chain is not None:
        if not isinstance(chain, str):
            raise TypeError('chain must be a string')
        elif len(chain) == 0:
            raise ValueError('chain must not be an empty string')
        title_suffix = '_' + chain + title_suffix

    ag = None
    if 'ag' in kwargs:
        ag = kwargs['ag']
        if not isinstance(ag, AtomGroup):
            raise TypeError('ag must be an AtomGroup instance')
        n_csets = ag.numCoordsets()
    elif model != 0:
        ag = AtomGroup(str(kwargs.get('title', 'Unknown')) + title_suffix)
        n_csets = 0

    if model != 0:
        LOGGER.timeit()
        try:
            lines = stream.readlines()
        except AttributeError as err:
            try:
                lines = stream.read().split('\n')
            except AttributeError:
                raise err
        if not len(lines):
            raise ValueError('empty mmCIF file or stream')

        if header:
            ag, header = _parseMMCIFLines(ag, lines, model, chain, subset,
                                          altloc, header)
        else:
            ag = _parseMMCIFLines(ag, lines, model, chain, subset,
                                  altloc, header)

        if ag.numAtoms() > 0:
            LOGGER.report('{0} atoms and {1} coordinate set(s) were '
                          'parsed in %.2fs.'.format(ag.numAtoms(),
                                                    ag.numCoordsets() - n_csets))
        else:
            ag = None
            LOGGER.warn('Atomic data could not be parsed, please '
                        'check the input file.')
        if header:
            return ag, StarDict(*header, title=str(kwargs.get('title', 'Unknown')))
        return ag
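
A minimal sketch (illustrative only), assuming a local mmCIF file is available:

with open('1xyz.cif') as stream:     # hypothetical file
    ag, header = parseMMCIFStream(stream, chain='A', header=True)
print(ag.numAtoms() if ag is not None else 'nothing parsed')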
Example #9
def refineEnsemble(ensemble, lower=.5, upper=10., **kwargs):
    """Refine a :class:`.PDBEnsemble` based on RMSD criterions.
    
    :arg ensemble: the ensemble to be refined
    :type ensemble: :class:`.Ensemble`, :class:`.PDBEnsemble`

    :arg lower: the smallest allowed RMSD between two conformations with the exception of **protected** 
    :type lower: float

    :arg upper: the highest allowed RMSD between two conformations with the exception of **protected** 
    :type upper: float

    :keyword protected: a list of either the indices or labels of conformations
                        that must be kept in the refined ensemble
    :type protected: list
    
    :arg ref: the index or label of the reference conformation which will also be kept.
        Default is 0
    :type ref: int or str
    """

    protected = kwargs.pop('protected', [])
    P = []
    if len(protected):
        labels = ensemble.getLabels()
        for p in protected:
            if isinstance(p, Integral):
                i = p
            else:
                if p in labels:
                    i = labels.index(p)
                else:
                    LOGGER.warn(
                        'could not find any conformation with the label %s '
                        'in the ensemble' % str(p))
                    continue
            P.append(i)

    LOGGER.timeit('_prody_refineEnsemble')
    from numpy import argsort

    ### obtain reference index
    # rmsd = ensemble.getRMSDs()
    # ref_i = np.argmin(rmsd)
    ref_i = kwargs.pop('ref', 0)
    if isinstance(ref_i, Integral):
        pass
    elif isinstance(ref_i, str):
        labels = ensemble.getLabels()
        ref_i = labels.index(ref_i)
    else:
        LOGGER.warn('ref should be an integer index or a label string; '
                    'the first conformation will be used instead of %s'
                    % repr(ref_i))
        ref_i = 0
    if not ref_i in P:
        P = [ref_i] + P

    ### calculate pairwise RMSDs ###
    RMSDs = ensemble.getRMSDs(pairwise=True)

    def getRefinedIndices(A):
        deg = A.sum(axis=0)
        sorted_indices = list(argsort(deg))
        # sorted_indices = P + [x for x in sorted_indices if x not in P]
        sorted_indices.remove(ref_i)
        sorted_indices.insert(0, ref_i)

        n_confs = ensemble.numConfs()
        isdel_temp = np.zeros(n_confs)
        for a in range(n_confs):
            i = sorted_indices[a]
            for b in range(n_confs):
                if a >= b:
                    continue
                j = sorted_indices[b]
                if isdel_temp[i] or isdel_temp[j]:
                    continue
                else:
                    if A[i, j]:
                        # isdel_temp[j] = 1
                        if not j in P:
                            isdel_temp[j] = 1
                        elif not i in P:
                            isdel_temp[i] = 1
        temp_list = isdel_temp.tolist()
        ind_list = []
        for i in range(n_confs):
            if not temp_list[i]:
                ind_list.append(i)
        return ind_list

    L = list(range(len(ensemble)))
    U = list(range(len(ensemble)))
    if lower is not None:
        A = RMSDs < lower
        L = getRefinedIndices(A)

    if upper is not None:
        B = RMSDs > upper
        U = getRefinedIndices(B)

    # find common indices from L and U
    I = list(set(L) - (set(L) - set(U)))

    # for p in P:
    # if p not in I:
    # I.append(p)
    I.sort()
    reens = ensemble[I]

    LOGGER.report('Ensemble was refined in %.2fs.', '_prody_refineEnsemble')
    LOGGER.info('%d conformations were removed from ensemble.' %
                (len(ensemble) - len(I)))

    return reens
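
A hedged usage sketch (not part of the source); it assumes that PDBEnsemble accepts a multi-model Atomic instance, which is an assumption about the surrounding package:

from prody import parsePDB, PDBEnsemble   # assumed imports

ens = PDBEnsemble(parsePDB('2k39'))       # hypothetical multi-model NMR entry
refined = refineEnsemble(ens, lower=0.5, upper=10.0, protected=[0], ref=0)
print(refined.numConfs())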
Example #10
def buildBiomolecules(header, atoms, biomol=None):
    """Returns *atoms* after applying biomolecular transformations from *header*
    dictionary.  Biomolecular transformations are applied to all coordinate
    sets in the molecule.

    Some PDB files contain transformations for more than one biomolecule.  A
    specific set of transformations can be chosen using the *biomol* argument.
    Transformation sets are identified by numbers, e.g. ``"1"``, ``"2"``, ...

    If multiple biomolecular transformations are provided in the *header*
    dictionary, biomolecules will be returned as
    :class:`.AtomGroup` instances in a :func:`list`.

    If the resulting biomolecule has more than 26 chains, the molecular
    assembly will be split into multiple :class:`.AtomGroup`
    instances each containing at most 26 chains.  These
    :class:`.AtomGroup` instances will be returned in a tuple.

    Note that atoms in biomolecules are ordered according to chain identifiers.
    """

    if not isinstance(header, dict):
        raise TypeError('header must be a dictionary')

    if not isinstance(atoms, Atomic):
        raise TypeError('atoms must be an Atomic instance')

    biomt = header.get('biomoltrans')
    if not isinstance(biomt, dict) or len(biomt) == 0:
        LOGGER.warn(
            "no biomolecular transformations were found, so the original "
            "structure is returned"
        )
        return atoms

    if not isinstance(atoms, AtomGroup):
        atoms = atoms.copy()

    biomols = []
    if biomol is None:
        keys = list(biomt)
    else:
        biomol = str(biomol)
        if biomol in biomt:
            keys = [biomol]
        else:
            LOGGER.warn('Transformations for biomolecule {0} were not '
                        'found in the header dictionary.'.format(biomol))
            return None

    keys.sort()
    for i in keys:
        segnm = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ' * 20)
        ags = []
        mt = biomt[i]
        # mt is a flat list of 4-line groups: each group starts with the
        # chain identifiers that the transformation applies to, followed by
        # three lines holding the rotation matrix rows and translation,
        # so len(mt) must be a multiple of 4
        if (len(mt)) % 4 != 0:
            LOGGER.warn('Biomolecular transformations {0} were not '
                        'applied'.format(i))
            continue

        for times in range(int((len(mt)) / 4)):
            rotation = np.zeros((3, 3))
            translation = np.zeros(3)
            line0 = np.fromstring(mt[times * 4 + 1], sep=' ')
            rotation[0, :] = line0[:3]
            translation[0] = line0[3]
            line1 = np.fromstring(mt[times * 4 + 2], sep=' ')
            rotation[1, :] = line1[:3]
            translation[1] = line1[3]
            line2 = np.fromstring(mt[times * 4 + 3], sep=' ')
            rotation[2, :] = line2[:3]
            translation[2] = line2[3]
            t = Transformation(rotation, translation)

            newag = atoms.select('chain ' + ' '.join(mt[times * 4 + 0])).copy()
            if newag is None:
                continue
            newag.all.setSegnames(segnm.pop(0))
            for acsi in range(newag.numCoordsets()):
                newag.setACSIndex(acsi)
                newag = t.apply(newag)
            newag.setACSIndex(0)
            ags.append(newag)

        if ags:
            newag = ags.pop(0)
            while ags:
                newag += ags.pop(0)
            newag.setTitle('{0} biomolecule {1}'.format(atoms.getTitle(), i))
            biomols.append(newag)

    if biomols:
        if len(biomols) == 1:
            return biomols[0]
        else:
            return biomols
    else:
        return None
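
A hedged usage sketch (illustrative), assuming a ProDy-style parsePDB that can return the header dictionary alongside the atoms:

from prody import parsePDB   # assumed import

atoms, header = parsePDB('1xyz.pdb', header=True)    # hypothetical file
assembly = buildBiomolecules(header, atoms, biomol=1)
if assembly is not None:
    print(assembly.getTitle(), assembly.numAtoms())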
Example #11
def fetchPDB(*pdb, **kwargs):
    """Returns path(s) to PDB file(s) for specified *pdb* identifier(s).  Files
    will be sought in user specified *folder* or current working directory, and
    then in local PDB folder and mirror, if they are available.  If *copy*
    is set **True**, files will be copied into *folder*.  If *compressed* is
    **False**, all files will be decompressed into *folder*.  See :func:`pathPDBFolder` 
    and :func:`pathPDBMirror` for managing local resources, :func:`.fetchPDBviaFTP`
    and :func:`.fetchPDBviaHTTP` for downloading files from PDB servers."""

    if len(pdb) == 1 and isinstance(pdb[0], list):
        pdb = pdb[0]

    if 'format' in kwargs and kwargs.get('format') != 'pdb':
        return fetchPDBviaFTP(*pdb, **kwargs)

    identifiers = checkIdentifiers(*pdb)

    folder = kwargs.get('folder', '.')
    compressed = kwargs.get('compressed')

    # check *folder* specified by the user, usually pwd ('.')
    filedict = findPDBFiles(folder, compressed=compressed)

    filenames = []
    not_found = []
    exists = 0
    for i, pdb in enumerate(identifiers):
        if pdb is None:
            filenames.append(None)
        elif pdb in filedict:
            filenames.append(filedict[pdb])
            exists += 1
        else:
            filenames.append(None)
            not_found.append((i, pdb))

    if not not_found:
        if len(filenames) == 1:
            filenames = filenames[0]
            if exists:
                LOGGER.debug(
                    'PDB file is found in working directory ({0}).'.format(
                        sympath(filenames)))
        return filenames

    if not isWritable(folder):
        raise IOError('permission to write in {0} is denied, please '
                      'specify another folder'.format(folder))

    if compressed is not None and not compressed:
        filedict = findPDBFiles(folder, compressed=True)
        not_found, decompress = [], not_found
        for i, pdb in decompress:
            if pdb in filedict:
                fn = filedict[pdb]
                filenames[i] = gunzip(fn, splitext(fn)[0])
            else:
                not_found.append((i, pdb))

    if not not_found:
        return filenames[0] if len(identifiers) == 1 else filenames

    local_folder = pathPDBFolder()
    copy = kwargs.setdefault('copy', False)
    if local_folder:
        local_folder, is_divided = local_folder
        temp, not_found = not_found, []
        for i, pdb in temp:
            if is_divided:
                fn = join(local_folder, pdb[1:3], 'pdb' + pdb + '.pdb.gz')
            else:
                fn = join(local_folder, pdb + '.pdb.gz')
            if isfile(fn):
                if copy or not compressed and compressed is not None:
                    if compressed:
                        fn = copyFile(fn, join(folder, pdb + '.pdb.gz'))
                    else:
                        fn = gunzip(fn, join(folder, pdb + '.pdb'))
                filenames[i] = normpath(fn)
            else:
                not_found.append((i, pdb))

    if not not_found:
        if len(identifiers) == 1:
            fn = filenames[0]
            items = fn.split(pathsep)
            if len(items) > 5:
                fndisp = pathsep.join(items[:3] + ['...'] + items[-1:])
            else:
                fndisp = relpath(fn)
            LOGGER.debug(
                'PDB file is found in the local folder ({0}).'.format(fndisp))
            return fn
        else:
            return filenames

    if kwargs['copy'] or (compressed is not None and not compressed):
        kwargs['folder'] = folder

    downloads = [pdb for i, pdb in not_found]
    fns = None

    try:
        fns = fetchPDBfromMirror(*downloads, **kwargs)
    except IOError:
        pass
    else:
        if len(downloads) == 1: fns = [fns]
        temp, not_found = not_found, []
        for i, fn in enumerate(fns):
            if fn is None:
                not_found.append(temp[i])
            else:
                i, _ = temp[i]
                filenames[i] = fn

    if not not_found:
        return filenames[0] if len(identifiers) == 1 else filenames

    if fns:
        downloads = [pdb for i, pdb in not_found]

    fns = None

    tp = kwargs.pop('tp', None)
    if tp is not None:
        tp = tp.lower()

    if tp == 'http':
        try:
            fns = fetchPDBviaHTTP(*downloads, check=False, **kwargs)
        except Exception as err:
            LOGGER.warn('Downloading PDB files via HTTP failed '
                        '({0}).'.format(str(err)))
    elif tp == 'ftp':
        try:
            fns = fetchPDBviaFTP(*downloads, check=False, **kwargs)
        except Exception as err:
            LOGGER.warn('Downloading PDB files via FTP failed '
                        '({0}).'.format(str(err)))
    else:
        tryHTTP = False
        try:
            fns = fetchPDBviaFTP(*downloads, check=False, **kwargs)
        except Exception as err:
            tryHTTP = True

        if fns is None or isinstance(fns, list) and None in fns:
            tryHTTP = True
        elif isinstance(fns, list):
            downloads = [
                not_found[i][1] for i in range(len(fns)) if fns[i] is None
            ]
            if len(downloads) > 0:
                tryHTTP = True
        if tryHTTP:
            LOGGER.info('Downloading PDB files via FTP failed, '
                        'trying HTTP.')
            try:
                fns = fetchPDBviaHTTP(*downloads, check=False, **kwargs)
            except Exception as err:
                LOGGER.warn('Downloading PDB files via HTTP also failed '
                            '({0}).'.format(str(err)))

    if len(downloads) == 1: fns = [fns]
    if fns:
        for i, fn in zip([i for i, pdb in not_found], fns):
            filenames[i] = fn

    return filenames[0] if len(identifiers) == 1 else filenames
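
A short usage sketch (not from the source); the identifiers are examples, the folder is assumed to exist and be writable, and network access may be required:

path = fetchPDB('1ubi', compressed=False)                           # single identifier
paths = fetchPDB(['1ubi', '2k39'], folder='pdb_files', copy=True)   # several at once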
Example #12
    def copy(self):
        """Returns a copy of atoms (and atomic data) in an :class:`.AtomGroup`
        instance."""

        dummies = None
        indices = None
        readonly = False
        try:
            ag = self.getAtomGroup()
        except AttributeError:
            ag = self
            readonly = True
            new = AtomGroup(ag.getTitle())
        else:
            indices = self.getIndices()
            new = AtomGroup(ag.getTitle() + ' ' + str(self))
            try:
                dummies = self.numDummies()
            except AttributeError:
                pass
            else:
                if dummies:
                    dummy = self.getFlags('dummy')
                    mapped = self.getFlags('mapped')

        try:
            self.getIndex()
        except AttributeError:
            this = self
        else:
            this = self.all

        if self.numCoordsets():
            new.setCoords(this.getCoordsets(), label=ag.getCSLabels())

        for label in ag.getDataLabels():
            if label in READONLY:
                if readonly:
                    new._data[label] = this.getData(label)
            else:
                new.setData(label, this.getData(label))

        #if readonly:
        #    for label in READONLY:
        #        data = this.getData(label)
        #        if data is not None:
        #            new._data[label] = data

        skip_flags = set()
        for label in ag.getFlagLabels():
            if label in skip_flags:
                continue
            else:
                new._setFlags(label, this.getFlags(label))
                skip_flags.update(flags.ALIASES.get(label, [label]))

        if dummies:
            new._setFlags('dummy', dummy)
            new._setFlags('mapped', mapped)

        bonds = ag._bonds
        bmap = ag._bmap
        if bonds is not None and bmap is not None:
            if indices is None:
                new._bonds = bonds.copy()
                new._bmap = bmap.copy()
                new._data['numbonds'] = ag._data['numbonds'].copy()
            elif dummies:
                if dummies:
                    indices = indices[self._getMapping()]
                if len(set(indices)) == len(indices):
                    new.setBonds(trimBonds(bonds, indices))
                else:
                    LOGGER.warn('Duplicate atoms in mapping, bonds are '
                                'not copied.')
            else:
                bonds = trimBonds(bonds, indices)
                if bonds is not None:
                    new.setBonds(bonds)
        return new
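
A minimal sketch of how this copy() is typically reached (illustrative, not part of the source):

from prody import parsePDB   # assumed import

structure = parsePDB('1xyz.pdb')     # hypothetical file
calphas = structure.select('calpha')
ca_group = calphas.copy()            # independent AtomGroup holding only CA data
print(type(ca_group).__name__, ca_group.numAtoms())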
Example #13
def writeDCD(filename,
             trajectory,
             start=None,
             stop=None,
             step=None,
             align=False):
    """Write 32-bit CHARMM format DCD file (also NAMD 2.1 and later).
    *trajectory* can be an :class:`Trajectory`, :class:`DCDFile`, or
    :class:`Ensemble` instance. *filename* is returned upon successful
    output of file."""
    if not filename.lower().endswith('.dcd'):
        filename += '.dcd'

    if not isinstance(trajectory, (TrajBase, Ensemble, Atomic)):
        raise TypeError('{0} is not a valid type for trajectory'.format(
            type(trajectory)))

    irange = list(
        range(*slice(start, stop, step).indices(trajectory.numCoordsets())))
    n_csets = len(irange)
    if n_csets == 0:
        raise ValueError('trajectory does not have any coordinate sets, or '
                         'no coordinate sets are selected')

    if isinstance(trajectory, Atomic):
        isEnsemble = False
        isAtomic = True
        n_atoms = trajectory.numAtoms()
    else:
        isEnsemble = True
        isAtomic = False
        n_atoms = trajectory.numSelected()
    if n_atoms == 0:
        raise ValueError('no atoms are selected in the trajectory')
    if isinstance(trajectory, TrajBase):
        isTrajectory = True
        unitcell = trajectory.hasUnitcell()
        nfi = trajectory.nextIndex()
        trajectory.reset()
        pack_i_48 = pack('i', 48)
        if isinstance(trajectory, Trajectory):
            timestep = trajectory.getTimestep()[0]
            first_ts = trajectory.getFirstTimestep()[0]
            framefreq = trajectory.getFrameFreq()[0]
            n_fixed = trajectory.numFixed()[0]
        else:
            timestep = trajectory.getTimestep()
            first_ts = trajectory.getFirstTimestep()
            framefreq = trajectory.getFrameFreq()
            n_fixed = trajectory.numFixed()
    else:
        isTrajectory = False
        unitcell = False
        if isinstance(trajectory, Ensemble):
            frame = trajectory[0]
        else:
            frame = trajectory
            acsi = trajectory.getACSIndex()
        timestep = 1
        first_ts = 0
        framefreq = 1
        n_fixed = 0

    dcd = DCDFile(filename, mode='w')
    LOGGER.progress('Writing DCD', len(irange), '_prody_writeDCD')
    prev = -1
    uc = None
    time_ = time()
    for j, i in enumerate(irange):
        diff = i - prev
        prev = i
        if isTrajectory:
            if diff > 1:
                trajectory.skip(diff - 1)
            frame = next(trajectory)
            if frame is None:
                break
            if unitcell:
                uc = frame._getUnitcell()
                uc[3:] = np.sin((PISQUARE / 90) * (90 - uc[3:]))
                uc = uc[[0, 3, 1, 4, 5, 2]]
        elif isEnsemble:
            frame._index = i
        else:
            frame.setACSIndex(i)
        if align:
            frame.superpose()
        if j == 0:
            dcd.write(frame._getCoords(),
                      uc,
                      timestep=timestep,
                      firsttimestep=first_ts,
                      framefreq=framefreq)
        else:
            dcd.write(frame._getCoords(), uc)
        LOGGER.update(i, label='_prody_writeDCD')
    if isAtomic:
        trajectory.setACSIndex(acsi)
    j += 1
    LOGGER.finish()
    dcd.close()
    time_ = time() - time_ or 0.01
    dcd_size = 1.0 * (56 + (n_atoms * 3 + 6) * 4) * n_csets / (1024 * 1024)
    LOGGER.info('DCD file was written in {0:.2f} seconds.'.format(time_))
    LOGGER.info('{0:.2f} MB written at input rate {1:.2f} MB/s.'.format(
        dcd_size, dcd_size / time_))
    LOGGER.info(
        '{0} coordinate sets written at output rate {1} frame/s.'.format(
            n_csets, int(n_csets / time_)))
    if j != n_csets:
        LOGGER.warn('{0} frames expected, {1} written.'.format(
            n_csets, j))
    if isTrajectory:
        trajectory.goto(nfi)
    return filename
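
A hedged usage sketch (illustrative); a multi-model entry is assumed so that there is more than one coordinate set to write:

from prody import parsePDB   # assumed import

models = parsePDB('2k39')                        # hypothetical multi-model entry
out = writeDCD('models.dcd', models, step=2)     # write every second coordinate set
print(out)                                       # 'models.dcd' on success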
Example #14
def fetchPDBLigand(cci, filename=None):
    """Fetch PDB ligand data from PDB_ for chemical component *cci*.
    *cci* may be 3-letter chemical component identifier or a valid XML
    filename.  If *filename* is given, XML file will be saved with that name.

    If you query ligand data frequently, you may configure ProDy to save XML
    files in your computer.  Set ``ligand_xml_save`` option **True**, i.e.
    ``confProDy(ligand_xml_save=True)``.  Compressed XML files will be saved
    to the ProDy package folder, e.g. :file:`/home/user/.prody/pdbligands`.  Each
    file is around 5Kb when compressed.

    This function is compatible with PDBx/PDBML v 4.0.

    Ligand data is returned in a dictionary.  Ligand coordinate atom data with
    *model* and *ideal* coordinate sets are also stored in this dictionary.
    Note that this dictionary will contain data that is present in the XML
    file and all Ligand Expo XML files do not contain every possible data
    field.  So, it may be better if you use :meth:`dict.get` instead of
    indexing the dictionary, e.g. to retrieve formula weight (or relative
    molar mass) of the chemical component use ``data.get('formula_weight')``
    instead of ``data['formula_weight']`` to avoid exceptions when this data
    field is not found in the XML file.  URL and/or path of the XML file are
    returned in the dictionary with keys ``url`` and ``path``, respectively.

    Following example downloads data for ligand STI (a.k.a. Gleevec and
    Imatinib) and calculates RMSD between model (X-ray structure 1IEP) and
    ideal (energy minimized) coordinate sets:

    .. ipython:: python

       from caviar.prody_parser import *
       ligand_data = fetchPDBLigand('STI')
       ligand_data['model_coordinates_db_code']
       ligand_model = ligand_data['model']
       ligand_ideal = ligand_data['ideal']
       transformation = superpose(ligand_ideal.noh, ligand_model.noh)
       calcRMSD(ligand_ideal.noh, ligand_model.noh)"""

    if not isinstance(cci, str):
        raise TypeError('cci must be a string')

    if isfile(cci):
        inp = openFile(cci)
        xml = inp.read()
        inp.close()
        url = None
        path = cci
        cci = splitext(splitext(split(cci)[1])[0])[0].upper()
    elif len(cci) > 4 or not cci.isalnum():
        raise ValueError('cci must be 3-letters long and alphanumeric or '
                         'a valid filename')
    else:
        xml = None
        cci = cci.upper()
        if SETTINGS.get('ligand_xml_save'):
            folder = join(getPackagePath(), 'pdbligands')
            if not isdir(folder):
                makePath(folder)
            xmlgz = path = join(folder, cci + '.xml.gz')
            if isfile(xmlgz):
                with openFile(xmlgz) as inp:
                    xml = inp.read()
        else:
            folder = None
            path = None

        url = ('http://files.rcsb.org/ligands/download/{0}'
               '.xml'.format(cci.upper()))
        if not xml:
            try:
                inp = openURL(url)
            except IOError:
                raise IOError(
                    'XML file for ligand {0} is not found online'.format(cci))
            else:
                xml = inp.read()
                if PY3K:
                    xml = xml.decode()
                inp.close()

            if filename:
                out = openFile(filename, mode='w', folder=folder)
                out.write(xml)
                out.close()
            if SETTINGS.get('ligand_xml_save'):
                with openFile(xmlgz, 'w') as out:
                    out.write(xml)

    import xml.etree.cElementTree as ET

    root = ET.XML(xml)
    if (root.get('{http://www.w3.org/2001/XMLSchema-instance}'
                 'schemaLocation') !=
            'http://pdbml.pdb.org/schema/pdbx-v40.xsd pdbx-v40.xsd'):
        LOGGER.warn('XML is not in PDBx/PDBML v 4.0 format, resulting '
                    'dictionary may not contain all data fields')
    ns = root.tag[:root.tag.rfind('}') + 1]
    len_ns = len(ns)
    dict_ = {'url': url, 'path': path}

    for child in list(root.find(ns + 'chem_compCategory')[0]):
        tag = child.tag[len_ns:]
        if tag.startswith('pdbx_'):
            tag = tag[5:]
        dict_[tag] = child.text
    dict_['formula_weight'] = float(dict_.get('formula_weight'))

    identifiers_and_descriptors = []
    results = root.find(ns + 'pdbx_chem_comp_identifierCategory')
    if results:
        identifiers_and_descriptors.extend(results)
    results = root.find(ns + 'pdbx_chem_comp_descriptorCategory')
    if results:
        identifiers_and_descriptors.extend(results)
    for child in identifiers_and_descriptors:
        program = child.get('program').replace(' ', '_')
        type_ = child.get('type').replace(' ', '_')
        dict_[program + '_' + type_] = child[0].text
        dict_[program + '_version'] = child.get('program_version')

    dict_['audits'] = [
        (audit.get('action_type'), audit.get('date'))
        for audit in list(root.find(ns + 'pdbx_chem_comp_auditCategory'))
    ]

    atoms = list(root.find(ns + 'chem_comp_atomCategory'))
    n_atoms = len(atoms)
    ideal_coords = np.zeros((n_atoms, 3))
    model_coords = np.zeros((n_atoms, 3))

    atomnames = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['name'].dtype)
    elements = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['element'].dtype)
    resnames = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['resname'].dtype)
    charges = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['charge'].dtype)

    resnums = np.ones(n_atoms, dtype=ATOMIC_FIELDS['charge'].dtype)

    alternate_atomnames = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['name'].dtype)
    leaving_atom_flags = np.zeros(n_atoms, bool)
    aromatic_flags = np.zeros(n_atoms, bool)
    stereo_configs = np.zeros(n_atoms, bool)
    ordinals = np.zeros(n_atoms, int)

    name2index = {}

    for i, atom in enumerate(atoms):
        data = dict([(child.tag[len_ns:], child.text) for child in list(atom)])

        name = data.get('pdbx_component_atom_id', 'X')
        name2index[name] = i
        atomnames[i] = name
        elements[i] = data.get('type_symbol', 'X')
        resnames[i] = data.get('pdbx_component_comp_id', 'UNK')
        charges[i] = float(data.get('charge', 0))

        alternate_atomnames[i] = data.get('alt_atom_id', 'X')
        leaving_atom_flags[i] = data.get('pdbx_leaving_atom_flag') == 'Y'
        aromatic_flags[i] = data.get('pdbx_aromatic_flag') == 'Y'
        stereo_configs[i] = data.get('pdbx_stereo_config') == 'Y'
        ordinals[i] = int(data.get('pdbx_ordinal', 0))

        model_coords[i, 0] = float(data.get('model_Cartn_x', 0))
        model_coords[i, 1] = float(data.get('model_Cartn_y', 0))
        model_coords[i, 2] = float(data.get('model_Cartn_z', 0))
        ideal_coords[i, 0] = float(data.get('pdbx_model_Cartn_x_ideal', 0))
        ideal_coords[i, 1] = float(data.get('pdbx_model_Cartn_y_ideal', 0))
        ideal_coords[i, 2] = float(data.get('pdbx_model_Cartn_z_ideal', 0))

    pdbid = dict_.get('model_coordinates_db_code')
    if pdbid:
        model = AtomGroup(cci + ' model ({0})'.format(pdbid))
    else:
        model = AtomGroup(cci + ' model')
    model.setCoords(model_coords)
    model.setNames(atomnames)
    model.setResnames(resnames)
    model.setResnums(resnums)
    model.setElements(elements)
    model.setCharges(charges)
    model.setFlags('leaving_atom_flags', leaving_atom_flags)
    model.setFlags('aromatic_flags', aromatic_flags)
    model.setFlags('stereo_configs', stereo_configs)
    model.setData('ordinals', ordinals)
    model.setData('alternate_atomnames', alternate_atomnames)
    dict_['model'] = model
    ideal = model.copy()
    ideal.setTitle(cci + ' ideal')
    ideal.setCoords(ideal_coords)
    dict_['ideal'] = ideal

    bonds = []
    warned = set()
    for bond in list(root.find(ns + 'chem_comp_bondCategory') or bonds):
        name_1 = bond.get('atom_id_1')
        name_2 = bond.get('atom_id_2')
        try:
            bonds.append((name2index[name_1], name2index[name_2]))
        except KeyError:
            if name_1 not in warned and name_1 not in name2index:
                warned.add(name_1)
                LOGGER.warn('{0} specified in the bond category of {1} is not '
                            'a valid atom name.'.format(repr(name_1), cci))
            if name_2 not in warned and name_2 not in name2index:
                warned.add(name_2)
                LOGGER.warn('{0} specified in the bond category of {1} is not '
                            'a valid atom name.'.format(repr(name_2), cci))
    if bonds:
        bonds = np.array(bonds, int)
        model.setBonds(bonds)
        ideal.setBonds(bonds)
    return dict_
Example #15
def _getPolymers(lines):
    """Returns list of polymers (macromolecules)."""

    pdbid = lines['pdbid']
    polymers = dict()
    for i, line in lines['SEQRES']:
        ch = line[11]
        poly = polymers.get(ch, Polymer(ch))
        polymers[ch] = poly
        poly.sequence += ''.join(getSequence(line[19:].split()))

    for i, line in lines['DBREF ']:
        i += 1

        ch = line[12]
        if ch == ' ':
            if len(polymers) != 1:
                LOGGER.warn('DBREF chain identifier is not specified '
                            '({0}:{1})'.format(pdbid, i))
                continue
            else:
                ch = list(polymers)[0]
        dbabbr = line[26:32].strip()
        dbref = DBRef()
        dbref.dbabbr = dbabbr
        dbref.database = _PDB_DBREF.get(dbabbr, 'Unknown')
        dbref.accession = line[33:41].strip()
        dbref.idcode = line[42:54].strip()

        try:
            first = int(line[14:18])
        except:
            LOGGER.warn('DBREF for chain {2}: failed to parse '
                        'initial sequence number of the PDB sequence '
                        '({0}:{1})'.format(pdbid, i, ch))
        try:
            last = int(line[20:24])
        except:
            LOGGER.warn('DBREF for chain {2}: failed to parse '
                        'ending sequence number of the PDB sequence '
                        '({0}:{1})'.format(pdbid, i, ch))
        try:
            dbref.first = (first, line[18], int(line[56:60]))
        except:
            LOGGER.warn('DBREF for chain {2}: failed to parse '
                        'initial sequence number of the database sequence '
                        '({0}:{1})'.format(pdbid, i, ch))
        try:
            dbref.last = (last, line[24].strip(), int(line[62:67]))
        except:
            LOGGER.warn('DBREF for chain {2}: failed to parse '
                        'ending sequence number of the database sequence '
                        '({0}:{1})'.format(pdbid, i, ch))

        poly = polymers.get(ch, Polymer(ch))
        polymers[ch] = poly
        poly.dbrefs.append(dbref)

    dbref1 = lines['DBREF1']
    dbref2 = lines['DBREF2']
    if len(dbref1) != len(dbref2):
        LOGGER.warn('DBREF1 and DBREF2 records are not complete')
        dbref12 = []
    else:
        dbref12 = zip(dbref1, dbref2)  # PY3K: OK

    for dbref1, dbref2 in dbref12:
        i, line = dbref1
        i += 1
        ch = line[12]

        dbabbr = line[26:32].strip()
        dbref = DBRef()
        dbref.dbabbr = dbabbr
        dbref.database = _PDB_DBREF.get(dbabbr, 'Unknown')
        dbref.idcode = line[47:67].strip()

        try:
            first = int(line[14:18])
            first_icode = line[18].strip()
        except:
            LOGGER.warn('DBREF1 for chain {2}: failed to parse '
                        'initial sequence number of the PDB sequence '
                        '({0}:{1})'.format(pdbid, i, ch))
        try:
            last = int(line[20:24])
            last_icode = line[24].strip()
        except:
            LOGGER.warn('DBREF1 for chain {2}: failed to parse '
                        'ending sequence number of the PDB sequence '
                        '({0}:{1})'.format(pdbid, i, ch))
        i, line = dbref2
        i += 1
        if line[12] == ' ':
            LOGGER.warn('DBREF2 chain identifier is not specified '
                        '({0}:{1})'.format(pdbid, ch))
        elif line[12] != ch:
            LOGGER.warn('DBREF1 and DBREF2 chain id mismatch '
                        '({0}:{1})'.format(pdbid, ch))

        dbref.accession = line[18:40].strip()
        try:
            dbref.first = (first, first_icode, int(line[45:55]))
        except:
            LOGGER.warn('DBREF2 for chain {2}: failed to parse '
                        'initial sequence number of the database sequence '
                        '({0}:{1})'.format(pdbid, i, ch))
        try:
            dbref.last = (last, last_icode, int(line[57:67]))
        except:
            LOGGER.warn('DBREF2 for chain {2}: failed to parse '
                        'ending sequence number of the database sequence '
                        '({0}:{1})'.format(pdbid, i, ch))

        poly = polymers.get(ch, Polymer(ch))
        polymers[ch] = poly
        poly.dbrefs.append(dbref)

    for poly in polymers.values():  # PY3K: OK
        resnum = []
        for dbref in poly.dbrefs:
            dbabbr = dbref.dbabbr
            if dbabbr == 'PDB':
                if not (pdbid == dbref.accession == dbref.idcode):
                    LOGGER.warn('DBREF for chain {2} refers to PDB '
                                'entry {3} ({0}:{1})'.format(
                                    pdbid, i, ch, dbref.accession))
            else:
                if pdbid == dbref.accession or pdbid == dbref.idcode:
                    LOGGER.warn('DBREF for chain {2} is {3}, '
                                'expected PDB ({0}:{1})'.format(
                                    pdbid, i, ch, dbabbr))
                    dbref.database = 'PDB'
            resnum.append((dbref.first[0], dbref.last[0]))
        resnum.sort()
        last = -10000
        for first, temp in resnum:
            if first <= last:
                LOGGER.warn('DBREF records overlap for chain {0} ({1})'.format(
                    poly.chid, pdbid))
            last = temp

    for i, line in lines['MODRES']:
        ch = line[16]
        if ch == ' ':
            if len(polymers) != 1:
                LOGGER.warn('MODRES chain identifier is not specified '
                            '({0}:{1})'.format(pdbid, i))
                continue
            else:
                ch = list(polymers)[0]
        poly = polymers.get(ch, Polymer(ch))
        polymers[ch] = poly
        if poly.modified is None:
            poly.modified = []
        poly.modified.append(
            (line[12:15].strip(), line[18:22].strip() + line[22].strip(),
             line[24:27].strip(), line[29:70].strip()))

    for i, line in lines['SEQADV']:
        i += 1
        ch = line[16]
        if ch == ' ':
            if len(polymers) != 1:
                LOGGER.warn('SEQADV chain identifier is not specified '
                            '({0}:{1})'.format(pdbid, i))
                continue
            else:
                ch = list(polymers)[0]
        poly = polymers.get(ch, Polymer(ch))
        polymers[ch] = poly
        dbabbr = line[24:28].strip()
        resname = line[12:15].strip()
        try:
            resnum = int(line[18:22].strip())
        except:
            #LOGGER.warn('SEQADV for chain {2}: failed to parse PDB sequence '
            #            'number ({0}:{1})'.format(pdbid, i, ch))
            continue
        icode = line[22].strip()
        try:
            dbnum = int(line[43:48].strip())
        except:
            #LOGGER.warn('SEQADV for chain {2}: failed to parse database '
            #            'sequence number ({0}:{1})'.format(pdbid, i, ch))
            continue

        comment = line[49:70].strip()
        match = False
        for dbref in poly.dbrefs:
            if not dbref.first[0] <= resnum <= dbref.last[0]:
                continue
            match = True
            if dbref.dbabbr != dbabbr:
                LOGGER.warn('SEQADV for chain {2}: reference database '
                            'mismatch, expected {3} parsed {4} '
                            '({0}:{1})'.format(pdbid, i,
                                               ch, repr(dbref.dbabbr),
                                               repr(dbabbr)))
                continue
            dbacc = line[29:38].strip()
            if dbref.accession[:9] != dbacc[:9]:
                LOGGER.warn('SEQADV for chain {2}: accession code '
                            'mismatch, expected {3} parsed {4} '
                            '({0}:{1})'.format(pdbid, i, ch,
                                               repr(dbref.accession),
                                               repr(dbacc)))
                continue
            dbref.diff.append((resname, resnum, icode, dbnum, dbnum, comment))
        if not match:
            LOGGER.warn('SEQADV for chain {2}: database sequence reference '
                        'not found ({0}:{1})'.format(pdbid, i, ch))
            continue

    string = ' '.join([line[10:].strip() for i, line in lines['COMPND']])
    if string.startswith('MOL_ID'):
        dict_ = {}
        for molecule in string[6:].split('MOL_ID'):
            dict_.clear()
            for token in molecule.split(';'):
                token = token.strip()
                if not token:
                    continue
                items = token.split(':', 1)
                if len(items) == 2:
                    key, value = items
                    dict_[key.strip()] = value.strip()

            chains = dict_.pop('CHAIN', '').strip()

            if not chains:
                continue
            for ch in chains.split(','):
                ch = ch.strip()
                poly = polymers.get(ch, Polymer(ch))
                polymers[ch] = poly
                poly.name = dict_.get('MOLECULE', '')

                poly.fragment = dict_.get('FRAGMENT', '')

                poly.comments = dict_.get('OTHER_DETAILS', '')

                val = dict_.get('SYNONYM', '')
                poly.synonyms = [s.strip()
                                 for s in val.split(',')] if val else []

                val = dict_.get('EC', '')
                poly.ec = [s.strip() for s in val.split(',')] if val else []

                poly.engineered = dict_.get('ENGINEERED', '') == 'YES'
                poly.mutation = dict_.get('MUTATION', '') == 'YES'

    return list(polymers.values())
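
_getPolymers consumes the parser's internal line index, so it is normally reached through the public header API rather than called directly. A minimal sketch, assuming parsePDB(..., header=True) exposes the resulting Polymer objects under the 'polymers' key; the identifier is illustrative.

from prody import parsePDB

atoms, header = parsePDB('1mkp', header=True)   # illustrative PDB identifier
for poly in header['polymers']:
    print(poly.chid, poly.name)
    for dbref in poly.dbrefs:
        print('   ', dbref.database, dbref.accession, dbref.first, dbref.last)
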
Example #16
0
def fetchPDBviaHTTP(*pdb, **kwargs):
    """Retrieve PDB file(s) for specified *pdb* identifier(s) and return
    path(s).  Downloaded files will be stored in local PDB folder, if one
    is set using :meth:`.pathPDBFolder`, and copied into *folder*, if
    specified by the user.  If no destination folder is specified, files
    will be saved in the current working directory.  If *compressed* is
    **False**, decompressed files will be copied into *folder*."""

    if kwargs.get('check', True):
        identifiers = checkIdentifiers(*pdb)
    else:
        identifiers = list(pdb)

    output_folder = kwargs.pop('folder', None)
    compressed = bool(kwargs.pop('compressed', True))

    extension = '.pdb'
    local_folder = pathPDBFolder()
    if local_folder:
        local_folder, is_divided = local_folder
        if is_divided:
            getPath = lambda pdb: join(makePath(join(local_folder, pdb[1:3])),
                                       'pdb' + pdb + '.pdb.gz')
        else:
            getPath = lambda pdb: join(local_folder, pdb + '.pdb.gz')
        if output_folder is None:
            second = lambda filename, pdb: filename
        else:
            if compressed:
                second = lambda filename, pdb: (copyFile(filename,
                            join(output_folder, pdb + extension + '.gz')))
            else:
                second = lambda filename, pdb: gunzip(filename,
                            join(output_folder, pdb + extension))

    else:
        if output_folder is None:
            output_folder = getcwd()
        if compressed:
            getPath = lambda pdb: join(output_folder, pdb + extension + '.gz')
            second = lambda filename, pdb: filename
        else:
            getPath = lambda pdb: join(output_folder, pdb + extension)
            second = lambda filename, pdb: gunzip(getPath(pdb), getPath(pdb))


    getURL = WWPDB_HTTP_URL[wwPDBServer() or 'us']

    success = 0
    failure = 0
    filenames = []
    for pdb in identifiers:
        if pdb is None:
            filenames.append(None)
            continue
        try:
            handle = openURL(getURL(pdb))
        except Exception as err:
            LOGGER.warn('{0} download failed ({1}).'.format(pdb, str(err)))
            failure += 1
            filenames.append(None)
        else:
            data = handle.read()
            if len(data):
                filename = getPath(pdb)

                with open(filename, 'w+b') as pdbfile:
                    pdbfile.write(data)

                filename = normpath(relpath(second(filename, pdb)))
                LOGGER.debug('{0} downloaded ({1})'
                             .format(pdb, sympath(filename)))
                success += 1
                filenames.append(filename)
            else:
                LOGGER.warn('{0} download failed, reason unknown.'
                            .format(pdb))
                failure += 1
                filenames.append(None)
    LOGGER.debug('PDB download via HTTP completed ({0} downloaded, '
                 '{1} failed).'.format(success, failure))
    if len(identifiers) == 1:
        return filenames[0]
    else:
        return filenames
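
A minimal usage sketch, assuming fetchPDBviaHTTP is importable from the prody package and that a wwPDB server is configured (the code above falls back to 'us'); identifiers are illustrative.

import os
from prody import fetchPDBviaHTTP, pathPDBFolder

os.makedirs('pdb_cache', exist_ok=True)
pathPDBFolder('pdb_cache')                        # optional local cache (plain layout)
single = fetchPDBviaHTTP('1ubi')                  # one identifier -> one path (or None)
several = fetchPDBviaHTTP('1ubi', '2k39', folder='.', compressed=False)
print(single, several)
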
Example #17
0
def buildPDBEnsemble(atomics,
                     ref=None,
                     title='Unknown',
                     labels=None,
                     unmapped=None,
                     **kwargs):
    """Builds a :class:`.PDBEnsemble` from a given reference structure and a list of structures 
    (:class:`.Atomic` instances). Note that the reference should be included in the list as well.

    :arg atomics: a list of :class:`.Atomic` instances
    :type atomics: list

    :arg ref: reference structure or the index to the reference in *atomics*. If **None**,
        then the first item in *atomics* will be considered as the reference. If it is a 
        :class:`.PDBEnsemble` instance, then *atomics* will be appended to the existing ensemble.
        Default is **None**
    :type ref: int, :class:`.Chain`, :class:`.Selection`, or :class:`.AtomGroup`

    :arg title: the title of the ensemble
    :type title: str

    :arg labels: labels of the conformations
    :type labels: list

    :arg degeneracy: whether only the active coordinate set (**True**) or all the coordinate sets 
        (**False**) of each structure should be added to the ensemble. Default is **True**
    :type degeneracy: bool

    :arg occupancy: minimal occupancy of columns (range from 0 to 1). Columns whose occupancy
        is below this value will be trimmed
    :type occupancy: float

    :arg unmapped: labels of *atomics* that cannot be included in the ensemble. This is an 
        output argument
    :type unmapped: list

    :arg subset: a subset for selecting particular atoms from the input structures.
        Default is ``"all"``
    :type subset: str

    :arg superpose: if set to ``'iter'``, :func:`.PDBEnsemble.iterpose` will be used to 
        superpose the structures, otherwise conformations will be superposed with respect 
        to the reference specified by *ref* unless set to ``False``. Default is ``'iter'``
    :type superpose: str, bool
    """

    occupancy = kwargs.pop('occupancy', None)
    degeneracy = kwargs.pop('degeneracy', True)
    subset = str(kwargs.get('subset', 'all')).lower()
    superpose = kwargs.pop('superpose', 'iter')
    superpose = kwargs.pop('iterpose', superpose)
    debug = kwargs.pop('debug', {})

    if 'mapping_func' in kwargs:
        raise DeprecationWarning(
            'mapping_func is deprecated. Please see release notes for '
            'more details: http://prody.csb.pitt.edu/manual/release/v1.11_series.html'
        )
    start = time.time()

    if not isListLike(atomics):
        raise TypeError('atomics should be list-like')

    if len(atomics) == 1 and degeneracy is True:
        raise ValueError('atomics should have at least two items')

    if labels is not None:
        if len(labels) != len(atomics):
            raise TypeError('Labels and atomics must have the same length.')
    else:
        labels = []

        for atoms in atomics:
            if atoms is None:
                labels.append(None)
            else:
                labels.append(atoms.getTitle())

    if ref is None:
        target = atomics[0]
    elif isinstance(ref, Integral):
        target = atomics[ref]
    elif isinstance(ref, PDBEnsemble):
        target = ref._atoms
    else:
        target = ref

    # initialize a PDBEnsemble with reference atoms and coordinates
    isrefset = False
    if isinstance(ref, PDBEnsemble):
        ensemble = ref
    else:
        # select the subset of reference beforehand for the sake of efficiency
        if subset != 'all':
            target = target.select(subset)
        ensemble = PDBEnsemble(title)
        if isinstance(target, Atomic):
            ensemble.setAtoms(target)
            ensemble.setCoords(target.getCoords())
            isrefset = True
        else:
            ensemble._n_atoms = len(target)
            isrefset = False

    # build the ensemble
    if unmapped is None: unmapped = []

    LOGGER.progress('Building the ensemble...', len(atomics),
                    '_prody_buildPDBEnsemble')
    for i, atoms in enumerate(atomics):
        if atoms is None:
            unmapped.append(labels[i])
            continue

        LOGGER.update(i,
                      'Mapping %s to the reference...' % atoms.getTitle(),
                      label='_prody_buildPDBEnsemble')
        try:
            atoms.getHierView()
        except AttributeError:
            raise TypeError(
                'atomics must be a list of instances with a getHierView method'
            )

        if subset != 'all':
            atoms = atoms.select(subset)

        # find the mapping of chains of atoms to those of target
        debug[labels[i]] = {}
        atommaps = alignChains(atoms, target, debug=debug[labels[i]], **kwargs)

        if len(atommaps) == 0:
            unmapped.append(labels[i])
            continue

        # add the atommaps to the ensemble
        for atommap in atommaps:
            lbl = pystr(labels[i])
            if len(atommaps) > 1:
                chids = np.unique(atommap.getChids())
                strchids = ''.join(chids)
                lbl += '_%s' % strchids
            ensemble.addCoordset(atommap,
                                 weights=atommap.getFlags('mapped'),
                                 label=lbl,
                                 degeneracy=degeneracy)

            if not isrefset:
                ensemble.setCoords(atommap.getCoords())
                isrefset = True

    LOGGER.finish()

    if occupancy is not None:
        ensemble = trimPDBEnsemble(ensemble, occupancy=occupancy)

    if superpose == 'iter':
        ensemble.iterpose()
    elif superpose is not False:
        ensemble.superpose()

    LOGGER.info('Ensemble ({0} conformations) was built in {1:.2f}s.'.format(
        ensemble.numConfs(),
        time.time() - start))

    if unmapped:
        LOGGER.warn('{0} structures cannot be mapped.'.format(len(unmapped)))
    return ensemble
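
A short sketch of building an ensemble from a few parsed structures; the identifiers and keyword values are illustrative and only exercise the options documented above.

from prody import parsePDB, buildPDBEnsemble

structures = [parsePDB(pdb) for pdb in ('2k39', '1ubi', '1aar')]  # illustrative IDs
unmapped = []
ensemble = buildPDBEnsemble(structures, ref=0, title='ubiquitin',
                            subset='ca', occupancy=0.9, unmapped=unmapped)
print(ensemble.numConfs(), ensemble.numAtoms(), unmapped)
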
Example #18
0
def assignSecstr(header, atoms, coil=False):
    """Assign secondary structure from *header* dictionary to *atoms*.
    *header* must be a dictionary parsed using the :func:`.parsePDB`.
    *atoms* may be an instance of :class:`.AtomGroup`, :class:`.Selection`,
    :class:`.Chain` or :class:`.Residue`.  ProDy can be configured to
    automatically parse and assign secondary structure information using
    ``confProDy(auto_secondary=True)`` command.  See also :func:`.confProDy`
    function.

    The Dictionary of Protein Secondary Structure, in short DSSP, type
    single letter code assignments are used:

      * **G** = 3-turn helix (3₁₀ helix). Min length 3 residues.
      * **H** = 4-turn helix (alpha helix). Min length 4 residues.
      * **I** = 5-turn helix (pi helix). Min length 5 residues.
      * **T** = hydrogen bonded turn (3, 4 or 5 turn)
      * **E** = extended strand in parallel and/or anti-parallel
        beta-sheet conformation. Min length 2 residues.
      * **B** = residue in isolated beta-bridge (single pair beta-sheet
        hydrogen bond formation)
      * **S** = bend (the only non-hydrogen-bond based assignment).
      * **C** = residues not in one of above conformations.


    See http://en.wikipedia.org/wiki/Protein_secondary_structure#The_DSSP_code
    for more details.

    The following PDB helix classes are omitted (class numbers given in
    parentheses):

      * Right-handed omega (2)
      * Right-handed gamma (4)
      * Left-handed alpha (6)
      * Left-handed omega (7)
      * Left-handed gamma (8)
      * 2 - 7 ribbon/helix (9)
      * Polyproline (10)

    Secondary structures are assigned to all atoms in a residue.  Amino acid
    residues without any secondary structure assignments in the header
    section will be assigned coil (C) conformation.  This can be prevented
    by passing ``coil=False`` argument."""

    if not isinstance(header, dict):
        raise TypeError('header must be a dictionary')
    helix = header.get('helix', {})
    sheet = header.get('sheet', {})
    if len(helix) == 0 and len(sheet) == 0:
        LOGGER.warn('header does not contain secondary structure data')
        return atoms

    ssa = atoms.getSecstrs()
    if ssa is None:
        if isinstance(atoms, AtomGroup):
            ag = atoms
        else:
            ag = atoms.getAtomGroup()
        ag.setSecstrs(np.zeros(ag.numAtoms(),
                               ATOMIC_FIELDS['secondary'].dtype))
        ag.setSecids(np.zeros(ag.numAtoms(), ATOMIC_FIELDS['secid'].dtype))
        ag.setSecclasses(
            np.zeros(ag.numAtoms(), ATOMIC_FIELDS['secclass'].dtype))
        ag.setSecindices(
            np.zeros(ag.numAtoms(), ATOMIC_FIELDS['secindex'].dtype))

    prot = atoms.select('protein')
    if prot is not None:
        prot.setSecstrs('C')
    hierview = atoms.getHierView()
    count = 0
    getResidue = hierview.getResidue
    for key, value in helix.items():  # PY3K: OK
        res = getResidue(*key)
        if res is None:
            continue
        res.setSecids(value[2])
        res.setSecclasses(value[0])
        res.setSecindices(value[1])
        res.setSecstrs(mapHelix[value[0]])

        count += 1
    for key, value in sheet.items():  # PY3K: OK
        res = getResidue(*key)
        if res is None:
            continue
        res.setSecids(value[2])
        res.setSecclasses(value[0])
        res.setSecindices(value[1])
        res.setSecstrs('E')
        count += 1

    LOGGER.info(
        'Secondary structures were assigned to {0} residues.'.format(count))

    return atoms
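
A minimal sketch of assigning header secondary structure and selecting by it; the identifier is illustrative, and the 'secstr H' selection assumes the standard ProDy selection keyword for this field.

from prody import parsePDB, assignSecstr

atoms, header = parsePDB('1mkp', header=True)   # illustrative identifier
atoms = assignSecstr(header, atoms)
helices = atoms.select('secstr H')              # alpha-helical atoms, if any
print(helices.numAtoms() if helices is not None else 0)
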