Example #1
File: clustenm.py  Project: SHZ66/ProDy
    def _fix(self, atoms):
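        """Run PDBFixer on the given atoms in memory and store the fixed
        structure, topology, and positions on the instance."""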

        try:
            from pdbfixer import PDBFixer
            from openmm.app import PDBFile
        except ImportError:
            raise ImportError('Please install PDBFixer and OpenMM 7.6 in order to use ClustENM.')

        stream = createStringIO()
        title = atoms.getTitle()
        writePDBStream(stream, atoms)
        stream.seek(0)
        fixed = PDBFixer(pdbfile=stream)
        stream.close()

        fixed.missingResidues = {}
        fixed.findNonstandardResidues()
        fixed.replaceNonstandardResidues()
        fixed.removeHeterogens(False)
        fixed.findMissingAtoms()
        fixed.addMissingAtoms()
        fixed.addMissingHydrogens(self._ph)

        stream = createStringIO()
        PDBFile.writeFile(fixed.topology, fixed.positions,
                          stream, keepIds=True)
        stream.seek(0)
        self._atoms = parsePDBStream(stream)
        self._atoms.setTitle(title)
        stream.close()

        self._topology = fixed.topology
        self._positions = fixed.positions
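
For reference, a minimal standalone sketch of the same in-memory round trip, using only PDBFixer and OpenMM. The input file name and the pH value are placeholders, not taken from the original example.

import io
from pdbfixer import PDBFixer
from openmm.app import PDBFile

# PDBFixer also accepts an open file-like object via the pdbfile argument.
with open('input.pdb') as f:          # placeholder input file
    fixer = PDBFixer(pdbfile=f)

fixer.missingResidues = {}            # do not model unresolved residues
fixer.findNonstandardResidues()
fixer.replaceNonstandardResidues()
fixer.removeHeterogens(False)         # drop heterogens, water included
fixer.findMissingAtoms()
fixer.addMissingAtoms()
fixer.addMissingHydrogens(7.0)        # placeholder pH

out = io.StringIO()
PDBFile.writeFile(fixer.topology, fixer.positions, out, keepIds=True)
fixed_pdb_text = out.getvalue()       # fixed structure, still entirely in memory
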
Example #2
from pdbfixer import PDBFixer
from openmm.app import PDBFile


def pdbfix_protein(input_pdb_path,
                   output_pdb_path,
                   find_missing_residues=True,
                   keep_water=False,
                   ph=None):
    """Run PDBFixer on the input PDB file.

    Heterogen atoms are always removed.

    Parameters
    ----------
    input_pdb_path : str
        The PDB to fix.
    output_pdb_path : str
        The path to the output PDB file.
    find_missing_residues : bool, optional
        If True, PDBFixer will try to model the unresolved residues
        that appear in the amino acid sequence (default is True).
    keep_water : bool, optional
        If True, water molecules are not stripped (default is False).
    ph : float or None, optional
        If not None, hydrogen atoms will be added at this pH.

    """
    fixer = PDBFixer(filename=input_pdb_path)
    if find_missing_residues:
        fixer.findMissingResidues()
    else:
        fixer.missingResidues = {}
    fixer.findNonstandardResidues()
    fixer.replaceNonstandardResidues()
    fixer.removeHeterogens(keep_water)
    fixer.findMissingAtoms()
    fixer.addMissingAtoms()
    if ph is not None:
        fixer.addMissingHydrogens(ph)

    # print(fixer.nonstandardResidues)
    # print(fixer.missingAtoms)
    # print(fixer.missingTerminals)

    with open(output_pdb_path, 'w') as f:
        PDBFile.writeFile(fixer.topology, fixer.positions, f)
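
A hypothetical call, with placeholder file names:

pdbfix_protein('raw.pdb', 'raw_fixed.pdb',
               find_missing_residues=False,  # leave sequence gaps unmodelled
               keep_water=True,              # retain ordered waters
               ph=7.4)                       # add hydrogens at physiological pH
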
Example #3
from simtk.openmm.app import *
from simtk.openmm import *
from simtk.unit import *
from sys import stdout
from pdbfixer import PDBFixer
import numpy as np

fixer = PDBFixer(filename='rna_ac.pdb')
#fixer.findMissingResidues()
fixer.missingResidues = {}
# Pull out and save the coordinates of the desired ligand.
#fixer.findMissingAtoms()
#fixer.addMissingAtoms()
#fixer.addMissingHydrogens(7.0)
mnx = min([p[0] for p in fixer.positions])._value
mny = min([p[1] for p in fixer.positions])._value
mnz = min([p[2] for p in fixer.positions])._value
fixer.positions._value = [
    p - Vec3(mnx, mny, mnz) for p in fixer.positions._value
]
maxSize = max(
    max((pos[i]
         for pos in fixer.positions)) - min((pos[i]
                                             for pos in fixer.positions))
    for i in range(3))
boxSize = maxSize * Vec3(1, 1, 1)
boxVectors = (maxSize * Vec3(1, 0, 0), maxSize * Vec3(0, 1, 0),
              maxSize * Vec3(0, 0, 1))

#
# This is basically the pdbfixer code, but without the amber lines.
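
The snippet is truncated here; per the trailing comment, the remainder re-implements PDBFixer's solvation step by hand. For comparison, a sketch of the stock API call it appears to replace (the snippet itself never makes this call; the ion species shown are PDBFixer's defaults and the ionic strength is an arbitrary illustrative value):

fixer.addSolvent(boxSize=boxSize, positiveIon='Na+',
                 negativeIon='Cl-', ionicStrength=0.1 * molar)
PDBFile.writeFile(fixer.topology, fixer.positions,
                  open('rna_ac_solvated.pdb', 'w'))
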
Example #4
from simtk.openmm.app import PDBFile
from simtk import unit
from sys import stdout
from pdbfixer import PDBFixer
import mdtraj as md

# clean up the original PDB file and add missing residues and heavy atoms
fixer = PDBFixer('pdb4h12.ent')

fixer.findMissingResidues()
# only add missing residues in the middle of the chain, do not add terminal ones
chains = list(fixer.topology.chains())
keys = fixer.missingResidues.keys()
missingResidues = dict()
for key in keys:
    chain = chains[key[0]]
    if not (key[1] == 0 or key[1] == len(list(chain.residues()))):
        missingResidues[key] = fixer.missingResidues[key]
fixer.missingResidues = missingResidues

fixer.findMissingAtoms()
fixer.addMissingAtoms()

PDBFile.writeFile(fixer.topology, fixer.positions, open('4h12_fixed.pdb', 'w'))

# keep only protein and zinc ions
traj = md.load('4h12_fixed.pdb')
traj = traj.atom_slice(traj.top.select('(protein and not resname SAH) or resname ZN'))

# implement changes necessary for the use of the dummy atom Zn2+ model
# change residue name of the zincs from ZN to ZNB, and atom names from ZN to Zn
for residue in traj.top.chain(1).residues:
    residue.name = 'ZNB'
for atom in traj.top.chain(1).atoms:
    atom.name = 'Zn'  # rename ZN -> Zn, as described in the comment above
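
The snippet ends after the renaming loop; a plausible next step (not part of the source) would be to write the edited system back out with MDTraj:

traj.save_pdb('4h12_zn_dummy.pdb')  # hypothetical output file name
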
Example #5
def process_pdb(path,
                corr_path,
                chain_id,
                max_atoms,
                gsd_file,
                embedding_dicts,
                NN,
                nlist_model,
                keep_residues=[-1, 1],
                debug=False,
                units=unit.nanometer,
                frame_number=3,
                model_index=0,
                log_file=None,
                shiftx_style=False):

    global MA_LOST_FRAGS
    if shiftx_style:
        frame_number = 1
    # load pdb
    pdb = app.PDBFile(path)

    # load cs sets
    peak_data, sequence_map, peak_seq = process_corr(corr_path, debug,
                                                     shiftx_style)

    result = []
    # check for weird/null chain
    if chain_id == '_':
        chain_id = list(pdb.topology.residues())[0].chain.id[0]
    # sometimes chains have extra characters (why?)
    residues = list(
        filter(lambda r: r.chain.id[0] == chain_id, pdb.topology.residues()))
    if len(residues) == 0:
        if debug:
            raise ValueError('Failed to find requested chain ', chain_id)

    pdb_offset, seq_offset = None, None

    # from pdb residue index to our aligned residue index
    residue_lookup = {}
    # bonded neighbor mask
    nlist_mask = None
    peak_count = 0
    # select a random set of frames for generating data without replacement
    frame_choices = random.sample(range(0, pdb.getNumFrames()),
                                  k=min(pdb.getNumFrames(), frame_number))
    for fi in frame_choices:
        peak_successes = set()
        # clean up individual frame
        frame = pdb.getPositions(frame=fi)
        # have to fix at each frame since inserted atoms may change
        # fix missing residues/atoms
        fixer = PDBFixer(filename=path)
        # overwrite positions with frame positions
        fixer.positions = frame
        # we want to add missing atoms,
        # but not replace missing residue. We'd
        # rather just ignore those
        fixer.findMissingResidues()
        # remove the missing residues
        fixer.missingResidues = {}
        # remove water!
        fixer.removeHeterogens(False)
        if not shiftx_style:
            fixer.findMissingAtoms()
            fixer.findNonstandardResidues()
            fixer.replaceNonstandardResidues()
            fixer.addMissingAtoms()
            fixer.addMissingHydrogens(7.0)
        # get new positions
        frame = fixer.positions
        num_atoms = len(frame)
        # remake residue list each time so they have correct atom ids
        residues = list(
            filter(lambda r: r.chain.id[0] == chain_id,
                   fixer.topology.residues()))
        if num_atoms > 20000:
            MA_LOST_FRAGS += len(residues)
            if debug:
                print(
                    'Exceeded number of atoms for building nlist (change this if you have big GPU memory) in frame {} in pdb {}'
                    .format(fi, path))
            break
        # check alignment once
        if pdb_offset is None:
            # create sequence from residues
            pdb_seq = ['XXX'] * max([int(r.id) + 1 for r in residues])
            for r in residues:
                rid = int(r.id)
                if rid >= 0:
                    pdb_seq[int(r.id)] = r.name
            if debug:
                print('pdb_seq', pdb_seq)
                print('peak_seq', peak_seq)
            pdb_offset, seq_offset = align(pdb_seq, peak_seq, debug)
            # TODO: is forcing pdb_offset to 0 below always correct? Maybe it's ok.
            pdb_offset = 0
            if debug:
                print('pdb_offset', pdb_offset)
                print('seq_offset', seq_offset)
                print(sequence_map)
                # now check alignment - rarely perfect
                saw_one = False
                aligned = 0
                for i in range(len(residues)):
                    segid = int(residues[i].id) + pdb_offset
                    saw_one = pdb_seq[segid] == residues[i].name
                    if not saw_one:
                        print('Mismatch (A) at position {} ({}). {} != {}'.
                              format(segid, residues[i].id, pdb_seq[segid],
                                     residues[i].name))
                        continue
                    if segid + seq_offset in sequence_map:
                        peakid = sequence_map[segid + seq_offset]
                        print(segid, segid + seq_offset, len(pdb_seq),
                              len(peak_seq))
                        saw_one = pdb_seq[segid] == peak_seq[segid +
                                                             seq_offset]
                        if not saw_one:
                            print(
                                'Mismatch (B) at position {}. pdb seq: {}, peak seq: {}'
                                .format(segid, peak_seq[segid + seq_offset],
                                        pdb_seq[peakid]))
                            continue
                        saw_one = peak_data[peakid]['name'] == residues[i].name
                        if not saw_one:
                            print(
                                'Mismatch (C) at position {}. peak seq: {}, peak data: {}, residue: {}'
                                .format(segid, i, peak_seq[segid + seq_offset],
                                        peak_data[peakid]['name'],
                                        residues[i].name))
                            continue
                        aligned += 1
                if aligned < 5:
                    raise ValueError(
                        'Found fewer than 5 aligned residues, very unusual'
                    )

            # create residue look-up from atom index
            for i, r in enumerate(residues):
                for a in r.atoms():
                    residue_lookup[a.index] = i
            # This alignment will be checked as we compare shifts against the pdb
        # get neighbor list for frame
        np_pos = np.array([v.value_in_unit(units) for v in frame])
        frame_nlist = nlist_model(np_pos)
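        # frame_nlist[i, k, 0] is the distance from atom i to its k-th neighbour;
        # frame_nlist[i, k, 1] is that neighbour's atom index (both used below)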

        for ri in range(len(residues)):
            # we build up fragment by getting residues around us, both in chain
            # and those within a certain distance of us
            rmin = max(0, ri + keep_residues[0])
            # have to +1 here (and not in range) to get min to work :)
            rmax = min(len(residues), ri + keep_residues[1] + 1)
            # do we have any residues to consider?
            success = rmax - rmin > 0

            consider = set(range(rmin, rmax))

            # Used to indicate an atom should be included from a different residue
            marked = [False for _ in range(len(frame))]

            # now grab spatial neighbor residues
            # NOTE: I checked this by hand a lot
            # Believe this code.
            for a in residues[ri].atoms():
                for ni in range(NN):
                    j = int(frame_nlist[a.index, ni, 1])
                    try:
                        consider.add(residue_lookup[j])
                        marked[j] = True
                    except KeyError as e:
                        success = False
                        if debug:
                            print(
                                'Neighboring residue in different chain, skipping'
                            )
                        break
            atoms = np.zeros((max_atoms), dtype=np.int64)
            # we will put dummy atom at end to keep bond counts the same by bonding to it
            # Z-DISABLED
            #atoms[-1] = embedding_dicts['atom']['Z']
            mask = np.zeros((max_atoms), dtype=np.float64)
            bonds = np.zeros((max_atoms, max_atoms), dtype=np.int64)
            # nlist:
            # :,:,0 -> distance
            # :,:,1 -> neighbor index
            # :,:,2 -> bond count
            nlist = np.zeros((max_atoms, NEIGHBOR_NUMBER, 3), dtype=np.float64)
            positions = np.zeros((max_atoms, 3), dtype=np.float64)
            peaks = np.zeros((max_atoms), dtype=np.float64)
            names = np.zeros((max_atoms), dtype=np.int64)
            # going from pdb atom index to index in these data structures
            rmap = dict()
            index = 0
            # check our two conditions that could have made this false: there are residues and
            # we didn't have off-chain spatial neighboring residues
            if not success:
                continue
            for rj in consider:
                residue = residues[rj]
                # use the alignment result to get offset
                segid = int(residue.id) + pdb_offset
                if segid + seq_offset not in sequence_map:
                    if debug:
                        print('Could not find residue index', rj, ': ',
                              residue, 'in the sequence map. Its index is',
                              segid + seq_offset, 'ri: ', ri)
                        print('We are considering', consider)
                    success = False
                    break
                peak_id = sequence_map[segid + seq_offset]
                #peak_id = segid
                if peak_id >= len(peak_data):
                    success = False
                    if debug:
                        print('peak id is outside of peak range')
                    break
                # only check for residue we actually care about
                if ri == rj and residue.name != peak_data[peak_id]['name']:
                    if debug:
                        print('Mismatch between residue ', ri, rj, peak_id,
                              residue, segid, peak_data[peak_id], path,
                              corr_path, chain_id)
                    success = False
                    break
                for atom in residue.atoms():
                    # Make sure atom is in residue or neighbor of residue atom
                    if ri != rj and not marked[atom.index]:
                        continue
                    mask[index] = float(ri == rj)
                    atom_name = residue.name + '-' + atom.name
                    if atom_name not in embedding_dicts['name']:
                        embedding_dicts['name'][atom_name] = len(
                            embedding_dicts['name'])
                    names[index] = embedding_dicts['name'][atom_name]

                    if atom.element.symbol not in embedding_dicts['atom']:
                        if debug:
                            print('Could not identify atom',
                                  atom.element.symbol)
                        success = False
                        break
                    atoms[index] = embedding_dicts['atom'][atom.element.symbol]
                    positions[index] = np_pos[atom.index, :]
                    rmap[atom.index] = index
                    peaks[index] = 0
                    if mask[index]:
                        if atom.name[:3] in peak_data[peak_id]:
                            peaks[index] = peak_data[peak_id][atom.name[:3]]
                            peak_count += 1
                            peak_successes.add(peak_id)
                        else:
                            mask[index] = 0
                    index += 1
                    # Z-DISABLED
                    # -1 for dummy atom which is stored at end
                    if index == max_atoms - 1:  #2:
                        MA_LOST_FRAGS += 1
                        if debug:
                            print('Not enough space for all atoms in ri', ri)
                        success = False
                        break
                if ri == rj and sum(mask) == 0:
                    if debug:
                        print('Warning found no peaks for', ri, rj, residue,
                              peak_data[peak_id])
                    success = False
                if not success:
                    break
            if not success:
                continue
            # do this after so our reverse mapping is complete
            for rj in consider:
                residue = residues[rj]
                for b in residue.bonds():
                    # set bonds
                    try:
                        bonds[rmap[b.atom1.index], rmap[b.atom2.index]] = 1
                        bonds[rmap[b.atom2.index], rmap[b.atom1.index]] = 1
                    except KeyError:
                        # for bonds that cross residue
                        pass
            for rj in consider:
                residue = residues[rj]
                for a in residue.atoms():
                    # Make sure atom is in residue or neighbor of residue atom
                    if ri != rj and not marked[a.index]:
                        continue
                    index = rmap[a.index]
                    # convert to local indices and filter neighbors
                    n_index = 0
                    for ni in range(NN):
                        if frame_nlist[a.index, ni, 0] > 50.0:
                            # large distances are sentinels for things
                            # like self neighbors
                            continue
                        try:
                            j = rmap[int(frame_nlist[a.index, ni, 1])]
                        except KeyError:
                            # either we couldn't find a neighbor on the root residue (which is bad)
                            # or just one of the neighbors is not on a considered residue.
                            if rj == ri:
                                success = False
                                if debug:
                                    print('Could not find all neighbors',
                                          int(frame_nlist[a.index, ni, 1]),
                                          consider)
                                break
                            # Z-DISABLED
                            #j = max_atoms - 1 # point to dummy atom
                            continue
                        # mark as not a neighbor if out of molecule (only for non-subject nlists)
                        if False and j == max_atoms - 1:
                            #set index
                            nlist[index, n_index, 1] = j
                            # set distance
                            nlist[index, n_index, 0] = frame_nlist[a.index, ni,
                                                                   0]
                            #set type
                            nlist[index, n_index,
                                  2] = embedding_dicts['nlist']['none']
                            n_index += 1
                        # a 0 -> non-bonded
                        elif bonds[index, j] == 0:
                            #set index
                            nlist[index, n_index, 1] = j
                            # set distance
                            nlist[index, n_index, 0] = frame_nlist[a.index, ni,
                                                                   0]
                            #set type
                            nlist[index, n_index,
                                  2] = embedding_dicts['nlist']['nonbonded']
                            n_index += 1
                        # single bonded
                        else:
                            #set index
                            nlist[index, n_index, 1] = j
                            # set distance
                            nlist[index, n_index, 0] = frame_nlist[a.index, ni,
                                                                   0]
                            #set type
                            nlist[index, n_index,
                                  2] = embedding_dicts['nlist'][1]
                            n_index += 1
                        if n_index == NEIGHBOR_NUMBER:
                            break
                    # how did we do on peaks
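                    # the block below is deliberately disabled with `if False`; when
                    # enabled it prints a bonded/non-bonded neighbour summary and exits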
                    if False and (peaks[index] > 0 and peaks[index] < 25):
                        nonbonded_count = np.sum(
                            nlist[index, :,
                                  2] == embedding_dicts['nlist']['nonbonded'])
                        bonded_count = np.sum(
                            nlist[index, :, 2] == embedding_dicts['nlist'][1])
                        print(
                            'neighbor summary: non-bonded: {}, bonded: {}, total: {}'
                            .format(nonbonded_count, bonded_count,
                                    NEIGHBOR_NUMBER))
                        print(nlist[index, :, :])
                        exit()
            if not success:
                if debug:
                    raise RuntimeError()
                continue
            if gsd_file is not None:
                snapshot = write_record_traj(
                    positions, atoms, mask, nlist, peaks,
                    embedding_dicts['class'][residues[ri].name], names,
                    embedding_dicts)
                snapshot.configuration.step = len(gsd_file)
                gsd_file.append(snapshot)
            result.append(
                make_tfrecord(atoms,
                              mask,
                              nlist,
                              peaks,
                              embedding_dicts['class'][residues[ri].name],
                              names,
                              indices=np.array(
                                  [model_index, fi,
                                   int(residues[ri].id)],
                                  dtype=np.int64)))
            if log_file is not None:
                log_file.write('{} {} {} {} {} {} {} {}\n'.format(
                    path.split('/')[-1],
                    corr_path.split('/')[-1], chain_id, len(peak_successes),
                    len(gsd_file), model_index, fi, residues[ri].id))
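    # return the per-fragment records, the fraction of peak entries matched,
    # the number of fragments produced, and the total number of assigned peaks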
    return result, len(peak_successes) / len(peak_data), len(result), peak_count