def _fix(self, atoms):
    """Repair *atoms* with PDBFixer and store the repaired model on ``self``.

    The structure is round-tripped through in-memory PDB text: it is
    written to a string stream, loaded by PDBFixer, repaired, written
    back out with OpenMM's ``PDBFile`` and parsed again.  Missing
    residues are not rebuilt (``missingResidues`` is set to an empty
    dict); nonstandard residues are replaced, heterogens are removed
    with ``removeHeterogens(False)``, and missing atoms plus hydrogens
    (at ``self._ph``) are added.

    Sets ``self._atoms`` (carrying the title of the input),
    ``self._topology`` and ``self._positions``.

    Raises
    ------
    ImportError
        If PDBFixer or OpenMM cannot be imported.
    """
    try:
        from pdbfixer import PDBFixer
        from openmm.app import PDBFile
    except ImportError:
        raise ImportError('Please install PDBFixer and OpenMM 7.6 in order to use ClustENM.')

    original_title = atoms.getTitle()

    # Hand the structure to PDBFixer as PDB text held in memory.
    pdb_in = createStringIO()
    writePDBStream(pdb_in, atoms)
    pdb_in.seek(0)
    fixer = PDBFixer(pdbfile=pdb_in)
    pdb_in.close()

    # Repair pipeline; an empty mapping means no missing residues are added.
    fixer.missingResidues = {}
    fixer.findNonstandardResidues()
    fixer.replaceNonstandardResidues()
    fixer.removeHeterogens(False)
    fixer.findMissingAtoms()
    fixer.addMissingAtoms()
    fixer.addMissingHydrogens(self._ph)

    # Read the repaired model back in, again via in-memory PDB text.
    pdb_out = createStringIO()
    PDBFile.writeFile(fixer.topology, fixer.positions, pdb_out, keepIds=True)
    pdb_out.seek(0)
    repaired = parsePDBStream(pdb_out)
    repaired.setTitle(original_title)
    self._atoms = repaired
    pdb_out.close()

    self._topology = fixer.topology
    self._positions = fixer.positions
def pdbfix_protein(input_pdb_path, output_pdb_path, find_missing_residues=True,
                   keep_water=False, ph=None):
    """Clean a PDB file with PDBFixer and write the result to disk.

    Nonstandard residues are replaced, heterogens are removed and missing
    atoms are added.  Heterogen removal always happens; only water can
    optionally be retained.

    Parameters
    ----------
    input_pdb_path : str
        Path of the PDB file to repair.
    output_pdb_path : str
        Path the repaired PDB file is written to.
    find_missing_residues : bool, optional
        When True (the default) PDBFixer looks up unresolved residues so
        they can be modelled; when False no missing residues are added.
    keep_water : bool, optional
        When True water molecules survive heterogen removal (default is
        False).
    ph : float or None, optional
        pH at which hydrogens are added.  With None (the default) no
        hydrogens are added.
    """
    pdb_fixer = PDBFixer(filename=input_pdb_path)

    # An empty mapping tells PDBFixer there is nothing to rebuild.
    if not find_missing_residues:
        pdb_fixer.missingResidues = {}
    else:
        pdb_fixer.findMissingResidues()

    pdb_fixer.findNonstandardResidues()
    pdb_fixer.replaceNonstandardResidues()
    pdb_fixer.removeHeterogens(keep_water)

    pdb_fixer.findMissingAtoms()
    pdb_fixer.addMissingAtoms()

    add_hydrogens = ph is not None
    if add_hydrogens:
        pdb_fixer.addMissingHydrogens(ph)

    with open(output_pdb_path, 'w') as out_file:
        PDBFile.writeFile(pdb_fixer.topology, pdb_fixer.positions, out_file)
from simtk.openmm.app import *
from simtk.openmm import *
from simtk.unit import *
from sys import stdout
from pdbfixer import PDBFixer
import numpy as np

# Load the structure; no missing residues are modelled.  The calls to
# findMissingResidues / findMissingAtoms / addMissingAtoms /
# addMissingHydrogens(7.0) are not made in this script.
fixer = PDBFixer(filename='rna_ac.pdb')
fixer.missingResidues = {}

# Pull out and save the coordinates of the desired ligand.

# Smallest coordinate along each axis, as bare numbers.
mnx, mny, mnz = [
    min(p[axis] for p in fixer.positions)._value for axis in range(3)
]

# Translate every position so the minimum corner sits at the origin.
fixer.positions._value = [
    p - Vec3(mnx, mny, mnz) for p in fixer.positions._value
]

# Largest extent over the three axes defines a cubic box.
maxSize = max(
    max(pos[i] for pos in fixer.positions)
    - min(pos[i] for pos in fixer.positions)
    for i in range(3)
)
boxSize = maxSize * Vec3(1, 1, 1)
boxVectors = tuple(
    maxSize * Vec3(*axis_dir)
    for axis_dir in ((1, 0, 0), (0, 1, 0), (0, 0, 1))
)

#
# This is basically the pdbfixer code, but without the amber lines.
from simtk import unit from sys import stdout # clean up the original PDB file and add missing residues and heavy atoms fixer = PDBFixer('pdb4h12.ent') fixer.findMissingResidues() # only add missing residues in the middle of the chain, do not add terminal ones chains = list(fixer.topology.chains()) keys = fixer.missingResidues.keys() missingResidues = dict() for key in keys: chain = chains[key[0]] if not (key[1] == 0 or key[1] == len(list(chain.residues()))): missingResidues[key] = fixer.missingResidues[key] fixer.missingResidues = missingResidues fixer.findMissingAtoms() fixer.addMissingAtoms() PDBFile.writeFile(fixer.topology, fixer.positions, open('4h12_fixed.pdb', 'w')) # keep only protein and zinc ions traj = md.load('4h12_fixed.pdb') traj = traj.atom_slice(traj.top.select('(protein and not resname SAH) or resname ZN')) # implement changes necessary for the use of the dummy atom Zn2+ model # change residue name of the zincs from ZN to ZNB, and atom names from ZN to Zn for residue in traj.top.chain(1).residues: residue.name = 'ZNB' for atom in traj.top.chain(1).atoms:
def process_pdb(path, corr_path, chain_id, max_atoms, gsd_file,
                embedding_dicts, NN, nlist_model, keep_residues=(-1, 1),
                debug=False, units=unit.nanometer, frame_number=3,
                model_index=0, log_file=None, shiftx_style=False):
    """Turn one PDB file plus its peak assignments into per-residue records.

    For a random selection of frames the structure is cleaned with
    PDBFixer, a neighbor list is computed with ``nlist_model`` and, for
    every residue of the requested chain, a fragment is assembled from the
    residue, its sequence neighbors (``keep_residues``) and the residues of
    its spatial neighbors.  Each successful fragment is converted with
    ``make_tfrecord`` and, optionally, appended to ``gsd_file``.

    Parameters
    ----------
    path : str
        PDB file, read by both ``app.PDBFile`` and ``PDBFixer``.
    corr_path : str
        Peak file handed to ``process_corr``.
    chain_id : str
        Chain to process; ``'_'`` selects the chain of the first residue.
        Only the first character of a chain id is compared.
    max_atoms : int
        Size of the per-fragment arrays.  A fragment that fills all but the
        last slot is dropped.
    gsd_file : object or None
        If not None, snapshots from ``write_record_traj`` are appended.
    embedding_dicts : dict
        Lookup tables under the keys ``'name'``, ``'atom'``, ``'nlist'``
        and ``'class'``.  ``embedding_dicts['name']`` is extended in place
        with newly seen ``RESNAME-ATOMNAME`` strings.
    NN : int
        Number of neighbors per atom read from the frame neighbor list.
    nlist_model : callable
        Called with the ``(n_atoms, 3)`` position array; the result is
        indexed as ``[atom, neighbor, 0]`` (distance) and
        ``[atom, neighbor, 1]`` (neighbor atom index).
    keep_residues : sequence of two ints, optional
        Offsets (relative to the subject residue) of the first and last
        sequence neighbor included in the fragment.
    debug : bool, optional
        Print diagnostics and raise instead of silently skipping.
    units : unit, optional
        Unit the positions are converted to.
    frame_number : int, optional
        Maximum number of frames sampled (forced to 1 when
        ``shiftx_style`` is true).
    model_index : int, optional
        Stored in the ``indices`` of every record and in the log line.
    log_file : file-like or None, optional
        If not None, one line is written per emitted record.
    shiftx_style : bool, optional
        Passed to ``process_corr``; also skips adding missing
        atoms/hydrogens and replacing nonstandard residues.

    Returns
    -------
    tuple
        ``(records, fraction, len(records), peak_count)`` where
        ``fraction`` is the number of distinct peak entries matched in the
        last processed frame divided by ``len(peak_data)``.

    Raises
    ------
    ValueError
        In debug mode, when the chain is not found or fewer than five
        residues align.
    RuntimeError
        In debug mode, when a fragment's neighbor list cannot be built.
    """
    global MA_LOST_FRAGS
    if shiftx_style:
        frame_number = 1
    # load pdb
    pdb = app.PDBFile(path)
    # load cs sets
    peak_data, sequence_map, peak_seq = process_corr(
        corr_path, debug, shiftx_style)
    result = []
    # check for weird/null chain
    if chain_id == '_':
        chain_id = list(pdb.topology.residues())[0].chain.id[0]
    # sometimes chains have extra characters (why?)
    residues = [r for r in pdb.topology.residues()
                if r.chain.id[0] == chain_id]
    if not residues and debug:
        raise ValueError('Failed to find requested chain ', chain_id)
    pdb_offset, seq_offset = None, None
    # from pdb atom index to index of its residue in ``residues``
    residue_lookup = {}
    peak_count = 0
    # bound up front so the return statement is valid even if no frame runs
    peak_successes = set()
    # select a random set of frames for generating data without replacement
    n_frames = pdb.getNumFrames()
    frame_choices = random.sample(range(n_frames),
                                  k=min(n_frames, frame_number))
    for fi in frame_choices:
        peak_successes = set()
        # clean up individual frame
        frame = pdb.getPositions(frame=fi)
        # have to fix at each frame since inserted atoms may change
        # fix missing residues/atoms
        fixer = PDBFixer(filename=path)
        # overwrite positions with frame positions
        fixer.positions = frame
        # we want to add missing atoms, but not replace missing residues;
        # we'd rather just ignore those
        fixer.findMissingResidues()
        # discard the missing residues (PDBFixer keeps them in a dict)
        fixer.missingResidues = {}
        # remove water!
        fixer.removeHeterogens(False)
        if not shiftx_style:
            fixer.findMissingAtoms()
            fixer.findNonstandardResidues()
            fixer.replaceNonstandardResidues()
            fixer.addMissingAtoms()
            fixer.addMissingHydrogens(7.0)
        # get new positions
        frame = fixer.positions
        num_atoms = len(frame)
        # remake residue list each time so they have correct atom ids
        residues = [r for r in fixer.topology.residues()
                    if r.chain.id[0] == chain_id]
        if num_atoms > 20000:
            MA_LOST_FRAGS += len(residues)
            if debug:
                print(
                    'Exceeded number of atoms for building nlist (change this if you have big GPU memory) in frame {} in pdb {}'
                    .format(fi, path))
            break
        # check alignment once
        if pdb_offset is None:
            # create sequence from residues
            pdb_seq = ['XXX'] * max([int(r.id) + 1 for r in residues])
            for r in residues:
                rid = int(r.id)
                if rid >= 0:
                    pdb_seq[rid] = r.name
            if debug:
                print('pdb_seq', pdb_seq)
                print('peak_seq', peak_seq)
            pdb_offset, seq_offset = align(pdb_seq, peak_seq, debug)
            # TODO: the PDB offset from align() is deliberately discarded.
            # Maybe it's ok
            pdb_offset = 0
            if debug:
                print('pdb_offset', pdb_offset)
                print('seq_offset', seq_offset)
                print(sequence_map)
                # NOTE(review): the nesting of this alignment diagnostic was
                # reconstructed; it is assumed to be debug-only because it
                # prints unconditionally -- confirm.
                # now check alignment - rarely perfect
                aligned = 0
                for i, res in enumerate(residues):
                    segid = int(res.id) + pdb_offset
                    if pdb_seq[segid] != res.name:
                        print('Mismatch (A) at position {} ({}). {} != {}'.
                              format(segid, res.id, pdb_seq[segid], res.name))
                        continue
                    if segid + seq_offset in sequence_map:
                        peakid = sequence_map[segid + seq_offset]
                        print(segid, segid + seq_offset, len(pdb_seq),
                              len(peak_seq))
                        if pdb_seq[segid] != peak_seq[segid + seq_offset]:
                            # report exactly the two values that were compared
                            print(
                                'Mismatch (B) at position {}. pdb seq: {}, peak seq: {}'
                                .format(segid, pdb_seq[segid],
                                        peak_seq[segid + seq_offset]))
                            continue
                        if peak_data[peakid]['name'] != res.name:
                            print(
                                'Mismatch (C) at position {} ({}). peak seq: {}, peak data: {}, residue: {}'
                                .format(segid, i,
                                        peak_seq[segid + seq_offset],
                                        peak_data[peakid]['name'], res.name))
                            continue
                        aligned += 1
                if aligned < 5:
                    raise ValueError(
                        'Could not find more than 5 aligned residues, very unusual'
                    )
        # create residue look-up from atom index
        for i, r in enumerate(residues):
            for a in r.atoms():
                residue_lookup[a.index] = i
        # This alignment will be checked as we compare shifts against the pdb
        # get neighbor list for frame
        np_pos = np.array([v.value_in_unit(units) for v in frame])
        frame_nlist = nlist_model(np_pos)
        for ri in range(len(residues)):
            # we build up fragment by getting residues around us, both in
            # chain and those within a certain distance of us
            rmin = max(0, ri + keep_residues[0])
            # have to +1 here (and not in range) to get min to work :)
            rmax = min(len(residues), ri + keep_residues[1] + 1)
            # do we have any residues to consider?
            success = rmax - rmin > 0
            consider = set(range(rmin, rmax))
            # Used to indicate an atom should be included from a different
            # residue
            marked = [False] * num_atoms
            # now grab spatial neighbor residues
            for a in residues[ri].atoms():
                for ni in range(NN):
                    j = int(frame_nlist[a.index, ni, 1])
                    try:
                        consider.add(residue_lookup[j])
                        marked[j] = True
                    except KeyError:
                        success = False
                        if debug:
                            print(
                                'Neighboring residue in different chain, skipping'
                            )
                        break
            # check our two conditions that could have made this false:
            # there are residues and we didn't have off-chain spatial
            # neighboring residues
            if not success:
                continue
            atoms = np.zeros((max_atoms), dtype=np.int64)
            # The builtin float replaces the np.float alias, which was
            # removed in NumPy 1.24; the dtype is float64 either way.
            mask = np.zeros((max_atoms), dtype=float)
            bonds = np.zeros((max_atoms, max_atoms), dtype=np.int64)
            # nlist:
            # :,:,0 -> distance
            # :,:,1 -> neighbor index
            # :,:,2 -> bond count
            nlist = np.zeros((max_atoms, NEIGHBOR_NUMBER, 3), dtype=float)
            positions = np.zeros((max_atoms, 3), dtype=float)
            peaks = np.zeros((max_atoms), dtype=float)
            names = np.zeros((max_atoms), dtype=np.int64)
            # going from pdb atom index to index in these data structures
            rmap = dict()
            index = 0
            for rj in consider:
                residue = residues[rj]
                # use the alignment result to get offset
                segid = int(residue.id) + pdb_offset
                if segid + seq_offset not in sequence_map:
                    if debug:
                        print('Could not find residue index', rj, ': ',
                              residue, 'in the sequence map. Its index is',
                              segid + seq_offset, 'ri: ', ri)
                        print('We are considering', consider)
                    success = False
                    break
                peak_id = sequence_map[segid + seq_offset]
                if peak_id >= len(peak_data):
                    success = False
                    if debug:
                        print('peakd id is outside of peak range')
                    break
                # only check for residue we actually care about
                if ri == rj and residue.name != peak_data[peak_id]['name']:
                    if debug:
                        print('Mismatch between residue ', ri, rj, peak_id,
                              residue, segid, peak_data[peak_id], path,
                              corr_path, chain_id)
                    success = False
                    break
                for atom in residue.atoms():
                    # Make sure atom is in residue or neighbor of residue atom
                    if ri != rj and not marked[atom.index]:
                        continue
                    mask[index] = float(ri == rj)
                    atom_name = residue.name + '-' + atom.name
                    if atom_name not in embedding_dicts['name']:
                        embedding_dicts['name'][atom_name] = len(
                            embedding_dicts['name'])
                    names[index] = embedding_dicts['name'][atom_name]
                    if atom.element.symbol not in embedding_dicts['atom']:
                        if debug:
                            print('Could not identify atom',
                                  atom.element.symbol)
                        success = False
                        break
                    atoms[index] = embedding_dicts['atom'][atom.element.symbol]
                    positions[index] = np_pos[atom.index, :]
                    rmap[atom.index] = index
                    peaks[index] = 0
                    if mask[index]:
                        if atom.name[:3] in peak_data[peak_id]:
                            peaks[index] = peak_data[peak_id][atom.name[:3]]
                            peak_count += 1
                            peak_successes.add(peak_id)
                        else:
                            mask[index] = 0
                    index += 1
                    # the last slot is kept free (formerly a dummy atom)
                    if index == max_atoms - 1:
                        MA_LOST_FRAGS += 1
                        if debug:
                            print('Not enough space for all atoms in ri', ri)
                        success = False
                        break
                if ri == rj and mask.sum() == 0:
                    if debug:
                        print('Warning found no peaks for', ri, rj, residue,
                              peak_data[peak_id])
                    success = False
                if not success:
                    break
            if not success:
                continue
            # do this after so our reverse mapping is complete
            for rj in consider:
                for b in residues[rj].bonds():
                    # set bonds
                    try:
                        bonds[rmap[b.atom1.index], rmap[b.atom2.index]] = 1
                        bonds[rmap[b.atom2.index], rmap[b.atom1.index]] = 1
                    except KeyError:
                        # for bonds that cross residue
                        pass
            nonbonded_type = embedding_dicts['nlist']['nonbonded']
            bonded_type = embedding_dicts['nlist'][1]
            for rj in consider:
                for a in residues[rj].atoms():
                    # Make sure atom is in residue or neighbor of residue atom
                    if ri != rj and not marked[a.index]:
                        continue
                    index = rmap[a.index]
                    # convert to local indices and filter neighbors
                    n_index = 0
                    for ni in range(NN):
                        if frame_nlist[a.index, ni, 0] > 50.0:
                            # large distances are sentinels for things
                            # like self neighbors
                            continue
                        try:
                            j = rmap[int(frame_nlist[a.index, ni, 1])]
                        except KeyError:
                            # either we couldn't find a neighbor on the root
                            # residue (which is bad) or just one of the
                            # neighbors is not on a considered residue.
                            if rj == ri:
                                success = False
                                if debug:
                                    print('Could not find all neighbors',
                                          int(frame_nlist[a.index, ni, 1]),
                                          consider)
                                break
                            continue
                        # set distance
                        nlist[index, n_index, 0] = frame_nlist[a.index, ni, 0]
                        # set index
                        nlist[index, n_index, 1] = j
                        # set type: 0 in the bond matrix -> non-bonded,
                        # otherwise single bonded
                        nlist[index, n_index, 2] = (
                            nonbonded_type if bonds[index, j] == 0
                            else bonded_type)
                        n_index += 1
                        if n_index == NEIGHBOR_NUMBER:
                            break
            if not success:
                if debug:
                    raise RuntimeError()
                continue
            class_label = embedding_dicts['class'][residues[ri].name]
            if gsd_file is not None:
                snapshot = write_record_traj(positions, atoms, mask, nlist,
                                             peaks, class_label, names,
                                             embedding_dicts)
                snapshot.configuration.step = len(gsd_file)
                gsd_file.append(snapshot)
            result.append(
                make_tfrecord(atoms, mask, nlist, peaks, class_label, names,
                              indices=np.array(
                                  [model_index, fi, int(residues[ri].id)],
                                  dtype=np.int64)))
            if log_file is not None:
                # gsd_file is optional; report 0 snapshots when it is absent
                # instead of failing on len(None)
                gsd_count = len(gsd_file) if gsd_file is not None else 0
                log_file.write('{} {} {} {} {} {} {} {}\n'.format(
                    path.split('/')[-1],
                    corr_path.split('/')[-1], chain_id, len(peak_successes),
                    gsd_count, model_index, fi, residues[ri].id))
    return result, len(peak_successes) / len(peak_data), len(
        result), peak_count