def load_from_one_cath_pml_file(pml_file, scratch_path, superfamilies, dssp_path):
    '''Load data from a .pml file of superposed
    homologous superfamilies from the CATH database.

    A new, initially empty list is appended to ``superfamilies``. Every
    ``cmd.read_pdbstr`` record found in ``pml_file`` is turned into a
    structure; structures without chain breaks are written to
    ``scratch_path`` as ``<pdb_id>.pdb``. Each of those for which
    ``find_secondary_structures`` succeeds is appended to the new list as a
    dict with the keys ``'structure'`` and ``'path'``.

    Args:
        pml_file: Path of the .pml file to read.
        scratch_path: Directory that receives the per-structure PDB files.
        superfamilies: List of superfamilies; modified in place.
        dssp_path: Passed through to ``find_secondary_structures``.

    A record that is cut off by the end of the file (no closing ``"""``
    line) is discarded instead of being read forever.
    '''
    superfamilies.append([])
    candidate_proteins = []

    with open(pml_file, 'r') as f:
        lines = iter(f)

        for line in lines:
            # Read one structure
            if not line.strip().startswith('cmd.read_pdbstr'):
                continue

            # The first PDB line follows the 19 characters of
            # 'cmd.read_pdbstr("""' on the same line.
            pdb_lines = [line.strip()[19:].strip('\\')]
            pdb_id = None

            for line in lines:
                stripped = line.strip()

                if stripped.startswith('"""'):
                    pdb_id = stripped[5:12]
                    break

                pdb_line = stripped.strip('\\')
                if len(pdb_line) > 17:
                    # Remove all altLoc flags (character 17 of the record)
                    pdb_line = pdb_line[0:16] + ' ' + pdb_line[17:]
                pdb_lines.append(pdb_line)

            if pdb_id is None:
                # End of file reached inside a record: the record is
                # incomplete and has no identifier, so drop it.
                break

            # Make a pdb file of the structure for DSSP analysis
            structure = structure_from_pdb_string('\n'.join(pdb_lines), pdb_id)

            # Store structures without chain breaks
            if len(topology.find_structure_chain_breaks(structure)) == 0:
                structure_path = os.path.join(scratch_path, pdb_id + '.pdb')

                pdb_io = PDB.PDBIO()
                pdb_io.set_structure(structure)
                pdb_io.save(structure_path)

                candidate_proteins.append({'structure': structure,
                                           'path': structure_path})

    for p in candidate_proteins:
        try:
            find_secondary_structures(p, dssp_path)
        except Exception:
            # Best effort: a protein whose secondary structure cannot be
            # determined is left out. KeyboardInterrupt/SystemExit are not
            # swallowed any more.
            continue

        # Add a protein to a superfamily if there's no exception
        superfamilies[-1].append(p)
def preparePdb(pdb_fname, out_pdb_fname):
    ''' Prepare the PDB file with only first model and redundancies cut out

    The input (optionally gzip-compressed, detected by a ``.gz`` suffix) is
    copied to ``query.pdb`` inside the ``tempDir()`` context, parsed with
    ``Bio.PDB.PDBParser``, reduced to its first model, written back with
    ``Bio.PDB.PDBIO`` and moved to ``out_pdb_fname``.

    Args:
        pdb_fname: Path of the input PDB file.
        out_pdb_fname: Path of the file to create.

    Returns:
        The absolute path of the written file.

    Raises:
        IOError: If ``pdb_fname`` does not exist.
    '''
    # 'Absolutize' the path names - rest is done in the temporary dir
    pdb_fname = os.path.abspath(pdb_fname)
    if not os.path.exists(pdb_fname):
        raise IOError('%s does not exist' % pdb_fname)
    out_pdb_fname = os.path.abspath(out_pdb_fname)

    # Inside the temporary dir
    # NOTE(review): the relative file name below presumably relies on
    # tempDir() changing the working directory - confirm.
    with tempDir():
        # Temporary name for the curated copy of the input
        new_pdb_fname = 'query.pdb'

        # If the original PDB is packed with gzip - unpack it into a new file.
        # Both sides are opened in text mode; gzip.open(..., 'r') would yield
        # bytes that cannot be written to a text file.
        opener = gzip.open if pdb_fname.endswith('.gz') else open
        with opener(pdb_fname, 'rt') as rfh, open(new_pdb_fname, 'w') as wfh:
            wfh.write(rfh.read())

        # Parse structure
        # Redirect standard output/error to a StringIO,
        # so that PDBParser stops messing the output
        parser = Bio.PDB.PDBParser()
        err_fh = io.StringIO()
        saved_streams = sys.stdout, sys.stderr
        sys.stdout = sys.stderr = err_fh
        try:
            struct = parser.get_structure('query', new_pdb_fname)
        finally:
            # Always give back the streams that were active on entry, even
            # when parsing fails.
            sys.stdout, sys.stderr = saved_streams

        # Output formatted info about PDBParser's work to a log
        s = err_fh.getvalue()
        if s.strip():
            logging.info(
                "Structure parsing generated following error message(s): \n%s\n%s\n%s"
                % ('-' * 120, s, '-' * 120))

        # By default use only first model
        model = struct[0]
        del struct.child_list[1:]

        # Check for discontinuities greater than 5 residues - warn about this _specifically_
        for chain in model:
            chid = chain.id
            last_rid = None
            for residue in chain:
                # The residue number must be read before it is compared.
                rid = residue.id[1]
                if last_rid is not None and rid > last_rid + 5:
                    logging.warning(
                        "Residues %s:%s-%s:%s. Results might be inaccurate, as the break in a protein chain numbers more than 5 residues."
                        % (last_rid, chid, rid, chid))
                last_rid = rid

        # Save the reduced structure. The writer gets its own name so that it
        # does not shadow the `io` module used for StringIO above.
        pdb_io = Bio.PDB.PDBIO()
        pdb_io.set_structure(struct)
        pdb_io.save(new_pdb_fname)

        shutil.move(new_pdb_fname, out_pdb_fname)

    return out_pdb_fname
def save_structure(struct, name):
    """Write ``struct`` to ``<name>.pdb`` and put the ``header()`` text in front.

    The structure is first saved with ``PDBIO``; the file is then read back
    and rewritten as ``header()`` followed by the saved content.
    """
    pdb_path = '{}.pdb'.format(name)

    writer = PDBIO()
    writer.set_structure(struct)
    writer.save(pdb_path)
    del writer

    with open(pdb_path, 'r') as source:
        saved_text = source.read()

    # Build the full text before the file is reopened (and truncated).
    full_text = header() + saved_text

    with open(pdb_path, 'w') as target:
        target.write(full_text)
def _fixate(self, buf):
    """Save the first trace of ``buf`` to ``self._path`` and drop ``buf``.

    Nothing happens when ``self._path`` is not set. Otherwise the trace is
    saved with ``io.save`` in ``self._format``, ``buf`` is removed via
    ``self.remove_file`` and, unless ``self._forget_fixed`` is set, the
    saved files are loaded with ``self.load_files``.
    """
    if not self._path:
        return

    first_trace = buf.get_traces()[0]
    saved_paths = io.save([first_trace], self._path, format=self._format)

    self.remove_file(buf)

    if self._forget_fixed:
        return

    self.load_files(saved_paths,
                    show_progress=False,
                    fileformat=self._format)
def _insert(self, iblock, traces):
    """Add ``traces`` to this object, via disk or in memory.

    An empty ``traces`` is ignored. With ``self._storepath`` set, the traces
    are saved as mseed (``iblock`` is handed to ``io.save`` as additional
    information) and the saved files are loaded; otherwise they are wrapped
    in a ``pile.MemTracesFile`` and added directly.
    """
    if not traces:
        return

    if self._storepath is None:
        mem_file = pile.MemTracesFile(None, traces)
        self.add_file(mem_file)
        return

    saved_paths = io.save(
        traces,
        self._storepath,
        format='mseed',
        additional={'iblock': iblock})
    self.load_files(saved_paths, fileformat='mseed', show_progress=False)
def volume(v, name=_REMOTEVOLNAME):
    '''Plot a volume dataset in remote volume view

    The dataset is written to a temporary ``.dsr`` file in binary format,
    handed to ``_plot_volume`` and the file is removed afterwards - also
    when casting, saving or plotting fails.

    Arguments:
    v -- dataset to plot
    name -- name of the volume view passed to ``_plot_volume``
    '''
    import tempfile
    import os #@Reimport

    fd, vdatafile = tempfile.mkstemp('.dsr') # '/tmp/blah.dsr'
    os.close(fd)

    try:
        # convert to byte, int or float as volume viewer cannot cope with
        # boolean, long or double datasets
        if v.dtype == _core.bool:
            v = _core.cast(v, _core.int8)
        elif v.dtype == _core.int64:
            v = _core.cast(v, _core.int32)
        elif v.dtype == _core.float64 or v.dtype == _core.complex64 or v.dtype == _core.complex128:
            v = _core.cast(v, _core.float32)

        _io.save(vdatafile, v, format='binary')
        _plot_volume(name, vdatafile)
    finally:
        # Never leave the temporary file behind.
        os.remove(vdatafile)
def get_sequence(pdb, chain):
    """Cut a residue window out of one chain and save it as a PDB file.

    The structure is read from ``<pdb>.pdb``. Residues of ``chain`` in the
    first model are counted from 1 in iteration order; those whose position
    lies outside ``[sys.argv[3], sys.argv[4]]`` are detached. The remaining
    chain is written to ``<sys.argv[5]>_segment.pdb``.

    Args:
        pdb: Structure name; also the input file name without ``.pdb``.
        chain: Identifier of the chain to cut.
    """
    # PERMISSIVE=0 makes the parser strict: problems found while building
    # the structure raise instead of being tolerated.
    pdb_parser = PDBParser(PERMISSIVE=0)
    pdb_structure = pdb_parser.get_structure(pdb, pdb + ".pdb")
    pdb_chain = pdb_structure[0][chain]

    # Parse the window bounds once instead of once per residue.
    first = int(sys.argv[3])
    last = int(sys.argv[4])

    # Collect the ids first: detaching while iterating over the chain would
    # modify the container that is being walked.
    outside = [
        residue.get_id()
        for position, residue in enumerate(pdb_chain, start=1)
        if position < first or position > last
    ]
    for residue_id in outside:
        pdb_chain.detach_child(residue_id)

    writer = PDBIO()
    writer.set_structure(pdb_chain)
    output = sys.argv[5] + "_segment.pdb"
    writer.save(output)
def _fixate(self, buf):
    """Save the first trace of ``buf`` (if a path is set) and forget its state.

    With ``self._path`` set, the trace is saved with ``io.save`` in
    ``self._format``, ``buf`` is removed from ``self._pile`` and, unless
    ``self._forget_fixed`` is set, the saved files are loaded into the pile.
    In every case the entry for the trace's ``nslc_id`` is deleted from
    ``self._states``.
    """
    first_trace = buf.get_traces()[0]

    if self._path:
        written = io.save([first_trace], self._path, format=self._format)
        self._pile.remove_file(buf)

        reload_written = not self._forget_fixed
        if reload_written:
            self._pile.load_files(
                written,
                show_progress=False,
                fileformat=self._format)

    del self._states[first_trace.nslc_id]
def _insert(self, iblock, traces):
    """Store ``traces`` on disk when a store path is set, else keep them in memory.

    Does nothing for an empty ``traces``. On the disk route the traces are
    saved as mseed with ``iblock`` passed as additional information, and the
    resulting files are loaded. On the memory route a
    ``pile.MemTracesFile`` holding the traces is added.
    """
    if traces:
        store_on_disk = self._storepath is not None

        if store_on_disk:
            extra = {'iblock': iblock}
            written = io.save(traces, self._storepath,
                              format='mseed', additional=extra)
            self.load_files(written, fileformat='mseed', show_progress=False)
        else:
            self.add_file(pile.MemTracesFile(None, traces))
def extract_ligands(path):
    """ Extraction of the heteroatoms of .pdb files

    Every ``*.pdb`` file in ``<path>/pdbs`` whose name does not start with
    ``lig_`` is parsed. Each residue of the first model for which
    ``is_het`` is true is saved as ``lig_<pdb_code>_<i>.pdb``, with ``i``
    counting from 1 per input file. The output name is relative, so the
    ligand files land in the current working directory.

    Args:
        path: Directory that contains the ``pdbs`` sub-directory. A trailing
            path separator is accepted but no longer required.
    """
    pdb_dir = os.path.join(path, 'pdbs')

    for pfb_file in os.listdir(pdb_dir):
        if not pfb_file.endswith('.pdb') or pfb_file.startswith("lig_"):
            continue

        pdb_code = pfb_file[:-4]
        pdb = PDBParser().get_structure(pdb_code,
                                        os.path.join(pdb_dir, pfb_file))

        # Named `writer` so that it does not shadow the `io` module.
        writer = PDBIO()
        writer.set_structure(pdb)

        # Only the first model is scanned.
        model_selected = pdb[0]

        i = 1
        for chain in model_selected:
            for residue in chain:
                if not is_het(residue):
                    continue
                print(f"saving {chain} {residue}")
                writer.save(f"lig_{pdb_code}_{i}.pdb",
                            ResidueSelect(chain, residue))
                i += 1
def _fixate(self, buf, complete=True):
    """Write the first trace of ``buf`` to ``self._path``, optionally in windows.

    Without ``self._path`` only the state entry of the trace is deleted.
    With a path and no ``self._fixation_length`` the trace is saved whole.
    With a fixation length the trace is chopped into consecutive windows of
    that length, the first one aligned relative to ``util.year_start`` of
    the trace start. If the last window does not end where expected and
    ``complete`` is false, that last piece is put back into ``buf`` (which
    is re-added to the pile) and the state entry is kept.

    Saved files are loaded into ``self._pile`` unless ``self._forget_fixed``
    is set.
    """
    head = buf.get_traces()[0]
    drop_state = True

    if self._path:
        if self._fixation_length is None:
            pieces = [head]
            self._pile.remove_file(buf)
        else:
            start = head.tmin
            year_origin = util.year_start(start)
            nwindows = int(math.floor(
                (start - year_origin) / self._fixation_length))
            cursor = year_origin + nwindows * self._fixation_length

            pieces = []
            while cursor <= head.tmax:
                try:
                    piece = head.chop(
                        cursor, cursor + self._fixation_length,
                        inplace=False,
                        snap=(math.ceil, math.ceil))
                except trace.NoData:
                    # Window without samples: nothing to keep for it.
                    pass
                else:
                    pieces.append(piece)

                cursor += self._fixation_length

            last_window_full = (
                abs(pieces[-1].tmax - (cursor - head.deltat))
                < head.deltat / 100.)
            keep_tail = not (last_window_full or complete)

            self._pile.remove_file(buf)
            if keep_tail:
                # reinsert incomplete last part
                tail = pieces.pop()
                buf.remove(head)
                buf.add(tail)
                self._pile.add_file(buf)
                drop_state = False

        written = io.save(pieces, self._path, format=self._format)

        if not self._forget_fixed:
            self._pile.load_files(
                written, show_progress=False, fileformat=self._format)

    if drop_state:
        del self._states[head.nslc_id]
def _fixate(self, buf, complete=True):
    """Save the buffered trace of ``buf`` to disk and update pile and states.

    Steps, as far as ``self._path`` is set:

    * with ``self._fixation_length`` set, chop the first trace of ``buf``
      into windows of that length (start aligned relative to
      ``util.year_start`` of the trace start); otherwise use the trace as is,
    * remove ``buf`` from ``self._pile``; when the last window is judged
      incomplete and ``complete`` is false, put that last piece back into
      ``buf`` and re-add ``buf`` to the pile,
    * save the pieces with ``io.save`` and, unless ``self._forget_fixed`` is
      set, load the saved files into the pile.

    The state entry for the trace's ``nslc_id`` is deleted unless a piece
    was put back.
    """
    trbuf = buf.get_traces()[0]

    if not self._path:
        del self._states[trbuf.nslc_id]
        return

    forget_state = True

    if self._fixation_length is not None:
        first_sample_time = trbuf.tmin
        year_begin = util.year_start(first_sample_time)
        offset_windows = int(math.floor(
            (first_sample_time - year_begin) / self._fixation_length))
        window_start = year_begin + offset_windows * self._fixation_length

        chunks = []
        while window_start <= trbuf.tmax:
            window_end = window_start + self._fixation_length
            try:
                chunks.append(trbuf.chop(
                    window_start, window_end,
                    inplace=False, snap=(math.ceil, math.ceil)))
            except trace.NoData:
                pass
            window_start += self._fixation_length

        expected_end = window_start - trbuf.deltat
        tolerance = trbuf.deltat / 100.0
        if abs(chunks[-1].tmax - expected_end) < tolerance or complete:
            self._pile.remove_file(buf)
        else:
            # reinsert incomplete last part
            remainder = chunks.pop()
            self._pile.remove_file(buf)
            buf.remove(trbuf)
            buf.add(remainder)
            self._pile.add_file(buf)
            forget_state = False
    else:
        chunks = [trbuf]
        self._pile.remove_file(buf)

    saved_files = io.save(chunks, self._path, format=self._format)

    if not self._forget_fixed:
        self._pile.load_files(saved_files,
                              show_progress=False,
                              fileformat=self._format)

    if forget_state:
        del self._states[trbuf.nslc_id]
def download_pdb(self, info):
    """Download a PDB entry and write one of its chains to ``self.work_dir``.

    Args:
        info: Pair ``(pdb_id, chain_id)``.

    Returns:
        Path of the written chain file ``<work_dir>/<pdb_id>_<chain_id>.pdb``,
        or ``None`` when the entry cannot be downloaded or parsed (an error
        message is printed in that case).

    Side effects in ``self.work_dir``: the downloaded ``pdb<id>.ent``, the
    modified copy ``pdb<id>.mod.ent`` and ``pdb<id>.mod.ent.remark`` are
    left in place.
    """
    pdb_id, chain_id = info

    ## Check if atom has alternative position, if so, keep 'A' position and remove the flag
    ## but somehow this class doesn't seem to function well
    class NotDisordered(Select):
        def accept_atom(self, atom):
            if not atom.is_disordered() or atom.get_altloc() == 'A':
                atom.set_altloc(' ')
                return True
            else:
                return False

    ## BioPython downloads PDB but it gives a lowercase name in pdb{}.ent format
    biopdb_name = '{0}/pdb{1}.ent'.format(self.work_dir, pdb_id.lower())
    biopdb_modf = '{0}/pdb{1}.mod.ent'.format(self.work_dir, pdb_id.lower())
    chain_pdb = '{0}/{1}_{2}.pdb'.format(self.work_dir, pdb_id, chain_id)

    # NOTE(review): the flattened source does not show how far the original
    # guard reached. Download and modification are both done only while the
    # modified file is missing; the REMARK extraction below always runs.
    if not os.path.isfile(biopdb_modf):
        try:
            PDB.PDBList(verbose=False).retrieve_pdb_file(
                pdb_id, pdir=self.work_dir, obsolete=False, file_format='pdb')
        except FileNotFoundError:
            print(' \033[31m> ERROR: BioPython cannot download PDB: \033[0m' + pdb_id)
            return None

        ## Replace modified AA to avoid mis-recognition in biopython readin
        ## Replace disordered atoms and keep only the "A" variant
        ReplacePDBModifiedAA(biopdb_name, biopdb_modf)

    # Keep every line containing "REMARK " (what `grep "REMARK "` selected).
    # Done in Python so that ids and paths never pass through a shell.
    with open(biopdb_modf, 'r') as fi:
        remark_lines = [l for l in fi if 'REMARK ' in l]
    with open(biopdb_modf + '.remark', 'w') as fo:
        fo.writelines(remark_lines)

    ## Read the PDB file and extract the chain from structure[0]
    try:
        model = PDB.PDBParser(PERMISSIVE=1, QUIET=1).get_structure(pdb_id, biopdb_modf)[0]
    except KeyError:
        print(' \033[31m> ERROR: BioPython cannot read in PDB: \033[0m' + biopdb_modf)
        return None
    except ValueError:
        print(' \033[31m> ERROR: PDB file is empty: \033[0m' + biopdb_modf)
        return None

    ### Bug alert: as of 20.02.18, Biopython dev hasn't come up with good
    ### strategy to fix the 'atom.disordered_get_list()' issue with alternative
    ### position of residue side chains. To go around this, will physically
    ### remove "B" variant and keep only "A" variant in
    pdb_io = PDB.PDBIO()
    pdb_io.set_structure(model[chain_id])
    pdb_io.save(chain_pdb, select=NotDisordered())

    # Attach REMARK to end of PDB as safekeeping. Appending directly avoids
    # the temporary file the shell version created in the working directory.
    with open(chain_pdb, 'a') as fo:
        fo.writelines(remark_lines)

    return chain_pdb
def prepareWithHydrogens(pdb_fname, out_pdb_fname="wth_hydro.pdb"):
    ''' Prepare the PDB file with hydrogen data (clean up and create a new one).

    The input (optionally gzip-compressed, detected by a ``.gz`` suffix) is
    parsed with ``Bio.PDB.PDBParser`` and reduced to its first model.
    ``remakeHydrogens`` is applied and the result is written to
    ``out_pdb_fname`` (gzip-compressed when that name ends in ``.gz``).

    Args:
        pdb_fname: Path of the input PDB file.
        out_pdb_fname: Path of the file to create.

    Returns:
        The absolute path of the written file.

    Raises:
        IOError: If ``pdb_fname`` does not exist or is not a regular file.
    '''
    # 'Absolutize' the path names
    pdb_fname = os.path.abspath(pdb_fname)
    if not os.path.exists(pdb_fname) or not os.path.isfile(pdb_fname):
        raise IOError('%s does not exist or is not a file.' % pdb_fname)
    out_pdb_fname = os.path.abspath(out_pdb_fname)

    # Text mode in both cases: gzip.open(..., 'r') would hand bytes to the
    # parser.
    opener = gzip.open if pdb_fname.endswith('.gz') else open

    # Parse structure
    parser = Bio.PDB.PDBParser()

    # Redirect standard output/error to a StringIO,
    # so that PDBParser stops messing the output
    err_fh = io.StringIO()
    saved_streams = sys.stdout, sys.stderr
    with opener(pdb_fname, 'rt') as rfh:
        sys.stdout = sys.stderr = err_fh
        try:
            struct = parser.get_structure('query', rfh)
        finally:
            # Restore the streams that were active on entry
            sys.stdout, sys.stderr = saved_streams

    # Output formatted info about PDBParser's work to a logger
    s = err_fh.getvalue()
    if s.strip():
        logging.info(
            "Structure parsing generated following error message(s): \n%s\n%s\n%s"
            % ('-' * 120, s, '-' * 120))

    # By default use only first model
    # ... delete the rest
    model = struct[0]
    del struct.child_list[1:]

    # Check for discontinuities greater than 5 residues - warn about this _specifically_ (into the logger, again)
    for chain in model:
        chid = chain.id
        last_rid = None
        for residue in chain:
            # The residue number must be read before it is compared.
            rid = residue.id[1]
            if last_rid is not None and rid > last_rid + 5:
                logging.warning(
                    "Residues %s:%s-%s:%s. Results might be inaccurate, as the break in a protein chain numbers more than 5 residues."
                    % (last_rid, chid, rid, chid))
            last_rid = rid

    # Prepare the remade hydrogens
    remakeHydrogens(struct)

    # Save structure. The writer gets its own name so that it does not
    # shadow the `io` module used for StringIO above.
    pdb_io = Bio.PDB.PDBIO()
    pdb_io.set_structure(struct)
    if out_pdb_fname.endswith('.gz'):
        # PDBIO writes text, so the gzip stream is opened in text mode.
        with gzip.open(out_pdb_fname, 'wt') as wfh:
            pdb_io.save(wfh)
    else:
        pdb_io.save(out_pdb_fname)

    return out_pdb_fname
def prepareWithHydrogensPrep23(pdb_fname, out_pdb_fname="wth_hydro.pdb"):
    ''' Prepare the PDB file with hydrogen data (clean up and create a new one).

    Inside the ``tempDir()`` context the input is copied to ``query.pdb``,
    parsed, reduced to its first model and saved again through
    ``NoHydroSelect``. The executable returned by ``_preparePrepExec()`` is
    then run on that file, and its output is rewritten with a fixed suffix
    appended to every ``ATOM`` line before being moved to
    ``out_pdb_fname``.

    Args:
        pdb_fname: Path of the input PDB file (``.gz`` is unpacked).
        out_pdb_fname: Path of the file to create.

    Returns:
        The absolute path of the written file.

    Raises:
        IOError: If ``pdb_fname`` does not exist or is not a regular file.
        RuntimeError: If the preparation executable exits with a non-zero
            status.
    '''
    # 'Absolutize' the path names - rest is done in the temporary dir
    pdb_fname = os.path.abspath(pdb_fname)
    if not os.path.exists(pdb_fname) or not os.path.isfile(pdb_fname):
        raise IOError('%s does not exist or is not a file.' % pdb_fname)
    out_pdb_fname = os.path.abspath(out_pdb_fname)

    # Inside the temporary dir
    # NOTE(review): the relative file names below presumably rely on
    # tempDir() changing the working directory - confirm.
    with tempDir():
        # Temporary names for curated input and output files
        new_pdb_fname = 'query.pdb'
        out_tmp_fname = 'out.pdb'

        # Prepare the sources
        prep_exec = _preparePrepExec()

        # Copy the original file into our temporary directory
        # If the original PDB is packed with gzip - unpack it into a new file.
        # Text mode on both sides: gzip.open(..., 'r') would yield bytes.
        opener = gzip.open if pdb_fname.endswith('.gz') else open
        with opener(pdb_fname, 'rt') as rfh, open(new_pdb_fname, 'w') as wfh:
            wfh.write(rfh.read())

        # Parse structure
        # Redirect standard output/error to a StringIO,
        # so that PDBParser stops messing the output
        parser = Bio.PDB.PDBParser()
        err_fh = io.StringIO()
        saved_streams = sys.stdout, sys.stderr
        with open(new_pdb_fname, 'r') as rfh:
            sys.stdout = sys.stderr = err_fh
            try:
                struct = parser.get_structure('query', rfh)
            finally:
                # Restore the streams that were active on entry
                sys.stdout, sys.stderr = saved_streams

        # Output formatted info about PDBParser's work to a log
        s = err_fh.getvalue()
        if s.strip():
            logging.info(
                "Structure parsing generated following error message(s): \n%s\n%s\n%s"
                % ('-' * 120, s, '-' * 120))

        # By default use only first model
        model = struct[0]
        del struct.child_list[1:]

        # Check for discontinuities greater than 5 residues - warn about this _specifically_
        for chain in model:
            chid = chain.id
            last_rid = None

            # Curate disordered residues keeping only the last
            chain.child_list = [residue for residue in chain]
            chain.child_dict = dict((residue.id, residue) for residue in chain)

            for residue in chain:
                # Curate disordered atoms keeping only the last
                residue.child_list = [a for a in residue]
                residue.child_dict = dict((a.id, a) for a in residue)

                # The residue number must be read before it is compared.
                rid = residue.id[1]
                if last_rid is not None and rid > last_rid + 5:
                    logging.warning(
                        "Residues %s:%s-%s:%s. Results might be inaccurate, as the break in a protein chain numbers more than 5 residues."
                        % (last_rid, chid, rid, chid))
                last_rid = rid

        # Save structure without hydrogens. The writer gets its own name so
        # that it does not shadow the `io` module used for StringIO above.
        pdb_io = Bio.PDB.PDBIO()
        pdb_io.set_structure(struct)
        pdb_io.save(new_pdb_fname, NoHydroSelect())

        # Run the preparation executable on the newly created PDB file.
        # The file names are fixed literals; only prep_exec is interpolated.
        if (subprocess.call("%s %s %s 1>tmp.out 2>tmp.err" %
                            (prep_exec, new_pdb_fname, out_tmp_fname),
                            shell=True) != 0):
            raise RuntimeError(
                'Could not prepare corrected structure file for %s' % pdb_fname)

        # Fix the occupancies (creating the last and final temporary PDB file)
        final_fn = "final.pdb"
        with open(out_tmp_fname, 'r') as rfh, open(final_fn, 'w') as wfh:
            for line in rfh:
                if line.startswith('ATOM'):
                    # Replaces the Python 2 `print >> wfh, ...` statement.
                    wfh.write(line.rstrip('\n') + " 0.00 0.00 C" + "\n")
                else:
                    wfh.write(line)

        # Move the output file to the desired location
        shutil.move(final_fn, out_pdb_fname)

    return out_pdb_fname