def filter_molecules(molecules, filter_strings):
    """
    Apply PDB-field filters to molecules, keeping only the atoms that pass.

    Args:
        molecules (list of PDB molecules): molecular data to filter
        filter_strings (list of strings): filters to apply, using PDB
            ATOM-style fields

    Returns:
        list of molecules containing filtered data, one per input molecule
        and in the same order.

    Notes:
        - filter strings are in the form `key=value,key=value,...`, where
          numerical ranges are INCLUSIVE and indicated using a dash to
          separate the start and end indices, e.g. `name=CA,resSeq=2-14`
        - if no atoms in a molecule pass the filter, an empty molecule
          results
    """
    # All filter strings are merged into one shared filter table.
    combined = {}
    for spec in filter_strings:
        PDB.UpdateFilters(spec, combined, '=', ',', '-')

    return [PDB.FilterAtoms(molecule, combined) for molecule in molecules]
def add_title_section(self):
    """Append the title-section records: HEADER, TITLE, EXPDTA, AUTHOR.

    Every record is appended to ``self.pdb_file`` before its fields are
    filled in.  TITLE, EXPDTA and AUTHOR are only written when the matching
    ``self.struct.cifdb`` lookup succeeds; a ``KeyError`` skips that record
    type.
    """
    ## HEADER record
    header_rec = PDB.HEADER()
    self.pdb_file.append(header_rec)
    header_rec["idCode"] = self.struct.structure_id
    for field, cif_table, cif_column in (
            ("depDate", "database_pdb_rev", "date_original"),
            ("classification", "struct_keywords", "pdbx_keywords")):
        self.set_from_cifdb(header_rec, field, cif_table, cif_column)

    ## TITLE records: one record per slice of at most 60 characters
    try:
        full_title = self.struct.cifdb["struct"]["title"]
    except KeyError:
        pass
    else:
        for line_no, start in enumerate(range(0, len(full_title), 60), 1):
            title_rec = PDB.TITLE()
            self.pdb_file.append(title_rec)
            # the first TITLE line carries no continuation number
            if line_no > 1:
                title_rec["continuation"] = line_no
            title_rec["title"] = full_title[start:start + 60]

    ## EXPDTA record
    try:
        technique = self.struct.cifdb["exptl"]["method"]
    except KeyError:
        pass
    else:
        expdta_rec = PDB.EXPDTA()
        self.pdb_file.append(expdta_rec)
        expdta_rec["technique"] = technique

    ## AUTHOR record
    ## XXX: need to write a function to fix author names to PDB format
    try:
        author_rows = self.struct.cifdb["audit_author"]
    except KeyError:
        pass
    else:
        names = []
        for row in author_rows:
            # rows without a "name" entry are left out of the list
            try:
                names.append(row["name"])
            except KeyError:
                pass
        author_rec = PDB.AUTHOR()
        self.pdb_file.append(author_rec)
        author_rec["authorList"] = ",".join(names)
def sumAccesbilityBS(ppocketatom, proteinASA, proteinRSA):
    """Sum accessibility values over the atoms/residues of a binding site.

    Args:
        ppocketatom: path passed to ``PDB.PDB`` for the binding-site atoms.
        proteinASA: path passed to ``PDB.PDB``; the ``Bfact`` field of its
            atoms is what gets summed (presumably the per-atom ASA is stored
            in that column -- confirm against the file producer).
        proteinRSA: path passed to ``parseRSAfile.RSA``.

    Returns:
        dict with keys ``"sumASA"``, ``"sumRSAabs"`` and ``"sumRSArel"``.
    """
    cASA = PDB.PDB(proteinASA)
    cASA.get_lAtoms()
    cRSA = parseRSAfile.RSA(proteinRSA)
    pocket_atoms = PDB.PDB(ppocketatom).get_lAtoms()

    def same_atom(bs_atom, prot_atom):
        return (bs_atom.chainID == prot_atom.chainID
                and bs_atom.name == prot_atom.name
                and bs_atom.resName == prot_atom.resName
                and bs_atom.serial == prot_atom.serial)

    def in_residue(bs_atom, residue):
        return (bs_atom.chainID == residue.chainID
                and bs_atom.resName == residue.resName
                and bs_atom.resSeq == residue.resSeq)

    # Per pocket atom, only the first matching protein atom contributes.
    total_asa = 0.0
    for bs_atom in pocket_atoms:
        hit = next((p for p in cASA.latom if same_atom(bs_atom, p)), None)
        if hit is not None:
            total_asa += float(hit.Bfact)

    # A residue is counted once as soon as any pocket atom belongs to it;
    # "N/A" entries are left out of the sums.
    total_abs = 0.0
    total_rel = 0.0
    for residue in cRSA.lres:
        if not any(in_residue(bs_atom, residue) for bs_atom in pocket_atoms):
            continue
        if residue.ABSall != "N/A":
            total_abs += float(residue.ABSall)
        if residue.RELall != "N/A":
            total_rel += float(residue.RELall)

    return {"sumASA": total_asa, "sumRSAabs": total_abs, "sumRSArel": total_rel}
def reset_na2_pdb_resid(self, offset):
    """Shift every atom's resid in ``bdna2.1.pdb`` by ``offset``, in place.

    The file lives in ``self.mkcrd_folder``; it is first copied to
    ``bdna2.1.backup.pdb`` in the same folder and then overwritten with the
    shifted atom group.
    """
    folder = self.mkcrd_folder
    pdb_name = path.join(folder, 'bdna2.1.pdb')
    f_backup = path.join(folder, 'bdna2.1.backup.pdb')

    # keep the untouched original next to the rewritten file
    copyfile(pdb_name, f_backup)
    print(f'{pdb_name} {f_backup}')

    source = PDB.PDBReader(pdb_name, skip_header=2, skip_footer=1)
    atoms = source.atomgroup
    for atom in atoms:
        atom.set_resid(atom.resid + offset)

    PDB.PDBWriter(pdb_name, atoms).write_pdb()
    print(f'Reset {pdb_name} resid by offset {offset}!')
    print(f'Check by...\nvim {pdb_name}')
def prepare_log(model_list, out_file_name):
    """Create output log with models created.

    Writes ``<out_file_name>.log`` with one line per model and then hands
    the model file names to ``PDB.print_models`` so they are combined into
    ``<out_file_name>.pdb``.

    Parameters
    ----------
    model_list : List
        Output from set_dihedral_angles_update.  Each entry is read by
        index: ``[0]`` model file name, ``[1]`` distance, ``[2]`` aa_number,
        ``[3]`` angles.
    out_file_name : String
        file name (without extension) for output log and pdb states file.

    Returns
    -------
    None.
    """
    models_name = []
    # The context manager closes the log even when an entry is malformed
    # (the previous open()/close() pair leaked the handle in that case).
    with open(out_file_name + '.log', 'w') as log_file:
        log_file.write('{:<3} {:<20} Distance\n'.format('#', 'File name'))
        for number, entry in enumerate(model_list, 1):
            model_name = entry[0]
            distance = entry[1]
            aa_number = entry[2]
            angles = entry[3]
            log_file.write('{:<3} {:<20} {:<8.2f} {} {}\n'.format(
                number, model_name, distance, aa_number, angles))
            models_name.append(model_name)

    # prepare pdb file with each model as a state
    PDB.print_models(models_name, out_file_name + '.pdb')
def add_miscellaneous_fatures_section(self):
    """SITE

    Writes the residues of every site returned by
    ``self.struct.iter_sites()`` into SITE records, four residues per
    record, starting a new record (with the next serial number) whenever
    the current one is full.
    """
    serial_num = 0
    for site in self.struct.iter_sites():
        num_fragments = len(site.fragment_dict_list)

        site_pdb = None
        # number of residues already stored in the current SITE record
        key_index = 0
        for frag_dict in site.fragment_dict_list:
            if site_pdb is None or key_index == 4:
                serial_num += 1
                key_index = 0

                site_pdb = PDB.SITE()
                self.pdb_file.append(site_pdb)

                site_pdb["serNum"] = serial_num
                site_pdb["siteID"] = site.site_id
                site_pdb["numRes"] = num_fragments

            # The counter used to stay at 0 forever, so every residue was
            # written to the same slot and the rollover above never fired.
            # NOTE(review): slots are numbered 1..4 here, following the PDB
            # SITE record layout (resName1..resName4) and the 1-based
            # resNameN fields used for SEQRES; confirm against the field
            # list of PDB.SITE.
            key_index += 1

            chain_id = "chainID%d" % (key_index)
            res_name = "resName%d" % (key_index)
            res_seq = "seq%d" % (key_index)
            icode = "icode%d" % (key_index)

            site_pdb[chain_id] = frag_dict["chain_id"]
            site_pdb[res_name] = frag_dict["res_name"]
            # a missing "frag_id" simply leaves seq/icode unset
            try:
                site_pdb[res_seq], site_pdb[icode] = \
                    Structure.fragment_id_split(frag_dict["frag_id"])
            except KeyError:
                pass
def save_structures(fpath, structures, dat):
    """
    Slightly different to the save routines in PDBD module; each structure
    has the same "internal" chainID values, and structures are separated into
    PDB 'MODEL' sections. We assume structures are lists of monomer ids.

    Args:
        fpath (string): path to PDB file for output
        structures (list of integer lists): sublists are individual
            structures, sublist elements are molecule indices for structure
            members
        dat: object providing ``monomers`` (monomer index -> subunit
            indices) and ``subunits`` (subunit index -> atoms); each atom
            must be accepted by ``dict()``

    Returns:
        Nothing; the structures are written to ``fpath``.
    """
    chains = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz123456789'

    # print(..., file=f) matches print_PDB in this module; the former
    # `print >> f, ...` statements raise TypeError under Python 3.
    with open(fpath, 'w') as f:
        for structure in structures:
            chain_i = 0
            # serial numbers restart for every MODEL (e.g. lots of
            # structures as in the capsid!)
            serial = 1
            print('MODEL', file=f)
            for monomer_i in structure:
                for subunit_i in dat.monomers[monomer_i]:
                    for a in dat.subunits[subunit_i]:
                        a2 = dict(a)  # modify a copy of the data, not the original
                        # chain letters wrap around once the alphabet runs out
                        a2['chainID'] = chains[chain_i % len(chains)]
                        a2['serial'] = serial
                        serial += 1
                        print(PDB.MakePDBAtomLine(a2), file=f)
                    # each subunit is closed as its own chain
                    print('TER', file=f)
                    chain_i += 1
            print('ENDMDL', file=f)
def computeFPI(self, clean=0):
    """Compute the FPI table for the current ligand frame.

    The frame id is taken from the ligand file name (``self.plig``): the
    part between the first ``_`` and the following ``.``.  The result is
    written to ``self.prout + "FPI_<frameID>.csv"``.

    Args:
        clean: when 0 (default) an existing result file is reused as is;
            any other value forces a recomputation.

    Returns:
        Path of the FPI file.
    """
    frameID = self.plig.split("/")[-1].split("_")[1].split(".")[0]
    pfileFPI = self.prout + "FPI_" + frameID + ".csv"

    # reuse a previous result unless a clean run was requested
    if path.exists(pfileFPI) and clean == 0:
        return pfileFPI

    # `with` closes the output even when the FPI computation raises
    with open(pfileFPI, "w") as fileFPI:
        # header
        fileFPI.write("Ligand and pocket res\tList residues in pocket\tFPI\n")

        # define residue pocket
        cPocket = PDB.PDB(self.pBS, hydrogen=1)
        cPocket.get_byres(onlyres=1)
        lres = cPocket.getListResForFPI()

        pyplif.get_FPI(pligPDB=self.plig, ppocketPDB=self.pBS, lres=lres,
                       filout=fileFPI)

    return pfileFPI
def computeRMSFresBS(self, pr_MDout):
    """Flag, in the per-residue RMSD table, the residues seen in a binding site.

    Every file in ``pr_MDout + "BSs/"`` is loaded; the second ``_``-separated
    field of each residue key is collected.  The table read from
    ``RMSDs/residues/resRMSD`` is then rewritten to
    ``RMSDs/residues/resRMSD_BS`` with an extra ``BS`` column (1 when the
    row's ``NameRes`` was collected, else 0).

    Returns:
        Path of the rewritten table.
    """
    pr_BS = pr_MDout + "BSs/"

    # A set gives O(1) membership tests below; only membership is needed.
    bs_residues = set()
    for pBS in listdir(pr_BS):
        dres = PDB.PDB(pr_BS + pBS).get_byres()
        for res in dres.keys():
            bs_residues.add(res.split("_")[1])

    # rewrite RMSF with binding site
    ldresRMSF = toolbox.matrixToList(pr_MDout + "RMSDs/residues/resRMSD")

    pfilout = pr_MDout + "RMSDs/residues/resRMSD_BS"
    # `with` closes the file even if a row lacks one of the expected keys
    with open(pfilout, "w") as filout:
        filout.write("NameRes\tall\tCa\tDmax\tBS\n")
        for dresRMSF in ldresRMSF:
            BS = 1 if dresRMSF["NameRes"] in bs_residues else 0
            filout.write("%s\t%s\t%s\t%s\t%s\n" %
                         (dresRMSF["NameRes"], dresRMSF["all"],
                          dresRMSF["Ca"], dresRMSF["Dmax"], BS))
    return pfilout
def add_atom_records(self):
    """With a default model set, output all the ATOM and associated
    records for the model.

    Standard residues are written chain by chain as ATOM records, each
    chain closed by a TER record built from its last residue; afterwards
    the non-standard residues of all chains are written as HETATM records.
    """
    struct = self.struct

    ## pass 1: ATOM records for standard groups, plus chain termination
    for chain in struct.iter_chains():
        last_res = None
        for last_res in chain.iter_standard_residues():
            for atom in last_res.iter_all_atoms():
                self.add_ATOM("ATOM", atom)

        # no TER record for a chain without standard residues
        if not last_res:
            continue

        ter = PDB.TER()
        self.pdb_file.append(ter)
        res_seq, icode = Structure.fragment_id_split(last_res.fragment_id)
        ter["serial"] = self.next_serial_number()
        ter["resName"] = last_res.res_name
        ter["chainID"] = last_res.chain_id
        ter["resSeq"] = res_seq
        ter["iCode"] = icode

    ## pass 2: HETATM records for non-standard groups
    for chain in struct.iter_chains():
        for fragment in chain.iter_non_standard_residues():
            for atom in fragment.iter_all_atoms():
                self.add_ATOM("HETATM", atom)
def add_primary_structure_section(self):
    """DBREF,SEQADV,SEQRES,MODRES

    Only SEQRES records are produced here: for every chain with a non-empty
    sequence, the sequence is written 13 residues per record into the
    fields ``resName1`` .. ``resName13``.
    """
    per_record = 13
    field_names = ["resName%d" % slot for slot in range(1, per_record + 1)]

    for chain in self.struct.iter_chains():
        sequence = chain.sequence
        total = len(sequence)
        if total == 0:
            continue

        for ser_num, start in enumerate(range(0, total, per_record), 1):
            record = PDB.SEQRES()
            self.pdb_file.append(record)
            record["serNum"] = ser_num
            record["chainID"] = chain.chain_id
            record["numRes"] = total

            # the last record of a chain may be only partly filled
            for offset in range(min(per_record, total - start)):
                record[field_names[offset]] = sequence[start + offset]
def blast_pdb_local(fasta_string, num_hits=1000):
    """Run a local blastp search and build one MSMSeed per usable hit.

    ``blastp`` is run against the ``pdbaa`` database under ``$DATA_HOME``
    with ``fasta_string`` on stdin and the tabular output format
    ``7 qseqid sseqid evalue bitscore``.  For every hit line the template
    chain code is taken from the ``|``-separated subject id (fields from the
    fourth onwards, joined by ``_``), the structure is read from the local
    repository at ``$PDB_HOME`` and the aligned residues are extracted.

    Args:
        fasta_string: query passed to blastp on stdin.
        num_hits: value for ``-max_target_seqs``.

    Returns:
        list of MSMSeed objects; hits whose extracted template is empty are
        skipped.
    """
    import subprocess
    import os
    import shlex
    import StringIO
    import simtk.openmm.app as app

    blast_data = os.getenv("DATA_HOME")
    blast_query = 'blastp -db %s/pdbaa -max_target_seqs %d -outfmt' % (blast_data, num_hits)
    out_fmt = '7 qseqid sseqid evalue bitscore'
    # the format spec contains spaces, so it is appended as one argument
    # after the rest of the command line has been split
    blast_cmd = shlex.split(blast_query)
    blast_cmd.append(out_fmt)
    p = subprocess.Popen(blast_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    blast_aln, _ = p.communicate(input=fasta_string)

    msmseeds = []
    local_pdb_repo = os.getenv("PDB_HOME")
    for result in blast_aln.splitlines():
        # Skip comment lines; blank lines are skipped too, since indexing
        # result[0] on an empty line raised IndexError.
        if not result or result.startswith("#"):
            continue

        res_data = result.split("\t")
        e_value = float(res_data[2])
        template_chain_code = "_".join(res_data[1].split("|")[3:])
        code_parts = template_chain_code.split("_")

        raw_template_pdb = _read_local_repository(local_pdb_repo, code_parts[0])
        template_fasta, pdb_resnums = _retrieve_fasta(template_chain_code)

        template_pdb = StringIO.StringIO()
        raw_template_pdbio = StringIO.StringIO(raw_template_pdb)
        raw_template_pdbio.seek(0)
        PDB.extract_residues_by_resnum(template_pdb, raw_template_pdbio,
                                       pdb_resnums, code_parts[1])
        template_pdb.seek(0)
        # nothing extracted for this hit
        if template_pdb.len == 0:
            continue

        template_pdbfile = app.PDBFile(template_pdb)
        msmseeds.append(MSMSeed(fasta_string, template_fasta, template_pdbfile, e_value))
    return msmseeds
def query():
    """Run the submitted search against pubmed and PDB and render the results.

    The query string is read from the ``inputQuery`` form field.  Both
    result tables are converted to per-row dicts for the template, and a
    small summary (result counts, lowest-resolution-value structure, most
    frequent senior author) is passed alongside them to ``results.html``.
    """
    _query = request.form['inputQuery']

    print('pubmed query started')
    _result = pms.searchCoordinator(_query)
    _resultDict = _result.T.to_dict().values()
    print('pubmed query completed with ' + str(len(_resultDict)) + ' results.')

    print('PDB query started')
    _pdbResult = pdb.search(_query)
    _pdbDict = _pdbResult.T.to_dict().values()
    print('PDB query completed with ' + str(len(_pdbDict)) + ' results.')

    _summaryDict = {
        "numStructures": str(len(_pdbDict)),
        "numPapers": str(len(_resultDict)),
    }

    # NOTE(review): the frame returned by dropna() is discarded, so this
    # call does not change _pdbResult -- confirm whether an assignment was
    # intended.
    _pdbResult.dropna()

    # empty resolutions are replaced by "1000" so they never win the minimum
    _resolutions = _pdbResult['Resolution'].replace("", "1000").astype(float)
    _highResStructure = _pdbResult.loc[_resolutions.idxmin()]
    _summaryDict["highResID"] = _highResStructure["PDB ID"]
    _summaryDict["highResolution"] = _highResStructure["Resolution"]
    _summaryDict["bestAuthor"] = _result["Senior Author"].mode()[0]

    return render_template('results.html',
                           query=_query,
                           result=_resultDict,
                           pdb=_pdbDict,
                           summary=_summaryDict)
def print_PDB(f, atoms, bonds, unit_length=None):
    """
    Write geometry to ``f`` in PDB format for visualization.

    Args:
        f (file) : output destination
        atoms (list of PDB-style atom) : vertices
        bonds (list of integer pairs) : vertex connections
        unit_length (integer) : number of consecutive vertices in a notional
            geometrical unit; defaults to all of ``atoms`` (a single unit)

    Returns:
        Nothing
    """
    if unit_length is None:
        unit_length = len(atoms)

    for index, atom in enumerate(atoms):
        # a TER line separates consecutive units (never before the first atom)
        if index > 0 and index % unit_length == 0:
            print('TER ', file=f)
        print(PDB.MakePDBAtomLine(atom), file=f)
    print('TER ', file=f)

    for bond in bonds:
        print('CONECT%5d%5d' % (bond[0], bond[1]), file=f)
def __init__(self, ppocket, pPDB):
    """Load the pocket file and cache its atom list and per-residue view.

    Args:
        ppocket: path of the pocket PDB file, loaded with ``PDB.PDB``.
        pPDB: path of the protein PDB file; only stored here.
    """
    self.ppocket = ppocket
    self.pprotein = pPDB

    # PDB.PDB is called without its hydrogen flag (hydrogens not included)
    pocket = PDB.PDB(ppocket)
    self.latoms = pocket.get_lAtoms()
    self.byresall = pocket.get_byres(onlyres=1)
def computeFPIres(self):
    """Compute an FPI for every residue, treating each residue as a ligand.

    For each residue returned by ``self.CPDB.get_byres()`` a copy of its
    atoms is re-labelled as HETATM and written to
    ``<prFPI>Pockets/<residue>.pdb``; the binding site around it is written
    to ``<prFPI>Pockets/pocket_<residue>.pdb``; ``pyplif.get_FPI`` is then
    run on the pair, writing into ``<prFPI>FPIres.csv``.

    Side effects:
        sets ``self.pfileFPI`` (path of the csv) and ``self.FPI`` (dict
        mapping residue key to the value returned by ``pyplif.get_FPI``).
    """
    pfileFPI = self.prFPI + "FPIres.csv"
    prpocket = self.prFPI + "Pockets/"
    dout = {}

    # `with` closes the csv even if one residue fails part-way through
    with open(pfileFPI, "w") as fileFPI:
        # header
        fileFPI.write("Residue Name\tList residues in pocket\tFPI\n")

        # need to define one pocket by residues -> folder of pockets
        try:
            makedirs(prpocket)
        except OSError:
            # best effort, as before (typically the folder already exists);
            # narrowed from a bare except so unrelated errors are not hidden
            pass

        dres = self.CPDB.get_byres()
        for resatoms in dres.keys():
            # residue as a ligand: work on a copy so the structure itself
            # keeps its original record type
            latomres = deepcopy(dres[resatoms])
            PDB.changeRecoder(latomres, "HETATM")
            pligand = prpocket + resatoms + ".pdb"
            self.CPDB.writePDB(latoms=latomres, pfilout=pligand)

            # define pocket
            latomspocket = self.CPDB.get_BSfromLatom(latomin=latomres)
            ppocket = prpocket + "pocket_" + resatoms + ".pdb"
            self.CPDB.writePDB(latoms=latomspocket, pfilout=ppocket)

            # format list of residues considered
            lresformated = PDB.convert_ListAtomtoList(latomspocket)

            # run FPI
            dout[resatoms] = pyplif.get_FPI(pligPDB=pligand,
                                            ppocketPDB=ppocket,
                                            lres=lresformated,
                                            filout=fileFPI)

    self.pfileFPI = pfileFPI
    self.FPI = dout
def getProtein(self, pdbid, filename=None):
    """Read the protein for ``pdbid`` with ``PDB.readFile``.

    Args:
        pdbid: identifier used to locate (or fetch) the file.
        filename: explicit file to read; when omitted it is looked up with
            ``findpdbfile`` and, if that finds nothing and ``DO_REMOTE`` is
            set, fetched into a temporary ``.pdb`` file.

    Returns:
        whatever ``PDB.readFile`` returns for the resolved file name (which
        is still ``None`` when nothing was found and remote fetching is off).
    """
    if filename is None:
        filename = findpdbfile(pdbid)

    if DO_REMOTE and filename is None:
        import fetchproteinfile
        import tempfile

        # NOTE(review): tempfile.mktemp only reserves a name and is
        # documented as race-prone; kept because fetchPDBFile's handling of
        # an already existing file is not visible here.
        filename = tempfile.mktemp(".pdb")
        fetchproteinfile.fetchPDBFile(pdbid, filename)

    return PDB.readFile(filename)
def writePDB(universe, configuration, pdb_file_name):
    """Write a configuration to a PDB file and return the atom sequence.

    Args:
        universe: when not None, the configuration is first replaced by
            ``universe.contiguousObjectConfiguration(None, configuration)``.
        configuration: configuration to write.
        pdb_file_name: output file name (opened in 'xplor' mode).

    Returns:
        the ``atom_sequence`` of the output file object, read before the
        file is closed.
    """
    if universe is not None:
        configuration = universe.contiguousObjectConfiguration(
            None, configuration)

    out_file = PDB.PDBOutputFile(pdb_file_name, 'xplor')
    out_file.write(universe, configuration)
    atom_order = out_file.atom_sequence
    out_file.close()
    return atom_order
def RMSDProt(self):
    """Write the protein RMSD of every sampled frame against frame 0.

    The table goes to ``<prRMSD>protein/protRMSD`` and is then passed to
    ``runExternalSoft.runscatterplotRMSD``.  If the table already exists
    nothing is done (not even the plot) and ``None`` is returned.  Frames
    are visited from ``self.stepFrame`` in steps of ``self.stepFrame`` while
    below ``self.MDtime / self.timeframe``; each one is transformed with its
    stored superimposition matrix before the RMSD is computed.
    """
    prRMSDprot = self.dMD["prRMSD"] + "protein/"
    pathFolder.createFolder(prRMSDprot)

    # the superimposition matrices must exist before they can be applied
    if "prSuperMatrix" not in dir(self):
        self.Superimpose(0)

    pfilout = prRMSDprot + "protRMSD"
    if path.exists(pfilout):
        return

    # `with` closes the table even when a frame or matrix cannot be loaded
    with open(pfilout, "w") as filout:
        filout.write("Time\tRMSDall\tRMSDC\tDmax\n0\t0\t0\t0\n")

        # open reference frame
        nframeref = "%05d" % (0)
        pframeref = self.dMD["prframe"] + "frame_" + nframeref + ".pdb"
        cprotref = PDB.PDB(PDB_input=pframeref)
        cprotref.get_atomProt()

        i = self.stepFrame
        imax = float(self.MDtime) / float(self.timeframe)
        while i < imax:
            nframe2 = "%05d" % (i)
            pframe2 = self.dMD["prframe"] + "frame_" + nframe2 + ".pdb"
            cprot2 = PDB.PDB(PDB_input=pframe2)
            cprot2.get_atomProt()

            # apply matrix on frame 2
            pmatrix = self.prSuperMatrix + nframeref + "_" + nframe2
            matrixload = toolbox.loadMatrixTMalign(pmatrix)
            for atomprot2 in cprot2.latomProt:
                atomprot2.applyMatrixRotTransloc(matrixload)

            lRMSD = calculate.RMSDTwoList(cprotref.latomProt, cprot2.latomProt)
            filout.write("%s\t%s\t%s\t%s\n" %
                         (i / 100.0, lRMSD[0], lRMSD[1], lRMSD[2]))
            i += self.stepFrame

    runExternalSoft.runscatterplotRMSD(pfilout)
def computeRMSDProt(self, pr_MDout):
    """Write the RMSD of every superimposed frame against frame 0.

    One matrix file per frame is expected in
    ``pr_MDout + "RMSDs/superimpose/"``; the frame id is the part of the
    matrix file name after its last ``_``.  Each frame is transformed with
    its matrix and compared with ``framesMD/frame_00000.pdb``.

    Returns:
        Path of the written table (``RMSDs/protein/protRMSD_all``).

    Raises:
        KeyError: if the frame ids are not the consecutive, zero-padded
            5-digit numbers ``00001`` .. ``N`` (N = number of matrices).
    """
    # load frame 0 as the reference
    cfram0 = PDB.PDB(pr_MDout + "framesMD/frame_00000.pdb")
    cfram0.get_lAtoms()

    pr_TMalign = pr_MDout + "RMSDs/superimpose/"
    dRMSD = {}
    for pTMalign in listdir(pr_TMalign):
        frame = pTMalign.split("_")[-1]
        dmatrixTMalign = toolbox.loadMatrixTMalign(pr_TMalign + pTMalign)

        cFrame = PDB.PDB("%sframesMD/frame_%s.pdb" % (pr_MDout, frame))
        cFrame.get_lAtoms()
        for atomFrame in cFrame.latom:
            atomFrame.applyMatrixRotTransloc(dmatrixTMalign)

        dRMSD[frame] = calculate.RMSDTwoList(cfram0.latom, cFrame.latom)

    # write the protein RMSD file, rows ordered by frame number
    pfilout = pr_MDout + "RMSDs/protein/protRMSD_all"
    # `with` closes the table even when a frame id is missing from dRMSD
    with open(pfilout, "w") as filout:
        filout.write("Time\tRMSDall\tRMSDC\tDmax\n")
        filout.write("0.0\t0.0\t0.0\t0.0\n")
        for i in range(1, len(dRMSD) + 1):
            frame = "%05d" % (i)
            filout.write("%.2f\t%s\t%s\t%s\n" %
                         (i / 100.0, dRMSD[frame][0], dRMSD[frame][1],
                          dRMSD[frame][2]))
    return pfilout
def ASAHydrophobicityPolarity(ppdbasa, pBS):
    """Return the polar and hydrophobic fractions of a binding site's ASA.

    Each binding-site atom is matched (chain, atom name, residue name and
    serial) to the first identical atom of the ASA file, whose ``Bfact``
    value is collected per element class: C, O, N, sulfur of CYS, sulfur of
    MET.  (Presumably the ASA file stores accessibility in the B-factor
    column -- confirm against its producer.)

    Returns:
        ``[polarityASA, hydrophobicityASA]`` where polarity is
        (O + N + Scys) / total and hydrophobicity is (C + Smet) / total.

    Raises:
        ValueError: a matched sulfur atom belongs to a residue other than
            CYS or MET (this used to crash through an undefined name).
        KeyError: a matched non-sulfur atom has an element other than C, O
            or N.
        ZeroDivisionError: no accessibility was collected at all.
    """
    latomASA = PDB.PDB(ppdbasa).get_lAtoms()
    latomBS = PDB.PDB(pBS).get_lAtoms()

    dcompute = {"C": [], "O": [], "N": [], "Scys": [], "Smet": []}

    for atomBS in latomBS:
        for atomProt in latomASA:
            if not (atomBS.chainID == atomProt.chainID
                    and atomBS.name == atomProt.name
                    and atomBS.resName == atomProt.resName
                    and atomBS.serial == atomProt.serial):
                continue

            if atomBS.element != "S":
                dcompute[atomBS.element].append(float(atomProt.Bfact))
            elif atomBS.resName == "CYS":
                dcompute["Scys"].append(float(atomProt.Bfact))
            elif atomBS.resName == "MET":
                dcompute["Smet"].append(float(atomProt.Bfact))
            else:
                raise ValueError(
                    "sulfur atom in unsupported residue %s: %s"
                    % (atomBS.resName, atomBS))
            # only the first matching protein atom is used
            break

    # each class is summed once; the addition order matches the original
    # expressions so the floating-point results are unchanged
    polar = sum(dcompute["O"]) + sum(dcompute["N"]) + sum(dcompute["Scys"])
    sum_smet = sum(dcompute["Smet"])
    sum_c = sum(dcompute["C"])
    total = polar + sum_smet + sum_c

    polarityASA = polar / total
    hydrophobicityASA = (sum_c + sum_smet) / total

    return [polarityASA, hydrophobicityASA]
def computeRMSDLig(self, pr_MDout):
    """Write the ligand RMSD of every superimposed frame against frame 0.

    One matrix file per frame is expected in
    ``pr_MDout + "RMSDs/superimpose/"``; the frame id is the part of the
    matrix file name after its last ``_``.  The ligand of each frame
    (``lig/LGD_<frame>.pdb``) is transformed with that matrix and compared
    with ``lig/LGD_00000.pdb``; the first value returned by
    ``calculate.RMSDTwoList`` is kept.

    Returns:
        Path of the written table (``RMSDs/ligand/ligRMSD``).

    Raises:
        KeyError: if the frame ids are not the consecutive, zero-padded
            5-digit numbers ``00001`` .. ``N`` (N = number of matrices).
    """
    # load ligand in frame 0
    clig0 = PDB.PDB(pr_MDout + "lig/LGD_00000.pdb")
    clig0.get_lAtoms()

    pr_TMalign = pr_MDout + "RMSDs/superimpose/"
    dRMSD = {}
    for pTMalign in listdir(pr_TMalign):
        frame = pTMalign.split("_")[-1]
        dmatrixTMalign = toolbox.loadMatrixTMalign(pr_TMalign + pTMalign)

        cligFrame = PDB.PDB("%slig/LGD_%s.pdb" % (pr_MDout, frame))
        cligFrame.get_lAtoms()
        for atomLig in cligFrame.latom:
            atomLig.applyMatrixRotTransloc(dmatrixTMalign)

        dRMSD[frame] = calculate.RMSDTwoList(clig0.latom, cligFrame.latom)[0]

    # write the RMSD lig file, rows ordered by frame number
    pfilout = pr_MDout + "RMSDs/ligand/ligRMSD"
    # `with` closes the table even when a frame id is missing from dRMSD
    with open(pfilout, "w") as filout:
        filout.write("Time\tRMSD\n")
        filout.write("0.0\t0.0\n")
        for i in range(1, len(dRMSD) + 1):
            frame = "%05d" % (i)
            filout.write("%.2f\t%s\n" % (i / 100.0, dRMSD[frame]))
    return pfilout
def add_crystallographic_coordinate_transformation_section(self):
    """CRYST1,ORIGXn,SCALEn,MTRIXn,TVECT

    Only the CRYST1 record is written here; it is filled from the
    structure's unit cell (edge lengths, angles in degrees, space group).
    """
    record = PDB.CRYST1()
    self.pdb_file.append(record)

    cell = self.struct.unit_cell

    # cell edges
    record["a"] = cell.a
    record["b"] = cell.b
    record["c"] = cell.c

    # cell angles, as computed by the unit cell's *_deg helpers
    record["alpha"] = cell.calc_alpha_deg()
    record["beta"] = cell.calc_beta_deg()
    record["gamma"] = cell.calc_gamma_deg()

    record["sgroup"] = cell.space_group.pdb_name
def add_coordinate_section(self):
    """ MODEL,ATOM,SIGATM,ANISOU,SIGUIJ,TER,HETATM,ENDMDL

    With more than one model, each model is made the structure's default
    model in turn and written between MODEL/ENDMDL records; the original
    default model is restored afterwards, even if writing fails.  With a
    single model the atom records are written directly.
    """
    if len(self.struct.model_list) <= 1:
        ## case 2: single model
        self.add_atom_records()
        return

    ## case 1: multiple models
    orig_model = self.struct.default_model
    try:
        for model in self.struct.iter_models():
            # add_atom_records() writes whatever the default model is
            self.struct.default_model = model

            model_rec = PDB.MODEL()
            self.pdb_file.append(model_rec)
            model_rec["serial"] = model.model_id

            self.add_atom_records()

            endmdl = PDB.ENDMDL()
            self.pdb_file.append(endmdl)
    finally:
        # never leave the structure pointing at a temporary default model
        self.struct.default_model = orig_model
def extractLigBSbyFrame(self, BSCutoff, namelig, clean=0):
    """Write, for every frame of every MD job, the ligand and its binding site.

    Jobs are the keys of ``self.lMD``; only jobs that have a ``"prframe"``
    entry are processed.  For those, the output folders
    ``<pranalysis><jobname>/BSs/`` and ``<pranalysis><jobname>/lig/`` are
    created and recorded in the job's dict under ``"prBSs"`` / ``"prLig"``.
    A job is skipped when both folders already hold at least as many files
    as there are frames.

    Args:
        BSCutoff: passed to ``get_BSfromlig`` as ``dpocket``.
        namelig: ligand name passed to ``get_lig``.
        clean: forwarded to ``pathFolder.createFolder``.
    """
    # progress counter, only used in the print below
    c = 1
    for jobname in self.lMD.keys():
        print c, jobname
        if "prframe" in self.lMD[jobname].keys():
            self.lMD[jobname]["prBSs"] = self.pranalysis + str(
                jobname) + "/BSs/"
            pathFolder.createFolder(self.lMD[jobname]["prBSs"], clean=clean)
            self.lMD[jobname]["prLig"] = self.pranalysis + str(
                jobname) + "/lig/"
            pathFolder.createFolder(self.lMD[jobname]["prLig"], clean=clean)
            # full paths of all frame files of this job
            lpframe = [
                self.lMD[jobname]["prframe"] + i
                for i in listdir(self.lMD[jobname]["prframe"])
            ]
            nb_frame = len(listdir(self.lMD[jobname]["prframe"]))
            # both output folders already complete -> nothing to do
            if len(listdir(
                    self.lMD[jobname]["prLig"])) >= nb_frame and len(
                        listdir(self.lMD[jobname]["prBSs"])) >= nb_frame:
                c += 1
                print "=> pass"
                continue
            else:
                for pframe in lpframe:
                    cPDB = PDB.PDB(pframe, hydrogen=1)
                    latomlig = cPDB.get_lig(namelig)
                    cPDB.get_BSfromlig(dpocket=BSCutoff)
                    # add step of rename atom
                    # output names reuse the frame file's suffix after its
                    # last "_"
                    pLGD = self.lMD[jobname][
                        "prLig"] + "LGD_" + pframe.split("_")[-1]
                    pBS = self.lMD[jobname][
                        "prBSs"] + "BS_" + pframe.split("_")[-1]
                    cPDB.writePDB(pLGD, latomlig, conect=1)
                    # NOTE(review): the pocket key is hard-coded rather than
                    # derived from namelig
                    cPDB.writePDB(pBS, cPDB.pocketsRES["UNK_900_A"]
                                  )  # default in schrodinger
        # NOTE(review): nesting of this increment was reconstructed from a
        # collapsed source line; it is assumed to run once per job -- confirm
        c += 1
def addResidueToResidueList(curr_residue_list, curr_res_num, curr_res_name,
                            curr_atom_list):
    """Build a residue from the accumulated atoms and append it to the list.

    A ``PDB.Residue`` is created (with ``chain=None``) from the given
    number, name and atoms, appended to ``curr_residue_list`` in place, and
    every atom in ``curr_atom_list`` gets its residue back-pointer set to
    the new residue.  Nothing is returned.
    """
    # debug trace of the atom names going into the residue
    if Common.debug: print "Adding atoms to residue '%s' %s: %s" % ( \
        curr_res_name, curr_res_num, map(lambda x: x.getName(), curr_atom_list))
    r = PDB.Residue(number=curr_res_num,
                    name=curr_res_name,
                    atoms=curr_atom_list,
                    chain=None)
    ## store the residue to the residue accumulation list
    curr_residue_list.append(r)
    # NOTE(review): the debug print below is treated as commented out; the
    # collapsed source does not show whether it was live -- confirm.
    ## if Common.debug: print "Current residue list is", map(lambda x: (x.getName(), x.getNumber()), curr_residue_list)
    ## store the back-pointer for each atom to its residue
    for atom in curr_atom_list:
        atom.setResidue(r)
def blast_pdb_local(fasta_string, num_hits=1000):
    """Run a local blastp search and build one MSMSeed per usable hit.

    ``blastp`` is run against the ``pdbaa`` database under ``$DATA_HOME``
    with ``fasta_string`` on stdin and the tabular output format
    ``7 qseqid sseqid evalue bitscore``.  For every hit line the template
    chain code is taken from the ``|``-separated subject id (fields from the
    fourth onwards, joined by ``_``), the structure is read from the local
    repository at ``$PDB_HOME`` and the aligned residues are extracted.

    Args:
        fasta_string: query passed to blastp on stdin.
        num_hits: value for ``-max_target_seqs``.

    Returns:
        list of MSMSeed objects; hits whose extracted template is empty are
        skipped.
    """
    import subprocess
    import os
    import shlex
    import StringIO
    import simtk.openmm.app as app

    blast_data = os.getenv("DATA_HOME")
    blast_query = 'blastp -db %s/pdbaa -max_target_seqs %d -outfmt' % (
        blast_data, num_hits)
    out_fmt = '7 qseqid sseqid evalue bitscore'
    # the format spec contains spaces, so it is appended as one argument
    # after the rest of the command line has been split
    blast_cmd = shlex.split(blast_query)
    blast_cmd.append(out_fmt)
    p = subprocess.Popen(blast_cmd,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE)
    blast_aln, _ = p.communicate(input=fasta_string)

    msmseeds = []
    local_pdb_repo = os.getenv("PDB_HOME")
    for result in blast_aln.splitlines():
        # Skip comment lines; blank lines are skipped too, since indexing
        # result[0] on an empty line raised IndexError.
        if not result or result.startswith("#"):
            continue

        res_data = result.split("\t")
        e_value = float(res_data[2])
        template_chain_code = "_".join(res_data[1].split("|")[3:])
        code_parts = template_chain_code.split("_")

        raw_template_pdb = _read_local_repository(local_pdb_repo,
                                                  code_parts[0])
        template_fasta, pdb_resnums = _retrieve_fasta(template_chain_code)

        template_pdb = StringIO.StringIO()
        raw_template_pdbio = StringIO.StringIO(raw_template_pdb)
        raw_template_pdbio.seek(0)
        PDB.extract_residues_by_resnum(template_pdb, raw_template_pdbio,
                                       pdb_resnums, code_parts[1])
        template_pdb.seek(0)
        # nothing extracted for this hit
        if template_pdb.len == 0:
            continue

        template_pdbfile = app.PDBFile(template_pdb)
        msmseeds.append(
            MSMSeed(fasta_string, template_fasta, template_pdbfile, e_value))
    return msmseeds
def superimposedPoseCluster(self):
    """Collect, per cluster, the docking poses of its compounds in one PDB.

    For every docking folder in ``self.lprdockingpose`` a
    ``Superimposed/`` folder is created under ``self.prout`` (in a
    sub-folder named after the docking folder).  Each ``.sdf`` pose whose
    file name matches a compound of a cluster is converted to PDB, its
    atoms renamed, and written to ``<cluster>.pdb`` with ``model=1``.
    """
    for prdocking in self.lprdockingpose:
        docking_name = prdocking.split("/")[-2]
        # only sdf files are poses; filter them once per docking folder
        sdf_poses = [pose for pose in listdir(prdocking)
                     if pose[-3:] == "sdf"]

        pr_superimposed = self.prout + docking_name + "/" + "Superimposed/"
        pathFolder.createFolder(pr_superimposed)

        for cluster in self.clusters.keys():
            pclusterpose = pr_superimposed + str(cluster) + ".pdb"
            for compound in self.clusters[cluster]:
                for pose in sdf_poses:
                    # the compound name is used as a search pattern on the
                    # pose file name
                    if not search(compound, pose):
                        continue
                    pposePDB = runExternalSoft.babelConvertSDFtoPDB(
                        prdocking + pose)
                    cpose = PDB.PDB(PDB_input=pposePDB)
                    cpose.renameAtom()
                    cpose.writePDB(pfilout=pclusterpose, conect=1, model=1)
return mean(v) # end def def find_quartiles(S): mid = median(S) lower = [t for t in S if t < mid] upper = [t for t in S if t > mid] return median(lower), mid, median(upper) # end def root = pred_fname.split('/')[-1] # 1. check if length of pdb file and entropy file match entropy_data = open(entropy_fname).read().strip().split('\n')[1:] entropy_data = [line.split() for line in entropy_data] mdl = PDB(pred_fname) # 1.1 read depth prediction prediction = [mdl.T(i) for i in range(len(mdl)) if mdl.name(i) == 'CA'] # 1.2 read entropies <- # entropies = [float(t[1]) for t in entropy_data] # following 10 lines make up for NA value in entropies (for benchmarking set only <- with unannotated residues) buffer_entropy = [t[1] for t in entropy_data] u = mean_float(buffer_entropy) entropies = [] for t in buffer_entropy: try: entropies.append(float(t)) except ValueError: entropies.append(u) # end try
base = General.getBase(pdb) matchf = args.head + '_' + base + '.match' if not os.path.isfile(matchf): continue outname = General.getBase(matchf) + '.' + args.o if os.path.isfile(outname): continue if outname in seen: continue seen[outname] = 1 # for the gap test only if args.wgap != None: pdb = args.wgap + '/' + mut.dir + '/' + pdb ## pos = PDB.findPositionInPDB(pdb, str(mut.n), mut.c) if pos == -1: print('cannot found the residue in fragment pdb: '+ pdb) continue pdb = General.removePath(pdb) cmd = ['python', selfbin +'/envForMatches.py','--m', matchf, '--n', str(pos-1), '--o', outname] if args.uplimit != None: cmd.extend(['--uplimit', args.uplimit]) if args.wgap != None: cmd.append('--wgap') cmd = ' '.join(cmd) job = General.jobOnCluster([cmd], mut.dir, os.path.realpath(outname)) job.submit(3) jobs.append(job) sleep(0.5)
args = par.parse_args()

# For every entry of the list file, dump one tab-separated line per residue
# of its shelve database: amino acid (converted with PDB.t2s), sumcond,
# crwdnes, freedom, phi, psi.
with open(args.o, 'w') as out, open(args.l) as listing:
    for line in listing:
        line = line.strip()
        path = os.path.basename(os.path.dirname(line))
        file = os.path.basename(line)
        name = General.getBase(file)
        dbf = path + '/' + name + '.' + args.ext + '.db'
        # The existence test must use the path `dbf`; it used `db`, which is
        # unbound on the first entry and a shelf object afterwards.
        if not os.path.isfile(dbf):
            continue

        db = shelve.open(dbf)
        try:
            # keys are ordered by the integer after the first comma
            for k in sorted(db.keys(), key=lambda x: int(x.split(',')[1])):
                dbr = db[k]
                # Skip residues whose phi/psi lie outside [-180, 180]; the
                # last test used the undefined name `psi` as the key.
                if (dbr['phi'] > 180.000) or (dbr['phi'] < -180.000) or (dbr['psi'] > 180.000) or (dbr['psi'] < -180.000):
                    continue
                outstr = [PDB.t2s(dbr['aa']), dbr['sumcond'], dbr['crwdnes'],
                          dbr['freedom'], dbr['phi'], dbr['psi']]
                out.write('\t'.join(map(str, outstr)) + '\n')
        finally:
            # close the shelf even if a record lacks an expected key
            db.close()
import os, sys, argparse sys.path.insert(0, "/home/grigoryanlab/home/fzheng/modules_py") import General, PDB par = argparse.ArgumentParser() par.add_argument("--l", required=True, help="list file") par.add_argument("--o", required=True, help="output file") par.add_argument("--std", required=True, help="reference values for AAs") args = par.parse_args() std = {} for a in open(args.std): aa, stdv = a.strip().split() std[PDB.t2s(aa)] = float(stdv) out = open(args.o, "w") for line in open(args.l): line = line.strip() path = os.path.basename(os.path.dirname(line)) file = os.path.basename(line) # should look at the env file of the full entry envf = path + "/" + General.getBase(file) + ".stride" if not os.path.isfile(envf): continue env = open(envf) for el in env.readlines(): if not el.startswith("ASG"): continue
for i in range(len(cuts)-1): content.append(seq[cuts[i]:cuts[i+1]]) # end for content.append(seq[cuts[-1]:]) content = '\n'.join(content).strip()+'\n' out = out + content return out # end def # read input fname = sys.argv[1] out_root = sys.argv[2] # get sequence mdl = PDB(fname) try: mdl.write("tmp.pdb") except: pass seq = extract_sequence(mdl) # write output chains = seq.keys() for chain in chains: fasta_lines = fasta_format(seq[chain], out_root+'_'+chain) outfile = out_root+'_'+chain+'.fasta' print outfile fout = open(outfile, 'w') fout.writelines(fasta_lines+'\n')
for j in aas: pairtable[i+'|'+j] = 0 ctable = {x : 0 for x in aas} for l in open(args.l): info = l.strip().split('/') subdir, name = info[-2], info[-1] cfile = General.changeExt(subdir + '/' + name, args.ext) if not os.path.isfile(cfile): continue for ll in open(cfile): # if ll.find('contact') != 0: # continue info2 = ll.strip().split() cond, aa1, aa2 = [info2[args.coln[i]] for i in range(3)] aa1, aa2 = PDB.t2s(aa1), PDB.t2s(aa2) # optional, for sc cond sc_cond = float(info2[-1]) if sc_cond > 0.01: continue if (float(cond) >= args.range[0]) and (float(cond) <= args.range[1]): ctable[aa1] += 1 ctable[aa2] += 1 pairtable[aa1+'|'+aa2] += 1 pairtable[aa2+'|'+aa1] += 1 sortkey = sorted(pairtable.keys()) out = open(args.o, 'w') for sk in sortkey:
dirs = [x for x in os.listdir('.') if os.path.isdir(x)] dirs.sort() odir = os.getcwd() for d in dirs: os.chdir(odir) os.chdir(d) pdbs = glob.glob('*.pdb') if len(pdbs) <= 1: continue pdbid = d.split('_')[0].lower() opdbf = args.sdir + '/' + pdbid + '.clean.pdb' cen = d.split('_')[1] cenres = PDB.getResByInd(opdbf, cen[0], cen[2:]) if cenres.getResname() == 'GLY': carbon = 'CA' else: carbon = 'CB' cencoords = cenres.getAtom(carbon).getCoords() out = d + '.' + args.o outfh = open(out, 'w') cons = [] for pdb in pdbs: con = General.getBase(pdb).split('_')[-1] cons.append([con[0], con[1:]]) for i in range(len(cons)): conres = PDB.getResByInd(opdbf, cons[i][0], cons[i][1]) carbon = 'CB'
# For every entry of the list file, write one tab-separated line per
# qualifying "contact" record of its environment file: the two amino-acid
# indices, the contact value, then sumcond / crwdnes / freedom of both
# residues as stored in the entry's shelve database.
for l in open(args.l):
    info = l.strip().split('/')
    subdir = info[-2]
    name = General.getBase(info[-1])
    pid, cid = name.split('_')

    # use the environment file for the whole protein but cmap file for single chain, don't know if this is good
    # in the case of freedom, will keep the chain name; (if needed will also change for older cmap files)
    envfile = subdir + '/' + name + '.' + args.e
    if not os.path.isfile(envfile):
        continue

    db = shelve.open(envfile + '.db')
    with open(envfile) as cf:
        for cfl in cf:
            if not cfl.startswith('contact'):
                continue
            res1, res2, cond, aa1, aa2 = cfl.strip().split()[1:]

            # not relevant chain
            if res1[0] != cid or res2[0] != cid:
                continue
            # not this residue in database
            if res1 not in db or res2 not in db:
                continue
            # non-standard amino acid
            if aa1 not in PDB.aaa2a or aa2 not in PDB.aaa2a:
                continue
            # contact value outside the requested (inclusive) range
            if not (args.range[0] <= float(cond) <= args.range[1]):
                continue

            db1 = db[res1]
            db2 = db[res2]
            fields = [aaindex[PDB.t2s(aa1)], aaindex[PDB.t2s(aa2)], cond,
                      db1['sumcond'], db1['crwdnes'], db1['freedom'],
                      db2['sumcond'], db2['crwdnes'], db2['freedom']]
            fh.write('\t'.join(map(str, fields)) + '\n')
    db.close()
fh.close()
if cond > args.ccut[0]: if res1 in contacts1: contacts1[res1].append(res2) if res2 in contacts1: contacts1[res2].append(res1) if cond > args.ccut[1]: if res1 in contacts2: contacts2[res1].append(res2) conds[res1][res2] = cond if res2 in contacts2: contacts2[res2].append(res1) conds[res2][res1] = cond if cmapl.startswith('freedom'): cmaplsp = cmapl.strip().split() resid, fd, phi, psi, resname = cmaplsp[1].replace(',', ''), float(cmaplsp[2]), float(cmaplsp[3]), float(cmaplsp[4]), cmaplsp[-1] aaidentity[resid] = PDB.t2s(resname) if resid in freedom: freedom[resid] = fd if resid in phipsi: phipsi[resid] = [phi, psi] dirbb, dirloc, dirtwo = './backbone/', './local/', './twores_nonr/' # read background potentials aatypes = 'A C D E F G H I K L M N P Q R S T V W Y' aatypes = aatypes.split() aaindex = {aatypes[x] : x for x in range(20)} # freedom potential freedom_lines = open(args.em).readlines() freedom_bins = [float(x) for x in freedom_lines[0].strip().split()]
def _rotamerLineSummary(line):
    """Condense a '<head>: a b; c d; ...' line into '<head> b d ...'.

    The text after the last ';' is ignored, and only the second
    whitespace-separated token of each ';'-separated entry is kept.
    """
    fields = line.strip().split(":")
    kept = [x.strip().split()[1] for x in fields[1].split(";")[:-1]]
    return fields[0] + " " + " ".join(kept)


def parseVerbose(vf, mut):
    """Parse the verbose file `vf` for the mutation described by `mut`.

    Args:
        vf: path of the verbose file.
        mut: object providing `c`, `n`, `w` and `m`; the central residue is
            identified in the file as `mut.c + "," + str(mut.n)`.

    Returns:
        A tuple `(contacts, denominator_cons, denominator_cen)`:
        - contacts: dict mapping a contact id to a list of strings
          "<mut.w or mut.m> <last token of the rotamer-pair line>".
        - denominator_cons: dict mapping a contact id to the condensed line
          that starts with that contact's residue name.
        - denominator_cen: list of condensed lines of the central position
          that start with PDB.s2t(mut.w) or PDB.s2t(mut.m).
    """
    center = mut.c + "," + str(mut.n)
    contacts = {}
    cons_aa = {}
    # the file is read three times; `with` guarantees the handle is released
    with open(vf) as vfh:
        # pass 1: rotamer pairs of every contact that involves the central residue
        con = None
        rotamerpairs = []
        for vfh_l in vfh:
            if vfh_l.startswith("crwdnes"):
                break
            # looking for contacts involving the central residue
            if vfh_l.startswith("contact"):
                if con is not None:
                    # first, dump the last contact
                    if len(rotamerpairs) > 0:
                        contacts[con] = rotamerpairs
                    con = None
                    rotamerpairs = []
                items = vfh_l.strip().split()
                if center in items:
                    # cenfirst is 1 when the central residue is listed first,
                    # so the partner sits one column further right
                    cenfirst = 0
                    if items[1] == center:
                        cenfirst = 1
                    contmp, con_aa = items[1 + cenfirst], items[4 + cenfirst]
                    # only look at contact in the same chain
                    if contmp.startswith(mut.c):
                        con = contmp
                        cons_aa[con] = con_aa
            elif con is not None:
                # reading rotamer pairs under the potential contacts
                items = vfh_l.strip().split()
                if items[0 + cenfirst * 3] == con_aa:
                    if items[3 - cenfirst * 3] == PDB.s2t(mut.w):
                        rotamerpairs.append(mut.w + " " + items[-1])
                    if items[3 - cenfirst * 3] == PDB.s2t(mut.m):
                        rotamerpairs.append(mut.m + " " + items[-1])
        # the last one can be left
        if len(rotamerpairs) > 0:
            contacts[con] = rotamerpairs

        # pass 2: now go back to the beginning of file and read the denominator
        # first look at the denominator of the central residue
        vfh.seek(0)
        denominator_cen = []
        cen = False
        for vfh_l in vfh:
            if vfh_l.startswith("end of rotamer filtering"):
                break
            if vfh_l.startswith("position"):
                if cen:
                    # the next "position" header ends the central residue's section
                    break
                if vfh_l.strip().split()[-1] == center:
                    cen = True
            elif vfh_l.startswith(PDB.s2t(mut.w)) or vfh_l.startswith(PDB.s2t(mut.m)):
                if cen:
                    denominator_cen.append(_rotamerLineSummary(vfh_l))

        # pass 3: one denominator line per contacting position
        vfh.seek(0)
        denominator_cons = {}
        con = None
        for vfh_l in vfh:
            if vfh_l.startswith("end of rotamer filtering"):
                break
            if vfh_l.startswith("position"):
                if vfh_l.strip().split()[-1] in contacts:
                    con = vfh_l.strip().split()[-1]
            elif con is not None:
                if vfh_l.startswith(cons_aa[con]):
                    denominator_cons[con] = _rotamerLineSummary(vfh_l)
                    con = None
    return contacts, denominator_cons, denominator_cen
def _contactDegree(numerator, aa, denominator_cen, con_entry):
    """Return numerator / denominator for one residue type, or 0 when numerator is 0.

    The denominator is the sum of pairwise products of the trailing fields of
    the central-residue entry and of the contact entry, scaled by the product
    of their first numeric fields.  `aa` is converted with PDB.s2t to find the
    central-residue entry; that lookup is skipped entirely when numerator is 0.
    """
    if numerator == 0:
        return 0
    aa3 = PDB.s2t(aa)
    cen_fields = [x for x in denominator_cen if x.split()[0] == aa3][0].split()[1:]
    con_fields = con_entry.split()[1:]
    denominator = 0
    for p1, p2 in itertools.product(cen_fields[1:], con_fields[1:]):
        denominator += float(p1) * float(p2)
    denominator *= float(cen_fields[0]) * float(con_fields[0])
    return numerator / denominator


def computeContacts(mut, contacts, denominator_cons, denominator_cen, outf):
    """Write the per-contact table for mutation `mut` to the file `outf`.

    Each row is tab-separated: running index, mut.p, central residue id,
    contact id, wild-type contact degree, mutant contact degree, the contact
    degree read from the normal cmap, and the contact's residue name.

    Args:
        mut: object providing `p`, `c`, `n`, `w` and `m`.
        contacts: dict of contact id -> list of "<aa> <value>" strings.
        denominator_cons: dict of contact id -> whitespace-separated entry.
        denominator_cen: list of whitespace-separated entries of the central residue.
        outf: path of the output file (overwritten).

    Relies on the module-level `args.sdir` to locate the normal cmap file.
    """
    # for each contact, compute two separated values for wild type and mutant;
    # contacts whose residue number is not purely numeric are left out
    contacts_keys = [x for x in contacts.keys() if x.split(",")[1].isdigit()]
    contacts_keys = sorted(contacts_keys, key=lambda x: int(x.split(",")[1]))
    reported = set(contacts_keys)
    center = mut.c + "," + str(mut.n)
    normal_cmap = args.sdir + "/" + mut.p.lower() + ".clean.cmap"
    idx = 1
    # `with` closes the output even if a lookup below raises
    with open(outf, "w") as outfh:
        for con in contacts_keys:
            numerator_w = sum([float(x.split()[1]) for x in contacts[con] if x.split()[0] == mut.w])
            numerator_m = sum([float(x.split()[1]) for x in contacts[con] if x.split()[0] == mut.m])
            contactdegree_w = _contactDegree(numerator_w, mut.w, denominator_cen, denominator_cons[con])
            contactdegree_m = _contactDegree(numerator_m, mut.m, denominator_cen, denominator_cons[con])
            contactdegree_w, contactdegree_m = format(contactdegree_w, ".4f"), format(contactdegree_m, ".4f")
            # also add normal contact degree between these two position:
            normal_cond = format(Fragment.getConD(normal_cmap, center, con), ".4f")
            row = [
                idx,
                mut.p,
                center,
                con,
                contactdegree_w,
                contactdegree_m,
                normal_cond,
                denominator_cons[con].split()[0],
            ]
            outfh.write("\t".join(map(str, row)) + "\n")
            idx += 1
        # there could be some contacts which are not detected by two side chains
        normal_cons, normal_conress = Fragment.contactList(normal_cmap, mut.c, mut.n, dcut=0.01)
        for k, normal_con in enumerate(normal_cons):
            if normal_con in reported:
                # already indexed contact
                continue
            normal_cond = format(Fragment.getConD(normal_cmap, center, normal_con), ".4f")
            row = [
                idx,
                mut.p,
                center,
                normal_con,
                0.0000,
                0.0000,
                normal_cond,
                normal_conress[k],
            ]
            outfh.write("\t".join(map(str, row)) + "\n")
            idx += 1
        # it would be also nice to have permanent contact, but they are rare and probably captured by environment
def ResidueListSequence(residues):
    """Return the string formed by PDB.residueLetter(r.resName) for each r in residues, in order."""
    import PDB
    return "".join(PDB.residueLetter(r.resName) for r in residues)
info = l.strip().split('/') subdir, name = info[-2], info[-1] # use the environment file for the whole protein but cmap file for single chain, don't know if this is good envfile = General.changeExt(subdir + '/' + name.split('_')[0], args.e) cmapfile = General.changeExt(subdir + '/' + name, args.c) if not (os.path.isfile(envfile) and os.path.isfile(cmapfile)): continue env = {} with open(envfile) as ef: f_csv = csv.DictReader(ef, delimiter = '\t') for row in f_csv: env[row['residue']] = row['environment_score'] cf = open(cmapfile) for cfl in cf: if not cfl.startswith('contact'): continue cfa = cfl.strip().split() res1, res2, cond, aa1, aa2 = cfa[1:] if (not res1 in env) or (not res2 in env): continue if (float(cond) >= args.range[0]) and (float(cond) <= args.range[1]): outstr = '\t'.join(map(str, [General.getBase(name), cond, PDB.t2s(aa1), PDB.t2s(aa2), env[res1], env[res2]])) + '\n' out.write(outstr)
def __init__(self, *items, **properties):
    """
    :param items: either a sequence of peptide chain objects, or
                  a string, which is interpreted as the name of a
                  database definition for a protein.
                  If that definition does not exist, the string
                  is taken to be the name of a PDB file, from which
                  all peptide chains are constructed and
                  assembled into a protein.
    :keyword model: one of "all" (all-atom), "no_hydrogens" or "none"
                    (no hydrogens),"polar_hydrogens" or "polar"
                    (united-atom with only polar hydrogens),
                    "polar_charmm" (like "polar", but defining
                    polar hydrogens like in the CHARMM force field),
                    "polar_opls" (like "polar", but defining
                    polar hydrogens like in the latest OPLS force field),
                    "calpha" (only the |C_alpha| atom of each residue).
                    Default is "all".
    :type model: str
    :keyword position: the center-of-mass position of the protein
    :type position: Scientific.Geometry.Vector
    :keyword name: a name for the protein
    :type name: str
    """
    # A single None argument leaves the instance completely unpopulated.
    if items == (None,):
        return
    self.name = ''
    # A single string argument: database definition first, PDB file as fallback.
    if len(items) == 1 and type(items[0]) == type(''):
        try:
            # Only the success/failure of the lookup is used; an IOError is
            # taken to mean that no database definition exists.
            filename = Database.databasePath(items[0], 'Proteins')
            found = 1
        except IOError:
            found = 0
        if found:
            blueprint = Database.BlueprintProtein(items[0])
            items = blueprint.chains
            # Copy every blueprint attribute except 'type' and 'chains'.
            for attr, value in vars(blueprint).items():
                if attr not in ['type', 'chains']:
                    setattr(self, attr, value)
        else:
            import PDB
            conf = PDB.PDBConfiguration(items[0])
            model = properties.get('model', 'all')
            items = conf.createPeptideChains(model)
    # Flatten items: chemical objects are taken as they are, anything else
    # is expanded as a sequence.
    molecules = []
    for i in items:
        if ChemicalObjects.isChemicalObject(i):
            molecules.append(i)
        else:
            molecules = molecules + list(i)
    # Record each molecule's input position; unnamed ones become 'chain<i>'
    # (backticks are the Python 2 spelling of repr()).
    for m, i in zip(molecules, range(len(molecules))):
        m._numbers = [i]
        if not m.name:
            m.name = 'chain'+`i`
    ss = self._findSSBridges(molecules)
    # new_mol maps each molecule to a (members, bonds) group.  Molecules
    # joined by a bond from ss end up sharing one group tuple.
    new_mol = {}
    for m in molecules:
        new_mol[m] = ([m],[])
    for bond in ss:
        m1 = new_mol[bond[0].topLevelChemicalObject()]
        m2 = new_mol[bond[1].topLevelChemicalObject()]
        if m1 == m2:
            # both ends already belong to the same group
            m1[1].append(bond)
        else:
            # merge the two groups and repoint every member at the merged tuple
            combined = (m1[0] + m2[0], m1[1] + m2[1] + [bond])
            for m in combined[0]:
                new_mol[m] = combined
    # Consume the groups one at a time; deleting all members of a group
    # ensures that each group is processed exactly once.
    self.molecules = []
    while new_mol:
        m = new_mol.values()[0]
        for i in m[0]:
            del new_mol[i]
        bonds = m[1]
        if len(m[0]) == 1:
            # single-member group: keep the molecule itself
            m = m[0][0]
            m._addSSBridges(bonds)
        else:
            # multi-member group: wrap the members in a ConnectedChains
            # object that carries the concatenated input positions
            numbers = sum((i._numbers for i in m[0]), [])
            m = ConnectedChains(m[0])
            m._numbers = numbers
            m._addSSBridges(bonds)
            m._finalize()
            for c in m:
                c.parent = self
        m.parent = self
        self.molecules.append(m)
    # Build the atom list and the chain table.  Each chain entry temporarily
    # carries its input position as a fourth element, used for sorting only.
    self.atoms = []
    self.chains = []
    for m in self.molecules:
        self.atoms.extend(m.atoms)
        if hasattr(m, 'is_connected_chains'):
            for c, name, i in zip(range(len(m)), m.chain_names, m._numbers):
                self.chains.append((m, c, name, i))
        else:
            try:
                name = m.name
            except AttributeError:
                name = ''
            self.chains.append((m, None, name, m._numbers[0]))
    # Sort by input position (Python 2 cmp-style sort), then drop that element.
    self.chains.sort(lambda c1, c2: cmp(c1[3], c2[3]))
    self.chains = map(lambda c: c[:3], self.chains)
    self.parent = None
    self.type = None
    self.configurations = {}
    # 'name' and 'position' are consumed here; whatever remains in
    # properties is handed to addProperties.
    try:
        self.name = properties['name']
        del properties['name']
    except KeyError:
        pass
    if properties.has_key('position'):
        self.translateTo(properties['position'])
        del properties['position']
    self.addProperties(properties)
    # Warn only when some, but not all, atoms lack a position.
    undefined = 0
    for a in self.atoms:
        if a.position() is None:
            undefined += 1
    if undefined > 0 and undefined != len(self.atoms):
        Utility.warning('Some atoms in a protein ' + 'have undefined positions.')
try: solsol_neighbours[sol2].append(sol1) except KeyError: solsol_neighbours.update({sol2:[sol1]}) # end try # end if # end for for key in solsol_neighbours: solsol_neighbours[key] = list(set(solsol_neighbours[key])) # end for solvents = mol_neighbours.keys() residues = sol_neighbours.keys() mdl = PDB(fname) Pred = dict([[str(mdl.resSeq(i)) + ':' + mdl.chainID(i), max(mdl.T()[i], 0)] for i in range(len(mdl))]) def separate_by_chains(S): keys = list(set([res.split(':')[-1] for res in S])) chains = dict([ [key, []] for key in keys ]) for res in S: key = res.split(':')[-1] chains[key].append(res) # end for return chains # end def # Assign primary probability measure to solvent Prob_sol = {} for solvent in solvents:
# For every sequence file, locate the residue args.resid in the matching PDB
# file and read that column (and optionally the contacting residue's column).
cid, resnum = args.resid
for seqf in seqfs:
    pdbf = General.changeExt(seqf.replace(args.head + '_', ''), 'pdb')
    if not os.path.isfile(pdbf):
        print(pdbf + " doesn't exist!")
        continue
    outf = General.changeExt(pdbf, args.o)

    if args.wgap is not None:
        # specific to gap: the PDB file is looked up under args.wgap/<base name>/
        assert args.conR == False, 'wgap and conR cannot be specified simultaneously'
        dirname = General.getBase(pdbf)
        pdbf = '/'.join([args.wgap, dirname, pdbf])

    index = PDB.findPositionInPDB(pdbf, resnum, cid)
    aacol = Analyze.readColumn(seqf, index, top=args.uplimit)

    if args.conR:
        # should contacting residue be constrained?
        # the contact id is the last '_' field of the sequence file's base
        # name: first character is the chain, the rest the residue number
        conid = General.getBase(seqf).split('_')[-1]
        ccid = conid[0]
        cresnum = conid[1:]
        cindex = PDB.findPositionInPDB(pdbf, cresnum, ccid)
        cres = PDB.t2s(PDB.getResByInd(pdbf, ccid, cresnum).getResname())
        caacol = Analyze.readColumn(seqf, cindex, top=args.uplimit)

    if args.env is not None:
        # environment corrected counts
        envf = General.getBase(seqf.replace(args.head, args.envhead)) + '.' + args.env
        if not os.path.isfile(envf):
            print(envf + " doesn't exist!")
            continue
# Visit every sub-directory of the current directory (sorted by name) and
# assemble one removeLocalRedundancy.py command line per fragment PDB file
# that has a match file and no existing output.
dirs = sorted(x for x in os.listdir('.') if os.path.isdir(x))
odir = os.getcwd()
for d in dirs:
    # directory names are relative to odir, so go back there first
    os.chdir(odir)
    os.chdir(d)
    pdbs = glob.glob('*.pdb')
    cmds = []
    # residue number: second '_' field of the directory name, minus its first two characters
    resn = int(d.split('_')[1][2:])
    for pdb in pdbs:
        matchf = args.head + '_' + General.changeExt(pdb, 'match')
        if not os.path.isfile(matchf):
            continue
        pos = PDB.findPositionInPDB(pdb, resn)
        # if output file is already there, skip the job
        if os.path.isfile('nr' + args.id + '_' + matchf):
            continue
        argv = [
            'python', selfbin + '/removeLocalRedundancy.py',
            '--m', matchf,
            '--cres', str(pos),
            '--id', args.id,
            '--outh', 'nr' + args.id,
        ]
        if args.db is not None:
            argv += ['--db', args.db]
        if args.conR:
            # contacting residue: third '_' field of the base name without its first character
            conresn = General.getBase(pdb).split('_')[2][1:]
            conpos = PDB.findPositionInPDB(pdb, conresn)
            argv += ['--conres', str(conpos)]
        if args.env is not None:
            argv += ['--env', args.env]
        cmd = ' '.join(argv)
        cmds.append(cmd)
W_e = p_self_m - p_self_w W_pp = p_pp_m - p_pp_w # select correct file for backbone term if using pp potential! # predicting contact potential cpf = '/'.join([args.conpot, mut.dir, mut.dir +'.cons']) assert os.path.isfile(cpf), cpf cpls = open(cpf).read().splitlines() # current all contacts above 0.01 (everything searched) are considered (but this has not considered the side chains, which may cause problems) cp_inds = [ x.strip().split() for x in cpls if float(x.split()[4]) > 0.01 ] ddG_cp = 0.0 if len(cp_inds) != 0: for i in range(len(cp_inds)): con_c, con_n = cp_inds[i][3].split(',') cres = PDB.t2s(cp_inds[i][-1]) cond = float(cp_inds[i][4]) cdbin = mustpress.determineBin(condbins, cond) conpots_file = conpots[priR_list[cdbin]] ddG_cpi = conpots_file[aaindex[mut.w], aaindex[cres]] - conpots_file[aaindex[mut.m], aaindex[cres]] ddG_cp -= ddG_cpi # predicting backbone related terms consf = '/'.join([args.bb[0], mut.dir, mut.dir + '.cons']) assert os.path.isfile(consf), consf conls = open(consf).read().splitlines() conds_inds = [ [float(x.split()[4]), x.split()[3]] for x in conls if float(x.split()[4]) > args.cutbb] # contact degrees, and contact residue number # conds_inds = [x for x in conds_inds if abs(int(x[1].split(',')[1]) - mut.n) > 5 ] for i in range(len(conds_inds)): countf = '/'.join([args.bb[0], mut.dir, mut.dir + '_' + conds_inds[i][1].replace(',', '') + '.' + args.bb[1]]) if os.path.isfile(countf):
import os
import sys
import General
import PDB

# Usage: <script> [list pdb file] [output .fa file]
# Writes, for every PDB path in the list, a '>' header line holding the path
# followed by one line per sequence returned by PDB.pdb2seq.
if len(sys.argv) - 1 != 2:
    # single-argument print() behaves the same on Python 2 and Python 3
    print('<usage> [list pdb file] [output .fa file]')
    sys.exit(0)

lst, fasta = sys.argv[1:]
# both files are closed on exit, so the output is flushed even on error
with open(fasta, 'w') as out, open(lst) as listfh:
    for l in listfh:
        pdbf = l.strip()
        seqs = PDB.pdb2seq(pdbf)
        out.write('>' + pdbf + '\n')
        for c in seqs:
            # because only single chain
            out.write(seqs[c] + '\n')
# This is part of DEPTH.
# DEPTH (Version: 2.0) computes the closest distance of a residue/atom to bulk solvent and predicts small molecule binding site of a protein.
# Copyright (C) 2013, Kuan Pern Tan, Nguyen Thanh Binh, Raghavan Varadarajan and M.S. Madhusudhan
#
# DEPTH is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
# DEPTH is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
# You should have received a copy of the GNU Lesser General Public License along with DEPTH. If not, see <http://www.gnu.org/licenses/>.

import sys
from PDB import *

# Print, space-separated and in first-seen order, the "<resSeq>:<chainID>"
# label of every entry of the model whose T value equals 1.
fname = sys.argv[1]
mdl = PDB(fname)

binding_residues = []
seen_residues = set()  # constant-time duplicate check; the list keeps the order
for i in range(len(mdl)):
    if mdl.T(i) == 1:
        res = str(mdl.resSeq(i)) + ':' + mdl.chainID(i)
        if res not in seen_residues:
            seen_residues.add(res)
            binding_residues.append(res)
        # end if
    # end if
# end for

out = ' '.join(binding_residues)
# single-argument print() behaves the same on Python 2 and Python 3
print(out)
# end for content.append(seq[cuts[-1]:]) content = '\n'.join(content).strip() + '\n' out = out + content return out # end def # read input fname = sys.argv[1] out_root = sys.argv[2] # get sequence mdl = PDB(fname) try: mdl.write("tmp.pdb") except: pass seq = extract_sequence(mdl) # write output chains = seq.keys() for chain in chains: fasta_lines = fasta_format(seq[chain], out_root + '_' + chain) outfile = out_root + '_' + chain + '.fasta' print outfile fout = open(outfile, 'w') fout.writelines(fasta_lines + '\n')
par.add_argument('--sl', help = 'use a searchDB list file')
par.add_argument('--o', required = True, help = 'name of the output file')
args = par.parse_args()

out = open(args.o, 'w')


def outputSeq(seqs, name, out, chains = None):
    """Write the sequences in `seqs` to `out` in FASTA form, sorted by key.

    Args:
        seqs: dict mapping a chain key to its sequence string.
        name: prefix of each header line; headers are '>' + name + '_' + key.
        out: writable file object.
        chains: optional iterable of keys to keep (a plain string is treated
            as a collection of one-character keys); None keeps every key.
    """
    if (chains is not None) and (not isinstance(chains, list)):
        chains = list(chains)
    # sorted() works on Python 2 and 3; dict.keys() has no .sort() on Python 3
    for k in sorted(seqs):
        if (chains is not None) and (k not in chains):
            continue
        out.write('>' + name + '_' + k + '\n')
        out.write(seqs[k] + '\n')


try:
    if args.sl is None:
        # protein list: each line is '<pid>_<cid>'; only the listed chain is written
        with open(args.pl) as listfh:
            for l in listfh:
                pid, cid = l.strip().split('_')
                p = pid.lower() + '.clean.pdb'
                seqs = PDB.pdb2seq(p)
                outputSeq(seqs, pid.lower(), out, cid)
    else:
        # searchDB list: each line is a path whose extension is replaced by 'pdb'
        with open(args.sl) as listfh:
            for l in listfh:
                p = General.changeExt(l.rstrip('\n'), 'pdb')
                seqs = PDB.pdb2seq(p)
                name = General.removePath(p).split('.')[0]
                outputSeq(seqs, name, out)
finally:
    # close the output even when one of the inputs cannot be read
    out.close()
# start and end residue for the peptide chain pepstart = p.numResidues() + 1 pepend = pepstart + pepchain.numResidues() -1 # add peptide backbone from template to receptor rec = p + pepchain.copy() writePDB('_start.pdb', rec) if args.flip: # flip the structure os.system('perl -w '+ SELFBIN + '/flipPeptideChirality.pl _start.pdb _startf.pdb 0') # mutate the chirality of the residues on the domain domainseq = [] for dres in p.iterResidues(): dresname = PDB.s2t(PDB.t2s(dres.getResname())) if dresname != 'GLY': domainseq.append('D'+ dresname) else: domainseq.append(dresname) pdz.normalMut('_startf.pdb', range(1, pepstart), domainseq, '_startf1.pdb') # mutate the residue on peptide pepseq = [] for pres in args.pseq: if len(pres) == 4: # if should be a D-residue, make the name to be a L-residue (since the domain has been flipped) pepseq.append(pres[1:]) if len(pres) == 3: if pres != 'GLY': pepseq.append('D' + pres) else:
# Write a table with one row per residue: residue number, one-letter residue
# name, and the normalised values exp(-weighted score / args.t) computed from
# that residue's score file.
weights = np.array(args.weights)
weight_fields = [str(x) for x in weights]
outf.write('#weights:' + '\t' + '\t'.join(weight_fields) + '\n')
aatypes = 'A C D E F G H I K L M N P Q R S T V W Y'
outf.write(aatypes + '\n')

# the first and the last residue are not processed
for i in range(1, len(residues) - 1):
    res = residues[i]
    resid = res.getChid() + str(res.getResnum())
    scf = pid + '_' + resid + '.' + args.ext
    if not os.path.isfile(scf):
        continue

    with open(scf) as sf:
        lines = sf.readlines()

    wsc_allaa = []
    for l in lines:
        lsp = l.strip().split()
        assert len(lsp) == len(weights)
        # the first field of the line is replaced by a constant 1.0
        sc = np.array([1.0] + lsp[1:], dtype='float')
        wsc_allaa.append((sc * weights).sum())

    p_aa = np.exp(-np.array(wsc_allaa) / args.t)
    p_aa = p_aa / p_aa.sum()
    row = [str(res.getResnum()), PDB.t2s(res.getResname())]
    row.append(' '.join([format(x, '.3f') for x in p_aa]))
    outf.write(' '.join(row) + '\n')
outf.close()
par.add_argument('--l', required = True, help = 'a list of pdb files')
par.add_argument('--o', required = True, help = 'an output file')
par.add_argument('--multi', action = 'store_true', help = 'if true, multi-chain is allowed and output')
args = par.parse_args()

# For every PDB file in the list, write one FASTA record per chain with the
# header '>' + path + '|' + chain id + '|' + sequence length.  Residue names
# missing from PDB.aaa2a are written as 'X'.
ofh = open(args.o, 'w')
try:
    with open(args.l) as listfh:
        for l in listfh:
            pdbf = l.strip()
            mol = parsePDB(pdbf)
            nchains = mol.numChains()
            if not args.multi:
                if nchains > 1:
                    # single-argument print() behaves the same on Python 2 and 3
                    print('Warning: ' + pdbf + ' has more than 1 chains')
                    continue
            # collect letters per chain and join once instead of repeated +=
            letters = {}
            for res in mol.iterResidues():
                cid, resname = res.getChid(), res.getResname()
                if resname not in PDB.aaa2a:
                    letters.setdefault(cid, []).append('X')
                else:
                    letters.setdefault(cid, []).append(PDB.t2s(resname))
            seqs = dict((cid, ''.join(chars)) for cid, chars in letters.items())
            # sorted() works on Python 2 and 3; dict.keys() has no .sort() on Python 3
            keys = sorted(seqs)
            for k in keys:
                ofh.write('>' + pdbf + '|' + k + '|' + str(len(seqs[k])) + '\n')
                ofh.write(seqs[k] + '\n')
finally:
    # the original never closed the output; close it even on error
    ofh.close()
# Submit one envForMatches_pair.py cluster job for every fragment PDB file in
# the current directory that has a match file and whose output does not exist
# yet and has not already been scheduled (tracked in `seen`).
pdbs = sorted(glob.glob('*.pdb'))
for pdb in pdbs:
    base = General.getBase(pdb)
    matchf = args.head + '_' + base + '.match'
    if not os.path.isfile(matchf):
        continue
    outname = General.getBase(matchf) + '.' + args.o
    if os.path.isfile(outname) or outname in seen:
        continue
    seen[outname] = 1

    pos = PDB.findPositionInPDB(pdb, str(mut.n), mut.c)
    if pos == -1:
        print('cannot found the residue in fragment pdb: ' + pdb)
        continue

    # the contacting residue is the last '_' field of the base name:
    # first character is the chain, the rest the residue number
    con = base.split('_')[-1]
    conc = con[0]
    conn = con[1:]
    conpos = PDB.findPositionInPDB(pdb, str(conn), conc)

    # both positions are passed shifted down by one
    argv = [
        'python', selfbin + '/envForMatches_pair.py',
        '--m', matchf,
        '--n', str(pos - 1), str(conpos - 1),
        '--o', outname,
    ]
    if args.uplimit is not None:
        argv += ['--uplimit', args.uplimit]
    cmd = ' '.join(argv)

    job = General.jobOnCluster([cmd], mut.dir, os.path.realpath(outname))
    job.submit(3)