def extractReference(p_list_ligand, p_dir_dataset, p_dir_result, substruct,
                     p_db_PDB="/home/borrel/PDB/",
                     p_fasta_global="/home/borrel/Yue_project/pdb_seqres.txt"):
    """Build the reference dataset of one substructure and log its resolutions.

    For every PDB ID listed under ``substruct`` in the ligand list, the PDB
    and fasta files are imported and split by chain. The resolution of each
    retained entry is then written to ``resolution_ref.txt`` and passed to
    ``runOtherSoft.Rhistogram``.

    Args:
        p_list_ligand: path given to ``tool.parseLigandPDBList``.
        p_dir_dataset: directory given to the PDB and fasta importers.
        p_dir_result: prefix of the resolution file; it is concatenated as
            is, so it has to end with a path separator.
        substruct: key selecting the PDB IDs in the parsed ligand list.
        p_db_PDB: value forwarded as ``dbPDB`` to ``downloadFile.importPDB``.
        p_fasta_global: value forwarded as ``fastaGlobal`` to
            ``downloadFile.importFasta``.

    Returns:
        dict keyed by upper-case PDB ID; each value holds the keys
        ``p_pdb``, ``p_fasta``, ``p_pdb_chain``, ``p_fasta_chain``,
        ``conserve`` (always 1) and ``RX`` (float resolution, 100.0 when the
        parsed value cannot be converted to a float).
    """
    # struct reference
    d_dataset = {}

    # retrieve list of ligand in PDB
    d_ligand = tool.parseLigandPDBList(p_list_ligand)

    # download PDB and fasta associated
    for PDB_ID in d_ligand[substruct]:
        PDB_ID = PDB_ID.upper()
        p_pdb = downloadFile.importPDB(
            PDB_ID, p_dir_dataset, dir_by_PDB=1, debug=1, dbPDB=p_db_PDB)
        p_fasta = downloadFile.importFasta(
            PDB_ID, p_dir_dataset, dir_by_PDB=1, debug=1,
            fastaGlobal=p_fasta_global)

        # an entry is kept only when neither importer returned 0
        if p_pdb == 0 or p_fasta == 0:
            continue

        l_p_pdb_chain = separeByChain(p_pdb)
        l_p_fasta_chain = separeChainFasta(p_fasta)
        d_dataset[PDB_ID] = {
            "p_pdb": p_pdb,
            "p_fasta": p_fasta,
            "p_pdb_chain": l_p_pdb_chain,
            "p_fasta_chain": l_p_fasta_chain,
            "conserve": 1,
        }

    # plot resolution
    p_file_RX = p_dir_result + "resolution_ref.txt"
    # the context manager closes the file even if the parsing below raises
    with open(p_file_RX, "w") as file_RX:
        for PDB_ID, d_entry in d_dataset.items():
            RX = parsePDB.resolution(d_entry["p_pdb"])
            try:
                d_entry["RX"] = float(RX)
            except (TypeError, ValueError):
                # non numeric resolution: fall back on a large value
                d_entry["RX"] = 100.0
            # the raw parsed value is written, not the fallback
            file_RX.write(PDB_ID + "\t" + str(RX) + "\n")

    runOtherSoft.Rhistogram(p_file_RX, "RX_ref_no_filter")

    return d_dataset
def extractReference(p_list_ligand, p_dir_dataset, p_dir_result, substruct,
                     p_db_PDB="/home/borrel/PDB/",
                     p_fasta_global="/home/borrel/Yue_project/pdb_seqres.txt"):
    """Build the reference dataset of one substructure and log its resolutions.

    For every PDB ID listed under ``substruct`` in the ligand list, the PDB
    and fasta files are imported and split by chain. The resolution of each
    retained entry is then written to ``resolution_ref.txt`` and passed to
    ``runOtherSoft.Rhistogram``.

    Args:
        p_list_ligand: path given to ``tool.parseLigandPDBList``.
        p_dir_dataset: directory given to the PDB and fasta importers.
        p_dir_result: prefix of the resolution file; it is concatenated as
            is, so it has to end with a path separator.
        substruct: key selecting the PDB IDs in the parsed ligand list.
        p_db_PDB: value forwarded as ``dbPDB`` to ``downloadFile.importPDB``.
        p_fasta_global: value forwarded as ``fastaGlobal`` to
            ``downloadFile.importFasta``.

    Returns:
        dict keyed by upper-case PDB ID; each value holds the keys
        ``p_pdb``, ``p_fasta``, ``p_pdb_chain``, ``p_fasta_chain``,
        ``conserve`` (always 1) and ``RX`` (float resolution, 100.0 when the
        parsed value cannot be converted to a float).
    """
    # struct reference
    d_dataset = {}

    # retrieve list of ligand in PDB
    d_ligand = tool.parseLigandPDBList(p_list_ligand)

    # download PDB and fasta associated
    for PDB_ID in d_ligand[substruct]:
        PDB_ID = PDB_ID.upper()
        p_pdb = downloadFile.importPDB(
            PDB_ID, p_dir_dataset, dir_by_PDB=1, debug=1, dbPDB=p_db_PDB)
        p_fasta = downloadFile.importFasta(
            PDB_ID, p_dir_dataset, dir_by_PDB=1, debug=1,
            fastaGlobal=p_fasta_global)

        # an entry is kept only when neither importer returned 0
        if p_pdb == 0 or p_fasta == 0:
            continue

        l_p_pdb_chain = separeByChain(p_pdb)
        l_p_fasta_chain = separeChainFasta(p_fasta)
        d_dataset[PDB_ID] = {
            "p_pdb": p_pdb,
            "p_fasta": p_fasta,
            "p_pdb_chain": l_p_pdb_chain,
            "p_fasta_chain": l_p_fasta_chain,
            "conserve": 1,
        }

    # plot resolution
    p_file_RX = p_dir_result + "resolution_ref.txt"
    # the context manager closes the file even if the parsing below raises
    with open(p_file_RX, "w") as file_RX:
        for PDB_ID, d_entry in d_dataset.items():
            RX = parsePDB.resolution(d_entry["p_pdb"])
            try:
                d_entry["RX"] = float(RX)
            except (TypeError, ValueError):
                # non numeric resolution: fall back on a large value
                d_entry["RX"] = 100.0
            # the raw parsed value is written, not the fallback
            file_RX.write(PDB_ID + "\t" + str(RX) + "\n")

    runOtherSoft.Rhistogram(p_file_RX, "RX_ref_no_filter")

    return d_dataset
def _incrementCount(d_count, key):
    """Add one to the counter stored under key, starting from 0 if absent."""
    d_count[key] = d_count.get(key, 0) + 1


def _writeCountTable(filout, d_count):
    """Write the keys of d_count on one tab-separated line, counts on the next."""
    filout.write("\t".join(d_count.keys()) + "\n")
    filout.write("\t".join([str(x) for x in d_count.values()]) + "\n")


def _writeQueryStat(filout, ligand, d_nb_query):
    """Write sum, mean +/- std and max of the query counts of one ligand."""
    l_nb_query = list(d_nb_query.values())
    if not l_nb_query:
        # no reference folder found: max() would raise on an empty sequence
        filout.write(ligand + ": 0\n")
        return

    filout.write(ligand + ": " + str(np.sum(l_nb_query)) + "\n")
    filout.write(ligand + ": " + str(np.mean(l_nb_query)) + "+/-" +
                 str(np.std(l_nb_query)) + "\n")
    # first reference (in dict order) that reaches the maximum
    ref_max = max(d_nb_query, key=d_nb_query.get)
    filout.write("MAX " + str(ligand) + ": " + str(d_nb_query[ref_max]) +
                 " " + str(ref_max) + "\n")


def _writeLigandFamilyStat(filout, ligand, d_family_all):
    """Write family files and query statistics of one ligand.

    d_family_all is updated in place with the family counts of the ligand.
    """
    d_nb_query = {}
    d_family = {}

    p_filout_family = pathManage.result() + "reference_family_" + ligand + ".txt"
    p_filout_family_count = pathManage.result() + "count_family_" + ligand + ".txt"

    with open(p_filout_family, "w") as filout_family:
        with open(p_filout_family_count, "w") as filout_family_count:
            pr_result_ligand = pathManage.result(ligand)
            for f in listdir(pr_result_ligand):
                pr_ref = pr_result_ligand + "/" + f
                # only folders with a 4 characters name are considered
                if not (path.isdir(pr_ref) and len(f) == 4):
                    continue

                # count by family, the last element is used as family name
                family_ref = analysis.findFamily(
                    f, pathManage.findFamilyFile(ligand))
                filout_family.write("\t".join(family_ref) + "\n")
                _incrementCount(d_family, family_ref[-1])

                # file all
                _incrementCount(d_family_all, family_ref[-1])

                # count queries: files with "CX" in the name
                d_nb_query[f] = 0
                for file_query in listdir(pr_ref + "/"):
                    if search("CX", file_query):
                        d_nb_query[f] = d_nb_query[f] + 1

            _writeQueryStat(filout, ligand, d_nb_query)

            # family
            _writeCountTable(filout_family_count, d_family)

    # both files are closed before the plot is run on the count file
    runOtherSoft.piePlot(p_filout_family_count)


def _countSubref(pr_result_ligand, thresold_sheap):
    """Count substituent files by atom, in total and above the threshold.

    Returns the two dictionaries (all counts, counts above thresold_sheap).
    """
    d_nb_sub = {}
    d_nb_sub_sheap = {}

    for ref in listdir(pr_result_ligand):
        pr_ref = pr_result_ligand + "/" + ref
        if not (path.isdir(pr_ref) and len(ref) == 4):
            continue

        for file_query in listdir(pr_ref + "/"):
            # NOTE(review): search is presumably re.search, in which case
            # the "." of ".pdb" matches any character; kept as is - confirm
            if not (search("substituent", file_query) and
                    search(".pdb", file_query)):
                continue

            l_elem = file_query.split("_")
            atom_substituate = l_elem[-2]
            try:
                # last field without its 4 trailing characters
                value_sheap = float(l_elem[-1][:-4])
            except ValueError:
                # file name without a numeric score: not counted
                continue

            _incrementCount(d_nb_sub, atom_substituate)
            if value_sheap > thresold_sheap:
                _incrementCount(d_nb_sub_sheap, atom_substituate)

    return d_nb_sub, d_nb_sub_sheap


def qualityExtraction(l_ligand, name_folder, p_list_ligand, thresold_sheap):
    """Write a quality report of the extraction for a list of ligands.

    The report ``quality_extraction.txt`` is written in the folder returned
    by ``pathManage.result("final_" + name_folder)`` and has four sections:
    number of PDB by ligand, number of references by ligand, query
    statistics by reference, and number of substituent files by atom.
    Family files (by ligand and for all ligands) are written in the folder
    returned by ``pathManage.result()`` and given to
    ``runOtherSoft.piePlot``.

    Args:
        l_ligand: list of ligand names.
        name_folder: suffix of the final result folder name.
        p_list_ligand: path given to ``tool.parseLigandPDBList``.
        thresold_sheap: substituent files whose score, read from the file
            name, is strictly above this value are also counted separately.
    """
    pr_result = pathManage.result("final_" + name_folder)

    # context managers close every file even when a step below raises
    with open(pr_result + "quality_extraction.txt", "w") as filout:

        # number PDB by ligand, without filter
        filout.write("Number PDB by ligand:\n")
        d_dataset = tool.parseLigandPDBList(p_list_ligand)
        for ligand in l_ligand:
            filout.write(str(ligand) + ": " + str(len(d_dataset[ligand])) + "\n")

        # number references
        filout.write("\n*************\n\nNumber references by ligands:\n")
        for ligand in l_ligand:
            pr_result_ligand = pathManage.result(ligand)
            # starts at -2, so two folders are left out of the count
            # NOTE(review): presumably two non reference folders - confirm
            nb_ref = -2
            for f in listdir(pr_result_ligand):
                if path.isdir(pr_result_ligand + "/" + f):
                    nb_ref = nb_ref + 1
            filout.write(ligand + ": " + str(nb_ref) + "\n")

        # number of query by ref in means and max and min (after blast)
        filout.write("\n*************\n\nNumber means queries by references:\n")
        p_family_all = pathManage.result() + "reference_family_all.txt"
        d_family_all = {}
        with open(p_family_all, "w") as filout_family_all:
            for ligand in l_ligand:
                _writeLigandFamilyStat(filout, ligand, d_family_all)

            # all family
            _writeCountTable(filout_family_all, d_family_all)
        runOtherSoft.piePlot(p_family_all)

        # number subref by ligand
        filout.write("\n*************\n\nNumber of subref considered:\n")
        for ligand in l_ligand:
            d_nb_sub, d_nb_sub_sheap = _countSubref(
                pathManage.result(ligand), thresold_sheap)

            filout.write("\n" + ligand + "\n")
            for atom_substituate in d_nb_sub:
                filout.write(atom_substituate + ": " +
                             str(d_nb_sub[atom_substituate]) + "\n")
                # atoms never above the threshold are reported with 0
                filout.write(atom_substituate + " ShaEP: " +
                             str(d_nb_sub_sheap.get(atom_substituate, 0)) + "\n")