def countingSubstituent(name_final, debug=1):
    """
    Count what the file names of a final result folder describe and write
    the corresponding tables (and plots) in its "counting/" sub-folder.

    The folder pathManage.result("final_" + name_final) is walked as
    <type_subref>/<sub>/<PDB ref folder>/{LSR,LGD,BS}/ ; the entries of a
    "cycle" sub folder are expanded one level deeper ("cycle/<name>").

    Files written in "counting/":
      - one file per second "_" field of the non-reference LSR files
        (count by sub folder), each one sent to runOtherSoft.piePlot
      - "count_ligand": by ligand of the LGD files, number of occurrences,
        clusters and families, sent to runOtherSoft.barplot
      - "CountByLigandRef": number of reference LSR files by reference
        ligand and reference type
      - "count_pr": reference proteins, query proteins and query ligands
        by reference ligand of the BS files

    args: -> name_final: suffix of the result folder name
          -> debug: when true, print the visited folders / files
    return: None
    """

    pr_final_folder = pathManage.result("final_" + name_final)

    d_count = {}     # {LSR name field: {sub folder: nb of LSR files}}
    d_lig = {}       # {ligand: {"count": int, "group": [], "family": []}}
    d_by_ref = {}    # {ligand ref: {type ref: nb of LSR reference files}}
    d_count_pr = {}  # {ligand ref: {"pr ref": [], "pr queries": [],
                     #               "lig queries": [], family: int}}

    # NOTE: every print below is a single-argument call so that the output
    # is the same as the former Python 2 print statements.
    if debug:
        print("1 " + pr_final_folder)

    for pr_type_subref in listdir(pr_final_folder):
        pr_type = pr_final_folder + pr_type_subref + "/"

        # case where pr type is a file not a folder
        try:
            l_pr_sub = listdir(pr_type)
        except OSError:
            continue
        if debug:
            print("2 " + pr_type)

        # case cycle: replace the "cycle" entry by its own sub folders
        if "cycle" in l_pr_sub:
            l_pr_sub.remove("cycle")
            for second_sub in listdir(pr_type + "cycle/"):
                l_pr_sub.append("cycle/" + second_sub)

        for pr_sub in l_pr_sub:
            # case where pr_type_substituent is a file not a folder
            try:
                l_pr_PDBref = listdir(pr_type + pr_sub + "/")
            except OSError:
                continue
            if debug:
                print("3 " + pr_final_folder + pr_type_subref + " " + pr_sub)

            for pr_PDBref in l_pr_PDBref:
                family_ref = pr_PDBref.split("-")[0]
                group_ref = pr_PDBref.split("_")[0].split("-")[-1]
                pr_ref_folder = pr_type + pr_sub + "/" + pr_PDBref
                pr_LGD = pr_ref_folder + "/LGD/"
                pr_LSR = pr_ref_folder + "/LSR/"
                pr_BS = pr_ref_folder + "/BS/"

                if debug:
                    print("4 " + pr_LGD)
                    print("4 " + pr_BS)
                    print("4 " + pr_LSR)

                # The reference ligand is reset for every PDB folder.
                # It used to survive from the previous folder, so the BS
                # queries of a folder without reference file were counted
                # for an unrelated ligand (or raised NameError / KeyError).
                lig_ref = None

                ################
                #  folder LSR  #
                ################
                for file_LSR in listdir(pr_LSR):
                    # -> count by type sub reference
                    if search("LSR_", file_LSR) and file_LSR.split("_")[1] != "REF":
                        ligand_sub = file_LSR.split("_")[1]
                        if debug:
                            print("5 " + file_LSR)
                        d_sub = d_count.setdefault(ligand_sub, {})
                        d_sub[pr_sub] = d_sub.get(pr_sub, 0) + 1

                    # complete LSR, only the reference is counted
                    elif search("LSR", file_LSR) and search("REF_", file_LSR):
                        lig_ref = file_LSR.split("_")[2][:3]
                        type_ref = pr_type_subref.split("_")[0]
                        d_type = d_by_ref.setdefault(lig_ref, {})
                        d_type[type_ref] = d_type.get(type_ref, 0) + 1

                ################
                #  folder LGD  #
                ################
                for file_LGD in listdir(pr_LGD):
                    if not search("LGD", file_LGD):
                        continue
                    ligand = file_LGD.split("_")[1]
                    if ligand == "REF":
                        continue
                    if ligand not in d_lig:
                        d_lig[ligand] = {"count": 0, "group": [], "family": []}
                    d_lig[ligand]["count"] = d_lig[ligand]["count"] + 1
                    d_lig[ligand]["family"].append(str(family_ref))
                    d_lig[ligand]["group"].append(str(group_ref))

                ###############
                #  folder BS  #
                ###############
                l_file_BS = listdir(pr_BS)

                # BS -> reference
                for file_BS in l_file_BS:
                    if not search("BS_REF", file_BS):
                        continue
                    l_field = file_BS.split("_")
                    lig_ref = l_field[2]
                    pr_ref = l_field[3].split(".")[0]
                    print(lig_ref + " " + pr_ref + " *****")

                    if lig_ref not in d_count_pr:
                        d_count_pr[lig_ref] = {"pr ref": [], "pr queries": [], "lig queries": []}

                    # the family is counted once by reference protein
                    if pr_ref in d_count_pr[lig_ref]["pr ref"]:
                        continue
                    d_count_pr[lig_ref]["pr ref"].append(pr_ref)

                    # best effort: a reference without family is still counted
                    try:
                        family = analysis.findFamily(pr_ref, pathManage.dataset(lig_ref) + "family_PDB.txt")
                        # NOTE(review): qualityExtraction reads the family
                        # name as findFamily(...)[-1]; a list can not be a
                        # dictionary key, so the last element is used here
                        # as well -- confirm the return type of findFamily.
                        if isinstance(family, list):
                            family = family[-1]
                        d_count_pr[lig_ref][family] = d_count_pr[lig_ref].get(family, 0) + 1
                    except Exception as error:
                        if debug:
                            print("family count skipped for " + str(pr_ref) + ": " + str(error))

                # BS -> query
                for file_BS in l_file_BS:
                    if search("BS_REF", file_BS):
                        continue
                    l_field = file_BS.split("_")
                    lig_querie = l_field[1]
                    prot_querie = l_field[2][0:4]
                    print(prot_querie + " " + lig_querie + " *******")

                    # ligand reference defined by the files of this folder;
                    # without it the query can not be assigned
                    if lig_ref is None:
                        continue
                    if lig_ref not in d_count_pr:
                        d_count_pr[lig_ref] = {"pr ref": [], "pr queries": [], "lig queries": []}
                    d_count_pr[lig_ref]["pr queries"].append(prot_querie)
                    d_count_pr[lig_ref]["lig queries"].append(lig_querie)

    ##################
    # write and plot #
    ##################
    pr_result = pathManage.generatePath(pr_final_folder + "counting/")

    for ligand_sub in d_count:
        p_filout = pr_result + ligand_sub
        with open(p_filout, "w") as filout:
            filout.write("\t".join(d_count[ligand_sub].keys()) + "\n")
            filout.write("\t".join([str(x) for x in d_count[ligand_sub].values()]) + "\n")
        # the file has to be closed before it is plotted
        runOtherSoft.piePlot(p_filout)

    with open(pr_result + "count_ligand", "w") as filout_lig:
        filout_lig.write("Ligand ID\tNumber of occurences in the dataset\tNumber of different clusters\tList of clusters\tList of protein families\n")
        for lig in d_lig:
            # NOTE(review): the former test "d_lig[lig] > 1" compared a
            # dictionary with an integer, which is always true in Python 2,
            # so every ligand is written. If only the ligands found more
            # than once are wanted, test d_lig[lig]["count"] > 1 here.
            d_one_lig = d_lig[lig]
            filout_lig.write(str(lig) + "\t" + str(d_one_lig["count"]) + "\t" + str(len(set(d_one_lig["group"]))) + "\t" + " ".join(d_one_lig["group"]) + "\t" + " ".join(d_one_lig["family"]) + "\n")

    with open(pr_result + "CountByLigandRef", "w") as filout_LSR_lig:
        for lig_ref in d_by_ref:
            filout_LSR_lig.write("====" + str(lig_ref) + "====\n")
            for sub_ref in d_by_ref[lig_ref]:
                filout_LSR_lig.write(str(sub_ref) + ": " + str(d_by_ref[lig_ref][sub_ref]) + "\n")

    with open(pr_result + "count_pr", "w") as filout_pr_count:
        for lig in d_count_pr:
            d_pr = d_count_pr[lig]
            filout_pr_count.write("====" + str(lig) + "====\n")
            filout_pr_count.write("nb ref pr: " + str(len(d_pr["pr ref"])) + "\n")
            filout_pr_count.write("nb querie pr: " + str(len(d_pr["pr queries"])) + "\n")
            filout_pr_count.write("nb ligand queries: " + str(len(d_pr["lig queries"])) + "\n")
            # every other key is a family counter
            for family in d_pr:
                if family != "pr ref" and family != "pr queries" and family != "lig queries":
                    filout_pr_count.write("Ref " + str(family) + ": " + str(d_pr[family]) + "\n")

    runOtherSoft.barplot(pr_result + "count_ligand")
def qualityExtraction(l_ligand, name_folder, p_list_ligand, thresold_sheap):
    """
    Write "quality_extraction.txt" in pathManage.result("final_" + name_folder)
    and the family tables of the reference proteins.

    Sections of the report, each one with a line or a paragraph by ligand:
      1. number of PDB by ligand, read with tool.parseLigandPDBList
      2. number of folders found in pathManage.result(ligand), minus 2
      3. for the reference folders (4 characters name): sum, mean +/- std
         and max of the number of files with "CX" in their name
      4. number of "substituent" pdb files by atom (field before the last
         "_" of the file name), and how many have a score (last field of
         the name) strictly above thresold_sheap

    Section 3 also writes in pathManage.result() the files
    "reference_family_<ligand>.txt", "count_family_<ligand>.txt" and
    "reference_family_all.txt"; the count files are sent to
    runOtherSoft.piePlot.

    args: -> l_ligand: list of ligand names
          -> name_folder: suffix of the final result folder name
          -> p_list_ligand: path of the file read by tool.parseLigandPDBList
          -> thresold_sheap: score limit used in section 4
    return: None
    """

    pr_result = pathManage.result("final_" + name_folder)

    with open(pr_result + "quality_extraction.txt", "w") as filout:

        # number PDB by ligand, without filter
        filout.write("Number PDB by ligand:\n")
        d_dataset = tool.parseLigandPDBList(p_list_ligand)
        for ligand in l_ligand:
            filout.write(str(ligand) + ": " + str(len(d_dataset[ligand])) + "\n")

        # number references
        filout.write("\n*************\n\nNumber references by ligands:\n")
        for ligand in l_ligand:
            pr_result_ligand = pathManage.result(ligand)
            # starts at -2: presumably two folders of the ligand result
            # directory are not references -- TODO confirm
            nb_ref = -2
            for f in listdir(pr_result_ligand):
                if path.isdir(pr_result_ligand + "/" + f):
                    nb_ref = nb_ref + 1
            filout.write(ligand + ": " + str(nb_ref) + "\n")

        # number of query by ref in means and max (after blast)
        filout.write("\n*************\n\nNumber means queries by references:\n")
        p_family_all = pathManage.result() + "reference_family_all.txt"
        d_family_all = {}
        for ligand in l_ligand:
            d_nb_query = {}
            d_family = {}
            p_filout_family = pathManage.result() + "reference_family_" + ligand + ".txt"
            p_filout_family_count = pathManage.result() + "count_family_" + ligand + ".txt"
            pr_result_ligand = pathManage.result(ligand)

            with open(p_filout_family, "w") as filout_family:
                for f in listdir(pr_result_ligand):
                    # a reference is a folder with a 4 characters name
                    if not (path.isdir(pr_result_ligand + "/" + f) and len(f) == 4):
                        continue

                    # count by family, for the ligand and for all ligands
                    family_ref = analysis.findFamily(f, pathManage.findFamilyFile(ligand))
                    filout_family.write("\t".join(family_ref) + "\n")
                    family = family_ref[-1]
                    d_family[family] = d_family.get(family, 0) + 1
                    d_family_all[family] = d_family_all.get(family, 0) + 1

                    # count number of queries of the reference
                    nb_query = 0
                    for file_query in listdir(pr_result_ligand + "/" + f + "/"):
                        if search("CX", file_query):
                            nb_query = nb_query + 1
                    d_nb_query[f] = nb_query

            l_nb_query = list(d_nb_query.values())
            if l_nb_query:
                # first reference with the maximum number of queries
                ref_max = max(d_nb_query, key=d_nb_query.get)
                filout.write(ligand + ": " + str(np.sum(l_nb_query)) + "\n")
                filout.write(ligand + ": " + str(np.mean(l_nb_query)) + "+/-" + str(np.std(l_nb_query)) + "\n")
                filout.write("MAX " + str(ligand) + ": " + str(d_nb_query[ref_max]) + " " + str(ref_max) + "\n")
            else:
                # no reference folder: max() of an empty list used to stop
                # the run here with every output file left open
                filout.write(ligand + ": 0\n")
                filout.write(ligand + ": NA+/-NA\n")
                filout.write("MAX " + str(ligand) + ": NA\n")

            # family
            with open(p_filout_family_count, "w") as filout_family_count:
                filout_family_count.write("\t".join(d_family.keys()) + "\n")
                filout_family_count.write("\t".join([str(x) for x in d_family.values()]) + "\n")
            # the file has to be closed before it is plotted
            runOtherSoft.piePlot(p_filout_family_count)

        # all family
        with open(p_family_all, "w") as filout_family_all:
            filout_family_all.write("\t".join(d_family_all.keys()) + "\n")
            filout_family_all.write("\t".join([str(x) for x in d_family_all.values()]) + "\n")
        runOtherSoft.piePlot(p_family_all)

        # number subref by ligand
        filout.write("\n*************\n\nNumber of subref considered:\n")
        for ligand in l_ligand:
            d_nb_sub = {}
            d_nb_sub_sheap = {}
            pr_result_ligand = pathManage.result(ligand)
            for ref in listdir(pr_result_ligand):
                if not (path.isdir(pr_result_ligand + "/" + ref) and len(ref) == 4):
                    continue
                for file_query in listdir(pr_result_ligand + "/" + ref + "/"):
                    if not (search("substituent", file_query) and search(".pdb", file_query)):
                        continue
                    l_field = file_query.split("_")
                    atom_substituate = l_field[-2]
                    # last field without its 4 characters extension; files
                    # where it is not a number are ignored
                    try:
                        value_sheap = float(l_field[-1][:-4])
                    except ValueError:
                        continue
                    d_nb_sub[atom_substituate] = d_nb_sub.get(atom_substituate, 0) + 1
                    if value_sheap > thresold_sheap:
                        d_nb_sub_sheap[atom_substituate] = d_nb_sub_sheap.get(atom_substituate, 0) + 1

            filout.write("\n" + ligand + "\n")
            for atom_substituate in d_nb_sub:
                filout.write(atom_substituate + ": " + str(d_nb_sub[atom_substituate]) + "\n")
                # 0 when no file of this atom is above the threshold
                filout.write(atom_substituate + " ShaEP: " + str(d_nb_sub_sheap.get(atom_substituate, 0)) + "\n")