def controlResult(l_name_ligand):
    """Write a per-ligand count of ShaEP ``.hit`` files to ``sheap_control.txt``.

    For every ligand name, each 4-character entry of the ligand result
    directory is scanned.  Three counters are reported per ligand:

    * files whose name matches ``.hit``,
    * among those, files smaller than 100 bytes (reported as "wrong"),
    * among the small files, those whose name matches ``ribose``.

    Args:
        l_name_ligand: iterable of ligand names understood by
            ``pathManage.result``.

    Returns:
        None.  The report is written to
        ``pathManage.result() + "sheap_control.txt"``.
    """
    # The report is opened with a context manager so that it is flushed and
    # closed even when a directory listing fails (the handle used to leak).
    with open(pathManage.result() + "sheap_control.txt", "w") as filout:
        for name_ligand in l_name_ligand:
            count_sheap = 0
            count_sheap_out = 0
            count_ribose = 0

            pr_result = pathManage.result(name_ligand)
            for ref_PDB in listdir(pr_result):
                # only 4-character names are treated as reference folders
                if len(ref_PDB) != 4:
                    continue
                print(ref_PDB)

                pr_ref = pr_result + ref_PDB
                for file_ref in listdir(pr_ref):
                    if not search(".hit", file_ref):
                        continue
                    count_sheap = count_sheap + 1
                    # NOTE(review): the nesting of the two checks below was
                    # reconstructed from a whitespace-mangled source; confirm
                    # that ribose files are meant to be counted only among
                    # the undersized .hit files.
                    if path.getsize(pr_ref + "/" + file_ref) < 100:
                        count_sheap_out = count_sheap_out + 1
                        if search("ribose", file_ref):
                            count_ribose = count_ribose + 1

            filout.write(name_ligand + "\n")
            filout.write("count Shaep:" + str(count_sheap) + "\n")
            filout.write("count Shaep wrong:" + str(count_sheap_out) + "\n")
            filout.write("count Shaep ribose:" + str(count_ribose) + "\n")
            filout.write("******************\n")
def builtDatasetGlobal(p_list_ligand, ligand_ID, thresold_RX=2.5, thresold_blast=1e-4, verbose=1):
    """Run the whole dataset-building pipeline for one ligand.

    The steps are executed in order: reference extraction, family file
    writing, reference filtering, uniqueness filtering, BLAST run,
    BLAST-result filtering and final clean-up of the dataset folder.

    Args:
        p_list_ligand: forwarded unchanged to ``extractReference``.
        ligand_ID: ligand identifier used to build dataset/result paths.
        thresold_RX: threshold forwarded to the two filtering steps.
        thresold_blast: threshold forwarded to ``filterBlastResult``.
        verbose: when true, the dataset is displayed after every step.

    Returns:
        None.
    """

    def _view(dataset):
        # Display the dataset only in verbose mode.
        if verbose:
            toolViewStructDataset(dataset)

    # dataset folder and working folder of the building step
    dataset_dir = pathManage.dataset(ligand_ID)
    building_dir = pathManage.result(ligand_ID + "/datasetBuilding")

    # step 1: extract the references and write the name/family file
    references = extractReference(p_list_ligand, dataset_dir, building_dir, ligand_ID)
    analysis.familyPDBRef(references, dataset_dir + "family_PDB.txt")
    _view(references)

    # step 2: select references (remove on RX and same chain)
    align_dir = pathManage.result(ligand_ID + "/datasetBuilding/aligmentRef")
    filterReferenceByOne(references, align_dir, ligand_ID, thresold_RX=thresold_RX)
    _view(references)

    # step 3: keep only unique proteins
    filterGlobalDataset(references, align_dir)
    _view(references)

    # step 4: run BLAST on the conserved sequences
    blast_dir = pathManage.result(ligand_ID + "/datasetBuilding/blast")
    RunBlast.globalRun(references, blast_dir)
    _view(references)

    # step 5: filter the BLAST output by e-value and RX
    filterBlastResult(
        references,
        dataset_dir,
        ligand_ID,
        thresold_RX=thresold_RX,
        thresold_blast=thresold_blast,
    )
    _view(references)

    # step 6: clean the dataset folder
    cleanFolderDataset(references, dataset_dir)
def ionIdentification(name_ligand):
    """Step 4: run the ion analysis on the dataset of one ligand.

    The dataset folder of ``name_ligand`` is handed to
    ``ionSearch.analyseIons`` and the report is written to
    ``ionsAnalysis.txt`` inside the ligand result folder.

    Args:
        name_ligand: ligand name used to resolve dataset and result folders.

    Returns:
        None.
    """
    dataset_dir = pathManage.dataset(name_ligand)
    report_path = "".join([pathManage.result(name_ligand), "ionsAnalysis.txt"])
    ionSearch.analyseIons(dataset_dir, name_ligand, report_path)
def classifRefProtein(pr_dataset, l_lig, thresold_identity=30.0, thresold_similarity=30.0,
                      p_fasta_global="/home/borrel/Yue_project/pdb_seqres.txt"):
    """Group the reference proteins of several ligands by sequence identity/similarity.

    For every ligand, the first file of each reference folder whose name
    starts with the folder name is used to import a FASTA file.  All FASTA
    files are compared with ``applyNeedleList``; similarity and identity
    matrices are written, then references are grouped and merged.

    Args:
        pr_dataset: unused; kept for backward compatibility.  The dataset
            folder is always resolved per ligand with ``pathManage.dataset``.
        l_lig: list of ligand names.
        thresold_identity: identity threshold passed to ``GroupRef``.
        thresold_similarity: similarity threshold passed to ``GroupRef``.
        p_fasta_global: path handed to ``downloadFile.importFasta`` as
            ``fastaGlobal``.  Defaults to the previously hard-coded location.

    Returns:
        None.
    """
    pr_out = pathManage.result("clasifRef")
    # case fasta file
    pr_align_seq = pathManage.generatePath(pr_out + "alignSeq/")

    l_p_fasta = []
    for lig in l_lig:
        # a dedicated local is used so the caller-supplied argument is not clobbered
        pr_dataset_lig = pathManage.dataset(lig)
        for name_entry in listdir(pr_dataset_lig):
            pr_ref_by_lig = pr_dataset_lig + name_entry
            PDB_folder = pr_ref_by_lig.split("/")[-1]
            try:
                l_file = listdir(pr_ref_by_lig)
            except OSError:
                # entry is a plain file (or unreadable): not a reference folder.
                # Only OSError is swallowed so that real bugs are not hidden.
                continue

            for file_ref in l_file:
                if search("^" + PDB_folder, file_ref):
                    PDB_ID = file_ref[0:-4]
                    # PDB ID (lower case) with chain associated
                    PDB_ID = PDB_ID[0:4].lower() + PDB_ID[4:]
                    p_fasta = downloadFile.importFasta(
                        PDB_ID, pr_align_seq, dir_by_PDB=0, debug=1,
                        fastaGlobal=p_fasta_global)
                    l_p_fasta.append(p_fasta)
                    # one FASTA per reference folder is enough
                    break

    d_outNeedle = applyNeedleList(l_p_fasta, pr_align_seq)

    # write matrices
    writeMatrixFromDico(d_outNeedle, pr_out + "matrixSimilarSeq", "similarity")
    writeMatrixFromDico(d_outNeedle, pr_out + "matrixIDSeq", "identity")

    # group references
    p_group_id = GroupRef(
        d_outNeedle, "identity",
        pr_out + "groupIdentity" + "_" + str(thresold_identity) + ".txt",
        thresold_identity, l_lig)
    p_group_sim = GroupRef(
        d_outNeedle, "similarity",
        pr_out + "groupSimilarity" + "_" + str(thresold_similarity) + ".txt",
        thresold_similarity, l_lig)

    # merge not alone prot
    MergeGroup(p_group_id)
    MergeGroup(p_group_sim)
def ionIdentification(name_ligand):
    """Step 4: analyse the ions found in the dataset of ``name_ligand``.

    Delegates to ``ionSearch.analyseIons`` with the ligand dataset folder
    as input and ``ionsAnalysis.txt`` (in the ligand result folder) as
    output file.

    Args:
        name_ligand: ligand name used to resolve dataset and result folders.

    Returns:
        None.
    """
    # input folder
    folder_in = pathManage.dataset(name_ligand)
    # output file
    folder_out = pathManage.result(name_ligand)
    file_out = folder_out + "ionsAnalysis.txt"

    ionSearch.analyseIons(folder_in, name_ligand, file_out)
def builtDatasetGlobal(p_list_ligand, ligand_ID, thresold_RX=2.5, thresold_blast=1e-4, verbose=1):
    """Build the dataset of one ligand, step by step.

    Pipeline: extract references, write the family file, filter the
    references, keep unique proteins, run BLAST, filter the BLAST results
    and clean the dataset folder.  In verbose mode the dataset is shown
    after each step with ``toolViewStructDataset``.

    Args:
        p_list_ligand: forwarded unchanged to ``extractReference``.
        ligand_ID: ligand identifier used to build dataset/result paths.
        thresold_RX: threshold forwarded to the filtering steps.
        thresold_blast: threshold forwarded to ``filterBlastResult``.
        verbose: truthy to display the dataset after each step.

    Returns:
        None.
    """
    # Pick the reporting callable once instead of testing ``verbose`` at
    # every step; the quiet variant does nothing.
    if verbose:
        report = toolViewStructDataset
    else:
        report = lambda dataset: None

    # directory with dataset / directory with result
    dir_dataset = pathManage.dataset(ligand_ID)
    dir_result = pathManage.result(ligand_ID + "/datasetBuilding")

    # first extract reference, then write the file with name and family
    dataset = extractReference(p_list_ligand, dir_dataset, dir_result, ligand_ID)
    analysis.familyPDBRef(dataset, dir_dataset + "family_PDB.txt")
    report(dataset)

    # select reference: remove RX and same chain
    dir_align = pathManage.result(ligand_ID + "/datasetBuilding/aligmentRef")
    filterReferenceByOne(dataset, dir_align, ligand_ID, thresold_RX=thresold_RX)
    report(dataset)

    # conserve only unique protein
    filterGlobalDataset(dataset, dir_align)
    report(dataset)

    # run blast by sequence conserved
    dir_blast = pathManage.result(ligand_ID + "/datasetBuilding/blast")
    RunBlast.globalRun(dataset, dir_blast)
    report(dataset)

    # filter by e-value and RX
    filterBlastResult(dataset, dir_dataset, ligand_ID,
                      thresold_RX=thresold_RX, thresold_blast=thresold_blast)
    report(dataset)

    # clean folder dataset
    cleanFolderDataset(dataset, dir_dataset)
def resolutionByStructure(name_dataset):
    """Write one ``water_<substructure>.dat`` file per substructure.

    For each substructure returned by ``structure.ListSub()``, every summary
    file is loaded, the atoms of interest are de-duplicated (first occurrence
    kept, order preserved) and one tab-separated line
    ``PDB, resolution, atom count`` (values from ``searchCountH2O``) is
    written per atom.

    Args:
        name_dataset: dataset name used to resolve result and summary files.

    Returns:
        list of the paths of the files written, one per substructure.
    """
    l_path = []

    for strut in structure.ListSub():
        # the output path is built once and reused for the list and the open()
        p_filout = pathManage.result(name_dataset) + "water_" + strut + ".dat"
        l_path.append(p_filout)

        # context manager: the file is closed even if a summary fails to load
        with open(p_filout, "w") as filout:
            list_global = []
            for path_summary in pathManage.retrieveSummaryFile(strut, name_dataset):
                print(path_summary)
                for interest_atom in loadFile.loadSummary(path_summary):
                    # linear membership test kept on purpose: the element type
                    # is not known to be hashable from here
                    if interest_atom not in list_global:
                        list_global.append(interest_atom)

            for atom_interest in list_global:
                rx, i_nb_atom, s_PDB = searchCountH2O(atom_interest)
                filout.write("%s\t%s\t%s\n" % (s_PDB, rx, i_nb_atom))

    return l_path
def findFamilyAndGroup(PDB_in, Identity="30.0"):
    """Look up the group and family of a PDB entry in the identity group file.

    The file ``groupIdentity_<Identity>.txt.filter`` of the ``clasifRef``
    result folder is read; its first line is skipped.  Each following line
    is split on tabs: column 0 is compared with ``PDB_in``, column 1 is the
    group and column 2 is converted with ``tool.NameFamily``.

    Args:
        PDB_in: PDB identifier to search for.
        Identity: value inserted in the file name.

    Returns:
        ``(group, family)`` for the first matching line, or ``None``
        (implicitly) when no line matches.
    """
    group_file = pathManage.result("clasifRef") + "groupIdentity_" + str(Identity) + ".txt.filter"

    with open(group_file, "r") as handle:
        rows = handle.readlines()

    # rows[0] is skipped, as in the table layout the file is written with
    for row in rows[1:]:
        fields = row.strip().split("\t")
        pdb_id = fields[0]
        family = tool.NameFamily(fields[2])
        group = str(fields[1])
        if PDB_in == pdb_id:
            return group, family
def manageResult(l_ligand, name_final, l_out=None):
    """Arrange the SMILES result files of several ligands in a final folder.

    For each ligand, every path returned by
    ``pathManage.findListSmileFile`` that matches both ``smile`` and
    ``.txt`` is passed to ``arrangeResult.globalArrangement``.

    Args:
        l_ligand: list of ligand names.
        name_final: suffix of the final folder (``final_<name_final>``).
        l_out: list forwarded to ``arrangeResult.globalArrangement``.
            Defaults to a fresh empty list on every call (the previous
            shared mutable default could leak state between calls).

    Returns:
        1, always.
    """
    if l_out is None:
        l_out = []

    pr_result = pathManage.result("final_" + name_final)

    for name_lig in l_ligand:
        l_p_smile = pathManage.findListSmileFile(name_lig)
        p_file_famile = pathManage.findFamilyFile(name_lig)
        for p_smile in l_p_smile:
            # The former ribose-specific branch performed exactly the same
            # call and its condition was a subset of this one, so a single
            # test covers both cases.
            if search("smile", p_smile) and search(".txt", p_smile):
                arrangeResult.globalArrangement(pr_result, p_smile, p_file_famile, name_lig, l_out)

    return 1
def manageResult(l_ligand, name_final, l_out=None):
    """Dispatch every SMILES text file of the given ligands to the final folder.

    Each path from ``pathManage.findListSmileFile`` whose name matches both
    ``smile`` and ``.txt`` is handed to ``arrangeResult.globalArrangement``
    together with the ligand family file.

    Args:
        l_ligand: list of ligand names.
        name_final: suffix of the final folder (``final_<name_final>``).
        l_out: list forwarded to ``arrangeResult.globalArrangement``; a new
            empty list is created per call when omitted, instead of sharing
            one mutable default across calls.

    Returns:
        1, always.
    """
    l_out = [] if l_out is None else l_out
    pr_result = pathManage.result("final_" + name_final)

    for name_lig in l_ligand:
        l_p_smile = pathManage.findListSmileFile(name_lig)
        p_file_famile = pathManage.findFamilyFile(name_lig)

        for p_smile in l_p_smile:
            # ribose files need no dedicated branch: they satisfy this test
            # too and were processed by an identical call
            if not (search("smile", p_smile) and search(".txt", p_smile)):
                continue
            arrangeResult.globalArrangement(pr_result, p_smile, p_file_famile,
                                            name_lig, l_out)

    return 1
def analyseLGDProximity(prclassif):
    """Run the ligand-similarity analysis for one classification folder.

    The last path component of ``prclassif`` names the result folder
    (``<name>_LGDsimilarity``).  Selected columns of the BindingDB table
    (``PBINDINGDB``) are filtered into ``bindingDBfiltered.txt``, the
    ligand files are extracted, the similarity matrix is built (MCS on,
    ShaEP off) and finally the MMP extraction is run.

    Args:
        prclassif: path of the classification folder.

    Returns:
        None.
    """
    print(prclassif)
    ref_label = prclassif.rsplit("/", 1)[-1]
    print(ref_label)
    out_dir = pathManage.result(ref_label + "_LGDsimilarity")
    print(out_dir)

    # affinity columns (and their PDB / ligand keys) kept from BindingDB
    affinity_file = out_dir + "bindingDBfiltered.txt"
    columns_kept = [
        "PDB ID(s) for Ligand-Target Complex",
        "Ligand HET ID in PDB",
        "Kd (nM)",
        "Ki (nM)",
        "IC50 (nM)",
    ]
    parseTSV.TSVFiltered(PBINDINGDB, columns_kept, pfilout=affinity_file)

    # ligand files for each reference, then pairwise similarity
    extractLGDfile(prclassif, out_dir)
    buildMatrixSimilarity(out_dir, pfileaffinity=affinity_file, MCS=1, Sheap=0)

    # matched pairs extraction
    extractMMP(out_dir)
def globalShaepStat(substruct):
    """Collect the ShaEP scores of every reference folder in one table.

    Every sub-directory of the result folder of ``substruct`` is scanned
    for files matching ``.hit``.  Each file is parsed with
    ``parseShaep.parseOutputShaep``; non-empty results are appended to
    ``shaep_global.txt`` as
    ``<ref>_<file name[10:-4]>, best, shape, ESP`` (tab separated).  The
    table is then passed to ``runOtherSoft.RhistogramMultiple``.

    Args:
        substruct: name used to resolve the result folder.

    Returns:
        None.
    """
    pr_result = pathManage.result(substruct)
    p_filout = pr_result + "shaep_global.txt"

    # context manager: the table is closed (and flushed) before the plotting
    # step, and also when a parse or listing error interrupts the scan
    with open(p_filout, "w") as filout:
        filout.write("best_similarity\tshape_similarity\tESP_similarity\n")

        for ref_folder in listdir(pr_result):
            pr_ref = pr_result + ref_folder + "/"
            if not path.isdir(pr_ref):
                continue

            for file_result in listdir(pr_ref):
                if not search(".hit", file_result):
                    continue
                d_shaep_parsed = parseShaep.parseOutputShaep(pr_ref + file_result)
                # an empty dictionary means that nothing could be parsed
                if d_shaep_parsed != {}:
                    filout.write(
                        ref_folder + "_" + file_result[10:-4]
                        + "\t" + str(d_shaep_parsed["best_similarity"])
                        + "\t" + str(d_shaep_parsed["shape_similarity"])
                        + "\t" + str(d_shaep_parsed["ESP_similarity"]) + "\n")

    runOtherSoft.RhistogramMultiple(p_filout, "Shaep_score")
def classifRefProtein(pr_dataset, l_lig, thresold_identity=30.0, thresold_similarity=30.0,
                      p_fasta_global="/home/borrel/Yue_project/pdb_seqres.txt"):
    """Classify the reference proteins of several ligands by sequence alignment.

    One FASTA file is imported per reference folder (first file whose name
    starts with the folder name), all sequences are aligned with
    ``applyNeedleList``, the similarity/identity matrices are written and
    the references are grouped then merged for both criteria.

    Args:
        pr_dataset: unused; kept for backward compatibility.  Dataset
            folders are resolved per ligand with ``pathManage.dataset``.
        l_lig: list of ligand names.
        thresold_identity: identity threshold passed to ``GroupRef``.
        thresold_similarity: similarity threshold passed to ``GroupRef``.
        p_fasta_global: value passed as ``fastaGlobal`` to
            ``downloadFile.importFasta``; defaults to the location that was
            previously hard-coded.

    Returns:
        None.
    """
    pr_out = pathManage.result("clasifRef")
    # case fasta file
    pr_align_seq = pathManage.generatePath(pr_out + "alignSeq/")

    l_p_fasta = []
    for lig in l_lig:
        # separate local name: the function argument is left untouched
        pr_dataset_lig = pathManage.dataset(lig)
        l_pr_ref_by_lig = [pr_dataset_lig + x for x in listdir(pr_dataset_lig)]

        for pr_ref_by_lig in l_pr_ref_by_lig:
            PDB_folder = pr_ref_by_lig.split("/")[-1]
            try:
                l_file = listdir(pr_ref_by_lig)
            except OSError:
                # not a directory (or not readable) -> no reference here; any
                # other exception type is a real error and must surface
                continue

            for file_ref in l_file:
                if not search("^" + PDB_folder, file_ref):
                    continue
                PDB_ID = file_ref[0:-4]
                # PDB ID (lower case) with chain associated
                PDB_ID = PDB_ID[0:4].lower() + PDB_ID[4:]
                p_fasta = downloadFile.importFasta(PDB_ID, pr_align_seq, dir_by_PDB=0,
                                                   debug=1, fastaGlobal=p_fasta_global)
                l_p_fasta.append(p_fasta)
                # only the first matching file of the folder is used
                break

    d_outNeedle = applyNeedleList(l_p_fasta, pr_align_seq)

    # write matrices
    writeMatrixFromDico(d_outNeedle, pr_out + "matrixSimilarSeq", "similarity")
    writeMatrixFromDico(d_outNeedle, pr_out + "matrixIDSeq", "identity")

    # group references
    p_group_id = GroupRef(d_outNeedle, "identity",
                          pr_out + "groupIdentity" + "_" + str(thresold_identity) + ".txt",
                          thresold_identity, l_lig)
    p_group_sim = GroupRef(d_outNeedle, "similarity",
                           pr_out + "groupSimilarity" + "_" + str(thresold_similarity) + ".txt",
                           thresold_similarity, l_lig)

    # merge not alone prot
    MergeGroup(p_group_id)
    MergeGroup(p_group_sim)
def waterGlobal(name_database, limit_acc=0.0):
    """Analyse the water molecules of the PDB files of a database.

    The list of PDB identifiers is retrieved for ``name_database``.  When
    ``limit_acc`` differs from 0.0, NACCESS is first run on every PDB file
    of the PDB directory.  The water/resolution table is then computed and
    plotted.

    Args:
        name_database: database name; results go to ``<name>/water``.
        limit_acc: accessibility limit forwarded to
            ``waterAnalysis.resolutionWater``; 0.0 disables the NACCESS run.

    Returns:
        None.
    """
    water_dir = pathManage.result(name_database + "/water")

    # retrieve list PDB file
    pdb_ids = managePDB.retriveListPDB(name_database)

    # accessibility is only computed when a limit is requested
    needs_accessibility = limit_acc != 0.0
    if needs_accessibility:
        for pdb_id in pdb_ids:
            pdb_file = pathManage.pathDitrectoryPDB() + pdb_id + ".pdb"
            runOtherSoft.runNACESS(pdb_file, pathManage.pathDitrectoryPDB(), multi_run=0)

    table_path = waterAnalysis.resolutionWater(pdb_ids, water_dir, limit_acc)
    runScriptR.waterPlotResolution(table_path)
def GlobalBondLength(name_database, RX_thresold=1.5):
    """Collect C-N, C-O, C-C bond lengths and coplanarity values of ligands.

    For every ligand listed in ``resultLigandInPDB``, the first PDB whose
    resolution (``parsing.Quality(PDB)[0]``) is at most ``RX_thresold`` is
    analysed; the measured values are appended to four distance files and
    one histogram per file is drawn.

    Args:
        name_database: database name; results go to
            ``<name>/CXbound<RX_thresold>``.
        RX_thresold: maximal resolution accepted for a PDB.

    Returns:
        None.  An error message is printed and nothing is written when the
        ligand/PDB file does not exist.
    """
    # directory
    pr_result = pathManage.result(name_database + "/CXbound" + str(RX_thresold))
    pr_database = pathManage.result(name_database)

    # The input is checked BEFORE the output files are opened: previously the
    # four handles were opened first and leaked on this early return.
    p_lig_in_PDB = pr_database + "resultLigandInPDB"
    if not path.exists(p_lig_in_PDB):
        print("ERROR => file with ligand and PDB does not exist")
        return

    # load PDB with ligand
    d_lig_PDB = loadFile.LigandInPDB(p_lig_in_PDB)
    l_name_lig = list(d_lig_PDB.keys())
    print(l_name_lig)

    # filout with distance
    p_CN = pr_result + "distanceCN"
    p_CO = pr_result + "distanceCO"
    p_CC = pr_result + "distanceCC"
    p_coplar = pr_result + "distanceCoplar"

    with open(p_CN, "w") as filout_CN, \
            open(p_CO, "w") as filout_CO, \
            open(p_CC, "w") as filout_CC, \
            open(p_coplar, "w") as filout_coplar:

        for name_lig in l_name_lig:
            for PDB in d_lig_PDB[name_lig]:
                # resolution control
                RX = parsing.Quality(PDB)[0]
                if RX <= RX_thresold:
                    l_atom_lig = loadFile.ligandInPDBConnectMatrixLigand(PDB, name_lig)

                    l_distCN = BondLengthCandX(l_atom_lig, "N")
                    l_distCO = BondLengthCandX(l_atom_lig, "O")
                    l_distCC = BondLengthCandX(l_atom_lig, "C")
                    l_coplarIII = CoplanarityIII(l_atom_lig)

                    if l_distCN != []:
                        filout_CN.write("\n".join(l_distCN) + "\n")
                    if l_distCO != []:
                        filout_CO.write("\n".join(l_distCO) + "\n")
                    if l_distCC != []:
                        filout_CC.write("\n".join(l_distCC) + "\n")
                    if l_coplarIII != []:
                        filout_coplar.write("\n".join(l_coplarIII) + "\n")

                    # Take only one PDB by ligand, not more.  The former
                    # index-based loop used ``i = i + 1; continue`` inside
                    # this inner loop, which kept scanning the same ligand
                    # and skipped the following ones instead.
                    break

    runScriptR.histDistance(p_CN, "CN")
    runScriptR.histDistance(p_CO, "CO")
    runScriptR.histDistance(p_CC, "CC")
    runScriptR.histDistance(p_coplar, "coplar")
####################### # CLASSIFICATION LSRs # ####################### name_folder_final = "withoutLig" # manageResult (["AMP", "ADP", "POP", "ATP"], name_folder_final, l_ligand_out) # arrangeResult.qualityExtraction (["AMP", "ADP", "POP", "ATP"], name_folder_final, p_list_ligand = "/home/borrel/Yue_project/resultLigandInPDB", thresold_sheap = thresold_shaep) #arrangeResult.countingSubstituent(name_folder_final) ################################################### # AFFINITY AND INTERACTIONS BY PROTEIN REFERENCE # ################################################### # folder final pr_classif = pathManage.result("final_" + name_folder_final) + "Pi_LSR" ligandSimilarity.analyseLGDProximity(pr_classif) ######################################### # ANALYSE CLASSIFICATION BASED ON SHEAP # ######################################### #classifResults.SheapScoreToClass(pr_classif) ###################### # ANALYSE REFERENCE # ######################
def retrieveSubstructSuperimposed (name_lig, thresold_BS = 4.5, thresold_superimposed_ribose = 2.5, thresold_superimposed_pi = 3, thresold_shaep = 0.4):
    """Extract, score and classify the substituents superimposed on each reference ligand.

    For every 4-character reference folder of the dataset of ``name_lig``:
    the reference ligand is loaded, every query ligand file of the folder
    (name ``<3 chars>_<4 chars>...``, other PDB, other ligand) is moved in
    the reference frame with its rotation matrix, and for each reference
    substructure the neighbouring atoms of the query are written as a
    ``substituent_*.pdb`` file and scored with ShaEP.  When the best
    similarity reaches ``thresold_shaep`` the complex and the binding site
    (atoms within ``thresold_BS`` of the query ligand) are written and the
    SMILES code of the substituent is recorded.

    Outputs written in the ligand result folder include
    ``log_superimposed.txt``, ``quality_extraction.txt``,
    ``shaep_global*.txt`` and ``list_<substruct>_<thresold>_smile.txt``.

    Args:
        name_lig: ligand name used to resolve dataset and result folders.
        thresold_BS: distance cut-off used to select binding-site atoms.
        thresold_superimposed_ribose: forwarded to
            ``neighborSearch.searchNeighborAtom``.
        thresold_superimposed_pi: forwarded to
            ``neighborSearch.searchNeighborAtom``.
        thresold_shaep: minimal ShaEP best similarity to keep a substituent.

    Returns:
        1, always.

    NOTE(review): the block structure below was reconstructed from a
    whitespace-mangled source; confirm the nesting against the original file.
    """

    # output
    p_dir_dataset = pathManage.dataset(name_lig)
    p_dir_result = pathManage.result(name_lig )
    l_folder_ref = listdir(p_dir_dataset)

    # log control
    # NOTE(review): opened without a context manager; an exception in the
    # loops below leaves this file (and the following ones) unclosed.
    p_log = open(p_dir_result + "log_superimposed.txt", "w")

    # control extraction: counters reported in quality_extraction.txt
    d_control = {}
    d_control["pr ref"] = 0
    d_control["lig query"] = 0
    d_control["subref"] = {}
    d_control["subref empty"] = {}
    d_control["out sheap"] = {}
    filout_control = open (p_dir_result + "quality_extraction.txt", "w")

    # stock smile code: d_smile[struct_type][smile] -> count / PDB / ligand / ref
    d_smile = {}

    # sheap control: one global table plus one table per substructure type;
    # the "list" entry stores the paths of all tables (it is not a file)
    d_filout_sheap = {}
    d_filout_sheap ["list"] = [p_dir_result + "shaep_global.txt"]
    d_filout_sheap["global"] = open (p_dir_result + "shaep_global.txt", "w")
    d_filout_sheap["global"].write ("name\tbest_similarity\tshape_similarity\tESP_similarity\n")

    for ref_folder in l_folder_ref :
        # control folder reference name
        if len (ref_folder) != 4 :
            p_log.write ("[ERROR folder] -> " + ref_folder + "\n")
            continue

        # reference
        p_lig_ref = pathManage.findligandRef(p_dir_dataset + ref_folder + "/", name_lig)
        try:
            lig_ref_parsed = parsePDB.loadCoordSectionPDB(p_lig_ref, "HETATM")
            # print len (lig_ref_parsed)
        # NOTE(review): bare except also hides programming errors; the log
        # line itself raises if p_lig_ref is not a string.
        except:
            p_log.write ("[ERROR ligand ref] -> " + p_lig_ref + "\n")
            continue

        # control
        d_control["pr ref"] = d_control["pr ref"] + 1

        # output by reference
        p_dir_result_ref = pathManage.result(name_lig + "/" + ref_folder)
        d_filout_superimposed = {}
        d_filout_superimposed["global"] = open (p_dir_result_ref + "all_ligand_aligned.pdb", "w")
        d_filout_superimposed["sheap"] = open (p_dir_result_ref + "all_ligand_aligned_" + str (thresold_shaep) + ".pdb", "w")

        # write lig ref -> connect matrix correct in all reference and all sheap
        writePDBfile.coordinateSection(d_filout_superimposed["global"], lig_ref_parsed, "HETATM", connect_matrix = 1)
        writePDBfile.coordinateSection(d_filout_superimposed["sheap"], lig_ref_parsed, "HETATM", connect_matrix = 1)

        # inspect folder dataset
        l_pdbfile = listdir(p_dir_dataset + ref_folder + "/")
        for pdbfile in l_pdbfile :
            # no ligand file
            if len (pdbfile.split ("_")) == 1 :
                continue
            pdbfile = pdbfile[:-4] # remove extension
            # keep "<3-char ligand>_<4-char PDB>..." files coming from another PDB than the reference
            if len(pdbfile.split ("_")[0]) == 3 and len(pdbfile.split ("_")[1]) == 4 and pdbfile.split ("_")[1] != ref_folder:
                p_lig = p_dir_dataset + ref_folder + "/" + pdbfile + ".pdb"
                if p_lig_ref != p_lig :
                    # pass case where ligand replace same ligand -> does not need run
                    if pdbfile.split ("_")[0] == name_lig :
                        # NOTE(review): this log entry has no trailing newline, unlike the others
                        p_log.write ("[REMOVE] -> same ligand substituate")
                        continue

                    # parsed ligand query
                    lig_parsed = parsePDB.loadCoordSectionPDB(p_lig, "HETATM")

                    # find matrix of rotation
                    p_matrix = pathManage.findMatrix(p_lig_ref, p_lig, name_lig)
                    # control file matrix exist
                    if not path.exists(p_matrix) :
                        p_log.write ("[ERROR] -> Matrix transloc " + p_lig_ref + " " + p_lig + " " + name_lig + "\n")
                        continue

                    # control
                    d_control["lig query"] = d_control["lig query"] + 1

                    # find the path of complex used (ligand file name without its first 4 characters)
                    p_complex = p_dir_dataset + ref_folder + "/" + p_lig.split ("/")[-1][4:]

                    # ligand rotated -> change the frame of reference (lig_parsed is modified in place, presumably)
                    superposeStructure.applyMatrixLigand(lig_parsed, p_matrix)

                    # use substruct
                    l_p_substruct_ref = pathManage.findSubstructRef (pathManage.dataset(name_lig) + ref_folder + "/" , name_lig)
                    for p_substruct_ref in l_p_substruct_ref :
                        # ribose or phosphate: taken from the file name
                        struct_type = p_substruct_ref.split ("_")[-2]
                        substruct_parsed = parsePDB.loadCoordSectionPDB(p_substruct_ref, "HETATM")

                        l_atom_substituate = neighborSearch.searchNeighborAtom(substruct_parsed, lig_parsed, struct_type, p_log, thresold_superimposed_ribose = thresold_superimposed_ribose, thresold_superimposed_pi = thresold_superimposed_pi)

                        # control find: count empty and non-empty extractions by type
                        if len (l_atom_substituate) == 0 :
                            if not struct_type in d_control["subref empty"].keys () :
                                d_control["subref empty"][struct_type] = 1
                            else :
                                d_control["subref empty"][struct_type] = d_control["subref empty"][struct_type] + 1
                            continue
                        else :
                            if not struct_type in d_control["subref"].keys () :
                                d_control["subref"][struct_type] = 1
                            else :
                                d_control["subref"][struct_type] = d_control["subref"][struct_type] + 1

                        # write PDB file, convert smile
                        p_substituate_pdb = p_dir_result_ref + "substituent_" + pdbfile.split ("_")[0] + "_" + pdbfile.split ("_")[1] + "_" + struct_type + ".pdb"
                        writePDBfile.coordinateSection(p_substituate_pdb, l_atom_substituate, recorder="HETATM", header=0, connect_matrix = 1)

                        # sheap reference on part of ligand
                        p_sheap = runOtherSoft.runShaep (p_substruct_ref, p_substituate_pdb, p_substituate_pdb[0:-4] + ".hit", clean = 0)
                        val_sheap = parseShaep.parseOutputShaep (p_sheap)
                        # an empty dictionary is treated as a ShaEP failure
                        if val_sheap == {} :
                            p_log.write ("[ERROR] -> ShaEP " + p_substituate_pdb + " " + p_substruct_ref + "\n")
                            if not struct_type in d_control["out sheap"].keys () :
                                d_control["out sheap"][struct_type] = 1
                            else :
                                d_control["out sheap"][struct_type] = d_control["out sheap"][struct_type] + 1
                            continue

                        # control threshold sheap: lazily open the table of this substructure type
                        if not struct_type in d_filout_sheap.keys () :
                            d_filout_sheap[struct_type] = {}
                            d_filout_sheap[struct_type] = open (p_dir_result + "shaep_global_" + struct_type + ".txt", "w")
                            d_filout_sheap[struct_type].write ("name\tbest_similarity\tshape_similarity\tESP_similarity\n")
                            d_filout_sheap["list"].append (p_dir_result + "shaep_global_" + struct_type + ".txt") # to improve with python function

                        # write value in ShaEP control
                        d_filout_sheap[struct_type].write (ref_folder + "_" + str(pdbfile.split ("_")[1]) + "_" + struct_type + "_" + str (pdbfile.split ("_")[0]) + "\t" + str(val_sheap["best_similarity"]) + "\t" + str(val_sheap["shape_similarity"]) + "\t" + str(val_sheap["ESP_similarity"]) + "\n")
                        d_filout_sheap["global"].write (ref_folder + "_" + str(pdbfile.split ("_")[1]) + "_" + struct_type + "_" + str (pdbfile.split ("_")[0]) + "\t" + str(val_sheap["best_similarity"]) + "\t" + str(val_sheap["shape_similarity"]) + "\t" + str(val_sheap["ESP_similarity"]) + "\n")

                        # rename file substituent with ShaEP value
                        rename(p_substituate_pdb, p_substituate_pdb[:-4] + "_" + str (val_sheap["best_similarity"]) + ".pdb")
                        # keep the variable in sync with the new file name
                        p_substituate_pdb = p_substituate_pdb[:-4] + "_" + str (val_sheap["best_similarity"]) + ".pdb"

                        # write all substruct in global file
                        writePDBfile.coordinateSection(d_filout_superimposed["global"], lig_parsed, recorder= "HETATM", header = str(p_lig.split ("/")[-1]) + "_" + str (val_sheap["best_similarity"]) , connect_matrix = 1)

                        # control sheap threshold
                        if float(val_sheap["best_similarity"]) >= thresold_shaep :

                            # write subligand superimposed selected in global files
                            writePDBfile.coordinateSection(d_filout_superimposed["sheap"], lig_parsed, recorder= "HETATM", header = str(p_lig.split ("/")[-1]) + "_" + str (val_sheap["best_similarity"]) , connect_matrix = 1)

                            ############
                            # write BS #
                            ############
                            # not only protein superimposed -> also ion and water
                            l_atom_complex = parsePDB.loadCoordSectionPDB(p_complex)
                            superposeStructure.applyMatrixProt(l_atom_complex, p_matrix)
                            p_file_cx = p_dir_result_ref + "CX_" + p_lig.split ("/")[-1]
                            # write CX
                            writePDBfile.coordinateSection(p_file_cx, l_atom_complex, recorder="ATOM", header= p_lig.split ("/")[-1], connect_matrix = 0)

                            # search atom in BS: complex atoms within thresold_BS of any query ligand atom
                            l_atom_binding_site = []
                            for atom_complex in l_atom_complex :
                                for atom_substruct in lig_parsed :
                                    if parsePDB.distanceTwoatoms (atom_substruct, atom_complex) <= thresold_BS :
                                        if not atom_complex in l_atom_binding_site :
                                            l_atom_binding_site.append (deepcopy(atom_complex))

                            # 3. retrieve complete residue
                            l_atom_BS_res = parsePDB.getResidues(l_atom_binding_site, l_atom_complex)

                            # 4. write binding site
                            p_binding = p_dir_result_ref + "BS_" + p_lig.split ("/")[-1]
                            writePDBfile.coordinateSection(p_binding, l_atom_BS_res, "ATOM", p_binding, connect_matrix = 0)

                            # smile code substituate analysis
                            # Step smile -> no conversion if ShaEP does not validate
                            smile_find = runOtherSoft.babelConvertPDBtoSMILE(p_substituate_pdb)
                            if not struct_type in d_smile.keys () :
                                d_smile[struct_type] = {}
                                d_smile[struct_type][smile_find] = {}
                                d_smile[struct_type][smile_find]["count"] = 1
                                d_smile[struct_type][smile_find]["PDB"] = [pdbfile.split ("_")[1]]
                                d_smile[struct_type][smile_find]["ligand"] = [pdbfile.split ("_")[0]]
                                d_smile[struct_type][smile_find]["ref"] = [ref_folder]
                            else :
                                if not smile_find in d_smile[struct_type].keys () :
                                    d_smile[struct_type][smile_find] = {}
                                    d_smile[struct_type][smile_find]["count"] = 1
                                    d_smile[struct_type][smile_find]["PDB"] = [pdbfile.split ("_")[1]]
                                    d_smile[struct_type][smile_find]["ligand"] = [pdbfile.split ("_")[0]]
                                    d_smile[struct_type][smile_find]["ref"] = [ref_folder]
                                else :
                                    d_smile[struct_type][smile_find]["count"] = d_smile[struct_type][smile_find]["count"] + 1
                                    d_smile[struct_type][smile_find]["PDB"].append (pdbfile.split ("_")[1])
                                    d_smile[struct_type][smile_find]["ligand"].append (pdbfile.split ("_")[0])
                                    d_smile[struct_type][smile_find]["ref"].append (ref_folder)
                        else :
                            # below the ShaEP threshold: counted as rejected
                            if not struct_type in d_control["out sheap"].keys () :
                                d_control["out sheap"][struct_type] = 1
                            else :
                                d_control["out sheap"][struct_type] = d_control["out sheap"][struct_type] + 1

        tool.closeDicoFile (d_filout_superimposed)

    # sheap control
    # NOTE(review): d_filout_sheap also holds the "list" entry (a list of
    # paths, not a file) -- confirm tool.closeDicoFile tolerates it.
    tool.closeDicoFile (d_filout_sheap)
    for p_file_sheap in d_filout_sheap["list"] :
        runOtherSoft.RhistogramMultiple (p_file_sheap)

    # write list of smile: one file per substructure type
    for substruct in d_smile.keys () :
        p_list_smile = pathManage.result(name_lig) + "list_" + substruct + "_" + str (thresold_shaep) + "_smile.txt"
        filout_smile = open (p_list_smile, "w")
        for smile_code in d_smile[substruct].keys () :
            l_lig = d_smile[substruct][smile_code]["ligand"]
            l_PDB = d_smile[substruct][smile_code]["PDB"]
            l_ref = d_smile[substruct][smile_code]["ref"]
            filout_smile.write (str (smile_code) + "\t" + str (d_smile[substruct][smile_code]["count"]) + "\t" + " ".join (l_PDB) + "\t" + " ".join (l_ref) + "\t" + " ".join(l_lig) + "\n")
        filout_smile.close ()

    p_log.close ()

    # control: write the counters collected above
    filout_control.write ("NB ref: " + str(d_control["pr ref"]) + "\n")
    filout_control.write ("Ligand query: " + str(d_control["lig query"]) + "\n")
    for k in d_control["subref"].keys () :
        filout_control.write ("LSR " + str (k) + ": " + str(d_control["subref"][k]) + "\n")
    for k in d_control["subref empty"].keys () :
        filout_control.write ("NB LSR empty " + str (k) + ": " + str(d_control["subref empty"][k]) + "\n")
    for k in d_control["out sheap"].keys () :
        filout_control.write ("LSR out by sheap " + str (k) + ": " + str(d_control["out sheap"][k]) + "\n")

    filout_control.write ("**********************\n\n")
    # NOTE(review): raises KeyError when a substructure type was extracted
    # but never rejected (key present in "subref", absent from "out sheap").
    for k in d_control["subref"].keys () :
        filout_control.write ("LSR keep" + str (k) + ": " + str(d_control["subref"][k] - d_control["out sheap"][k]) + "\n")

    filout_control.close ()
    return 1
def enantiomer(l_ligand, name_folder_final, debug = 1) :
    """Measure oxygen-oxygen distances in the reference ligands and plot them.

    The final result folder is walked (type / substructure / reference /
    ``LGD``) and, for the first ``LGD_REF_A*.pdb`` file of each reference
    not already seen, three measurements are appended to per-ligand files
    in ``enantiomer/``:

    * ``O4O5``: distance O4' - O5',
    * ``O3OP``: minimal distance between O3' (AMP) or O4' (other ligands)
      and the phosphate oxygens O1P/O2P/O3P (AMP) or O1A/O2A/O3A,
    * ``OPOP``: minimal O(1|2)A - O(1|2)B distance (ATP and ADP only).

    A histogram is drawn for every file.  (Original note: "to do file output".)

    Args:
        l_ligand: ligand names; one set of output files is opened per name.
        name_folder_final: suffix of the final folder (``final_<name>``).
        debug: when true, the folders visited are printed.

    Returns:
        None.

    NOTE(review): the block structure below was reconstructed from a
    whitespace-mangled source; confirm the nesting against the original file.
    """
    pr_final = pathManage.result("final_" + name_folder_final)
    pr_enantiomer = pathManage.generatePath(pr_final + "enantiomer/")

    # PDB identifiers (4 characters) of the references already processed
    l_ref = []
    d_filout = {}
    for ligand in l_ligand :
        d_filout[ligand] = {}
        d_filout[ligand]["O3OP"]= open (pr_enantiomer + ligand + "_" + "O3OP" , "w")
        d_filout[ligand]["O4O5"]= open (pr_enantiomer + ligand + "_" + "O4O5" , "w")
        d_filout[ligand]["OPOP"]= open (pr_enantiomer + ligand + "_" + "OPOP" , "w")

    l_pr_type_ref = listdir(pr_final)
    for pr_type_ref in l_pr_type_ref :
        if debug : print "1", pr_type_ref
        # case where pr_substruct is a file not a folder
        try :
            l_pr_sub = listdir(pr_final + pr_type_ref + "/")
        except :
            continue

        for pr_sub in l_pr_sub :
            print "2", pr_sub
            # case cycle -> replace "cycle" in the list by its sub-folders;
            # the list is modified while iterated, hence the immediate break
            if pr_sub == "cycle" :
                l_pr_sub.remove ("cycle")
                l_pr_sub_cycle = listdir (pr_final + pr_type_ref + "/cycle")
                for pr_sub_cycle in l_pr_sub_cycle :
                    l_pr_sub.append ("cycle/" + pr_sub_cycle)
                break

        for pr_sub in l_pr_sub :
            # NOTE(review): on failure l_pr_ref keeps its value from the
            # previous iteration (or is undefined on the first one).
            try :
                l_pr_ref = listdir (pr_final + pr_type_ref + "/" + pr_sub)
            except :
                pass
            if debug : print "3", pr_sub

            for pr_ref in l_pr_ref :
                if debug : print "4", pr_ref
                # case no folder
                try :
                    l_file = listdir(pr_final + pr_type_ref + "/" + pr_sub + "/" + pr_ref + "/LGD/")
                except :
                    continue
                for name_file in l_file :
                    if search("LGD_REF_A",name_file) and search(".pdb",name_file):
                        #print "2222", l_ref
                        # reference already processed -> skip the whole LGD folder
                        if name_file.split("_")[3][:4] in l_ref :
                            print "!!!!!", "IN"
                            break
                        else :
                            l_ref.append (name_file.split ("_")[3][:4])
                        ligand = name_file.split ("_")[2]
                        l_atom_ligand = parsePDB.loadCoordSectionPDB(pr_final + pr_type_ref + "/" + pr_sub + "/" + pr_ref + "/LGD/" + name_file, "HETATM")
                        # start value for the minimum search below
                        d_minO3OP = 100
                        # pick the oxygen atoms of interest by atom name
                        for atom_ligand in l_atom_ligand :
                            if atom_ligand["name"] == "O4'" :
                                atom_O4 = atom_ligand
                            elif atom_ligand["name"] == "O5'" :
                                atom_O5 = atom_ligand
                            elif atom_ligand["name"] == "O3'" :
                                atom_O3 = atom_ligand
                            elif atom_ligand["name"] == "O1A" :
                                atom_O1A = atom_ligand
                            elif atom_ligand["name"] == "O2A" :
                                atom_O2A = atom_ligand
                            elif atom_ligand["name"] == "O1B" :
                                atom_O1B = atom_ligand
                            elif atom_ligand["name"] == "O2B" :
                                atom_O2B = atom_ligand
                            #elif atom_ligand["name"] == "O3B" :
                            #    atom_O3B = atom_ligand

                        # d O4 - O5 (skip the file when one of the atoms is missing)
                        try :
                            d_O4O5 = parsePDB.distanceTwoatoms(atom_O4, atom_O5)
                        except :
                            continue
                        d_filout[ligand]["O4O5"].write (pr_ref + "_" + pr_type_ref + "\t" + str (d_O4O5) + "\n")

                        # d O3 - OP
                        # NOTE(review): the non-AMP branch measures from atom_O4,
                        # not atom_O3 -- confirm this is intended.
                        for atom_ligand in l_atom_ligand :
                            if ligand == "AMP" :
                                if atom_ligand["name"] == "O1P" or atom_ligand["name"] == "O2P" or atom_ligand["name"] == "O3P" :
                                    d_tempO3OP = parsePDB.distanceTwoatoms(atom_O3, atom_ligand)
                                    if d_tempO3OP < d_minO3OP :
                                        d_minO3OP = d_tempO3OP
                                        atom_tempO3OP = deepcopy(atom_ligand)
                            else :
                                if atom_ligand["name"] == "O1A" or atom_ligand["name"] == "O2A" or atom_ligand["name"] == "O3A" :
                                    d_tempO3OP = parsePDB.distanceTwoatoms(atom_O4, atom_ligand)
                                    if d_tempO3OP < d_minO3OP :
                                        d_minO3OP = d_tempO3OP
                                        atom_tempO3OP = deepcopy(atom_ligand)
                        # NOTE(review): atom_tempO3OP is not reset per file; when
                        # no phosphate oxygen matches, the value of a previous
                        # file is reused (or a NameError is raised).
                        d_filout[ligand]["O3OP"].write (pr_ref + "_" + pr_type_ref +"_" + str(atom_tempO3OP["name"]) + "\t" + str (d_minO3OP) + "\n")

                        # d OP OP
                        d_OP = {}
                        if ligand == "ATP" or ligand == "ADP" :
                            d_OP ["O1AO1B"] = parsePDB.distanceTwoatoms(atom_O1A, atom_O1B)
                            d_OP ["O1AO2B"] = parsePDB.distanceTwoatoms(atom_O1A, atom_O2B)
                            #d_OP ["O1AO3B"] = parsePDB.distanceTwoatoms(atom_O1A, atom_O3B)
                            d_OP ["O2AO1B"] = parsePDB.distanceTwoatoms(atom_O2A, atom_O1B)
                            d_OP ["O2AO2B"] = parsePDB.distanceTwoatoms(atom_O2A, atom_O2B)
                            #d_OP ["O2AO3B"] = parsePDB.distanceTwoatoms(atom_O2A, atom_O3B)

                            d_minOPOP = min (d_OP.values())
                            #print d_minOPOP
                            # key of the minimal distance
                            k_min = [name for name, age in d_OP.items() if age == min (d_OP.values())][0]
                            #print k_min
                            d_filout[ligand]["OPOP"].write (pr_ref + "_" + pr_type_ref + "_" + str(k_min) + "\t" + str (d_minOPOP) + "\n")

                        # forget the atoms of this file so a later file cannot
                        # silently reuse them; each try stops at the first
                        # name that is not defined
                        try :
                            del d_OP
                            del atom_O1A
                            del atom_O1B
                            del atom_O2A
                            del atom_O2B
                        except :
                            pass
                        try :
                            del atom_O3
                            del atom_O4
                            del atom_O5
                        except :
                            pass

    # close files, then draw one histogram per file
    for lig in l_ligand :
        for type_dist in d_filout[lig].keys () :
            p_file = d_filout[lig][type_dist].name
            d_filout[lig][type_dist].close ()
            runOtherSoft.Rhistogram(p_file, type_dist, brk = 20)
def superpositionAllRef (l_ligand, name_folder_final, debug = 1):
    """Superimpose all reference ligands of each type on the first one found.

    The final result folder is walked (type / substructure / reference /
    ``LGD``).  For every ligand name, the first ``LGD_REF_A*.pdb`` file met
    becomes the reference; each later ligand of the same name is aligned on
    it with the transformation computed from the atoms returned by
    ``substructTools.retrieveAdenine``, written to
    ``<ligand>_superimposed.pdb`` and its RMSE against the reference is
    appended to ``<ligand>_RMSE.txt``.  A histogram is drawn per RMSE file.

    Args:
        l_ligand: ligand names; one PDB and one RMSE file is opened per name.
        name_folder_final: suffix of the final folder (``final_<name>``).
        debug: when true, the folders visited are printed.

    Returns:
        None.

    NOTE(review): the block structure below was reconstructed from a
    whitespace-mangled source; confirm the nesting against the original file.
    """
    pr_final = pathManage.result("final_" + name_folder_final)
    pr_align = pathManage.generatePath(pr_final + "refAlignement/")

    # PDB identifiers (4 characters) of the references already processed
    l_ref = []
    d_filout_pdb = {}
    d_filout_RMSE = {}
    # d_ref[ligand] = [atoms of the reference ligand, atoms from retrieveAdenine]
    d_ref = {}
    l_file_RMSE = []
    for ligand in l_ligand :
        d_filout_pdb[ligand] = open (pr_align + ligand + "_" + "superimposed.pdb" , "w")
        d_filout_RMSE[ligand] = open (pr_align + ligand + "_" + "RMSE.txt" , "w")
        l_file_RMSE.append (pr_align + ligand + "_" + "RMSE.txt")

    l_pr_type_ref = listdir(pr_final)
    for pr_type_ref in l_pr_type_ref :
        if debug : print "1", pr_type_ref
        # case where pr_substruct is a file not a folder
        try :
            l_pr_sub = listdir(pr_final + pr_type_ref + "/")
        except :
            continue

        for pr_sub in l_pr_sub :
            print "2", pr_sub
            # case cycle -> replace "cycle" in the list by its sub-folders;
            # the list is modified while iterated, hence the immediate break
            if pr_sub == "cycle" :
                l_pr_sub.remove ("cycle")
                l_pr_sub_cycle = listdir (pr_final + pr_type_ref + "/cycle")
                for pr_sub_cycle in l_pr_sub_cycle :
                    l_pr_sub.append ("cycle/" + pr_sub_cycle)
                break

        for pr_sub in l_pr_sub :
            # NOTE(review): on failure l_pr_ref keeps its value from the
            # previous iteration (or is undefined on the first one).
            try :
                l_pr_ref = listdir (pr_final + pr_type_ref + "/" + pr_sub)
            except :
                pass
            if debug : print "3", pr_sub

            for pr_ref in l_pr_ref :
                if debug : print "4", pr_ref
                # case no folder
                try :
                    l_file = listdir(pr_final + pr_type_ref + "/" + pr_sub + "/" + pr_ref + "/LGD/")
                except :
                    continue
                for name_file in l_file :
                    if search("LGD_REF_A",name_file) and search(".pdb",name_file):
                        #print "2222", l_ref
                        # reference already processed -> skip the whole LGD folder
                        if name_file.split("_")[3][:4] in l_ref :
                            print "!!!!!", "IN"
                            break
                        else :
                            l_ref.append (name_file.split ("_")[3][:4])
                        ligand = name_file.split ("_")[2]
                        l_atom_ligand = parsePDB.loadCoordSectionPDB(pr_final + pr_type_ref + "/" + pr_sub + "/" + pr_ref + "/LGD/" + name_file, "HETATM", remove_H=1)
                        l_atom_adenine = substructTools.retrieveAdenine(l_atom_ligand)
                        if not ligand in d_ref.keys () :
                            # store in a temporary dictionary for the reference:
                            # the first ligand met is written as is
                            d_ref[ligand] = []
                            d_ref[ligand].append (l_atom_ligand)
                            d_ref[ligand].append (l_atom_adenine)
                            writePDBfile.coordinateSection(d_filout_pdb[ligand], l_atom_ligand, "HETATM", connect_matrix = 1)
                            continue
                        else :
                            rotation, translocation = superimpose.rigid_transform_3D(l_atom_adenine, d_ref[ligand][-1])
                            # NOTE(review): "== None" on an array-like value may
                            # compare element-wise; "is None" is presumably meant.
                            if rotation == None or translocation == None :
                                continue
                            # rotation + translation
                            l_atom_lig_rotated = superimpose.applyTranformation(rotation, translocation, l_atom_in=l_atom_ligand)
                            # write PDB file and RMSE
                            # print "============"
                            # print ligand, pr_ref
                            # print len (l_atom_lig_rotated)
                            # print len (d_ref[ligand][0])
                            # print "============"
                            # ligands with a different atom count than the reference are skipped
                            if len (l_atom_lig_rotated) != len (d_ref[ligand][0]) :
                                continue
                            writePDBfile.coordinateSection(d_filout_pdb[ligand], l_atom_lig_rotated, "HETATM", connect_matrix = 1)
                            RMSE_ligand = superimpose.rmse(d_ref[ligand][0], l_atom_lig_rotated)
                            d_filout_RMSE[ligand].write (str (pr_ref) + pr_type_ref + "\t" + str(RMSE_ligand) + "\n")

    # close files
    for lig in d_filout_pdb.keys () :
        d_filout_pdb[lig].close ()
        d_filout_RMSE[lig].close ()

    # one histogram per RMSE file
    for file_RMSE in l_file_RMSE :
        runOtherSoft.Rhistogram(file_RMSE, "RMSE_Adenine")
def qualityExtraction (l_ligand, name_folder, p_list_ligand, thresold_sheap) :
    """
    Write the file quality_extraction.txt in the result folder
    "final_<name_folder>". The report contains four sections: number of PDB
    by ligand, number of reference folders by ligand, number of queries by
    reference, and number of substituents found by ligand.

    args: -> l_ligand: list of ligand names
          -> name_folder: suffix of the result folder "final_<name>"
          -> p_list_ligand: path given to tool.parseLigandPDBList
          -> thresold_sheap: substituents with a score strictly larger than
             this value are counted in the "ShaEP" lines
    return: NONE (report + family files written, pie plots run)
    """

    pr_result = pathManage.result("final_" + name_folder)
    # the report is closed even if one of the sections fails
    with open(pr_result + "quality_extraction.txt", "w") as filout :
        _writeNumberPDB (filout, l_ligand, p_list_ligand)
        _writeNumberReferences (filout, l_ligand)
        _writeQueriesByReference (filout, l_ligand)
        _writeSubstituentCount (filout, l_ligand, thresold_sheap)


def _writeNumberPDB (filout, l_ligand, p_list_ligand) :
    """Section 1: number of PDB by ligand, without any filter."""

    filout.write ("Number PDB by ligand:\n")
    d_dataset = tool.parseLigandPDBList(p_list_ligand)
    for ligand in l_ligand :
        filout.write (str (ligand) + ": " + str (len (d_dataset[ligand])) + "\n")


def _writeNumberReferences (filout, l_ligand) :
    """Section 2: number of directories found in the result folder of each ligand."""

    filout.write ("\n*************\n\nNumber references by ligands:\n")
    for ligand in l_ligand :
        pr_result_ligand = pathManage.result(ligand)
        # NOTE(review): the counter starts at -2, presumably to discard two
        # directories that are not references -- confirm which ones
        nb_ref = -2
        for f in listdir(pr_result_ligand) :
            if path.isdir (pr_result_ligand + "/" + f) :
                nb_ref = nb_ref + 1
        filout.write (ligand + ": " + str (nb_ref) + "\n")


def _writeQueriesByReference (filout, l_ligand) :
    """Section 3: queries ("CX" files) by reference folder + family files and pie plots."""

    filout.write ("\n*************\n\nNumber means queries by references:\n")
    p_family_all = pathManage.result() + "reference_family_all.txt"
    d_family_all = {}

    with open (p_family_all, "w") as filout_family_all :
        for ligand in l_ligand :
            d_nb_query = {}
            d_family = {}
            p_filout_family = pathManage.result() + "reference_family_" + ligand + ".txt"
            p_filout_family_count = pathManage.result () + "count_family_" + ligand + ".txt"
            pr_result_ligand = pathManage.result(ligand)

            with open (p_filout_family, "w") as filout_family :
                for f in listdir(pr_result_ligand) :
                    # reference folders are named with the 4 characters of the PDB ID
                    if not (path.isdir (pr_result_ligand + "/" + f) and len (f) == 4) :
                        continue

                    # count by family, by ligand and for all ligands
                    family_ref = analysis.findFamily(f, pathManage.findFamilyFile (ligand))
                    filout_family.write ("\t".join (family_ref) + "\n")
                    family = family_ref[-1]
                    d_family[family] = d_family.get (family, 0) + 1
                    d_family_all[family] = d_family_all.get (family, 0) + 1

                    # count queries of the reference
                    d_nb_query[f] = 0
                    for file_query in listdir(pr_result_ligand + "/" + f + "/") :
                        if search ("CX",file_query) :
                            d_nb_query[f] = d_nb_query[f] + 1

            l_nb_query = list (d_nb_query.values ())
            filout.write (ligand + ": " + str(np.sum(l_nb_query)) + "\n")
            # mean and maximum are not defined without reference folder
            # (max on an empty list raises ValueError)
            if l_nb_query :
                filout.write (ligand + ": " + str(np.mean(l_nb_query)) + "+/-" + str(np.std (l_nb_query)) + "\n")
                ref_max = max (d_nb_query, key = d_nb_query.get)
                filout.write ("MAX " + str (ligand) + ": " + str (d_nb_query[ref_max]) + " " + str (ref_max) +"\n")

            # family of the ligand
            with open (p_filout_family_count, "w") as filout_family_count :
                filout_family_count.write ("\t".join(d_family.keys ()) + "\n")
                filout_family_count.write ("\t".join([str(x) for x in d_family.values ()]) + "\n")
            runOtherSoft.piePlot(p_filout_family_count)

        # all family
        filout_family_all.write ("\t".join(d_family_all.keys ()) + "\n")
        filout_family_all.write ("\t".join([str(x) for x in d_family_all.values ()]) + "\n")

    runOtherSoft.piePlot(p_family_all)


def _writeSubstituentCount (filout, l_ligand, thresold_sheap) :
    """Section 4: number of substituent files by substituted part, total and over the score threshold."""

    filout.write ("\n*************\n\nNumber of subref considered:\n")
    for ligand in l_ligand :
        d_nb_sub = {}
        d_nb_sub_sheap = {}
        pr_result_ligand = pathManage.result(ligand)

        for ref in listdir(pr_result_ligand) :
            if not (path.isdir (pr_result_ligand + "/" + ref) and len (ref) == 4) :
                continue
            for file_query in listdir(pr_result_ligand + "/" + ref + "/") :
                if not (search ("substituent",file_query) and search (".pdb",file_query)) :
                    continue
                atom_substituate = file_query.split ("_")[-2]
                # the score is read from the last field of the file name;
                # files without numeric score are ignored
                try :
                    value_sheap = float(file_query.split ("_")[-1][:-4])
                except ValueError :
                    continue

                d_nb_sub[atom_substituate] = d_nb_sub.get (atom_substituate, 0) + 1
                if value_sheap > thresold_sheap :
                    d_nb_sub_sheap[atom_substituate] = d_nb_sub_sheap.get (atom_substituate, 0) + 1

        filout.write ("\n" + ligand + "\n")
        for atom_substituate in d_nb_sub.keys () :
            filout.write (atom_substituate + ": " + str (d_nb_sub[atom_substituate]) + "\n")
            filout.write (atom_substituate + " ShaEP: " + str (d_nb_sub_sheap.get (atom_substituate, 0)) + "\n")
def countingSubstituent (name_final, debug = 1):
    """
    Walk through the result folder "final_<name_final>" and count what was
    extracted, then write the counts in the sub folder "counting/".

    Folders visited: <type subref>/<sub>/<PDB ref folder>/{LGD,LSR,BS}/, the
    folder "cycle" being replaced by its own sub folders.

    args: -> name_final: suffix of the result folder "final_<name>"
          -> debug: when true, print the paths that are visited
    return: NONE. Files written: one file by key of d_count (+ pie plot),
            count_ligand (+ barplot), CountByLigandRef and count_pr
    """

    pr_final_folder = pathManage.result("final_" + name_final)

    d_count = {}     # LSR_ file, second field of the name -> {sub folder: number of files}
    d_lig = {}       # LGD file, second field of the name -> {"count", "group", "family"}
    d_by_ref = {}    # reference ligand -> {type of sub reference: number of LSR reference files}
    d_count_pr = {}  # reference ligand -> {"pr ref", "pr queries", "lig queries", family: count}

    l_file_final = listdir(pr_final_folder)
    if debug : print "1", pr_final_folder
    for pr_type_subref in l_file_final :
        # case where pr type is a file and not a folder
        try :
            l_pr_sub = listdir(pr_final_folder + pr_type_subref + "/")
        except :
            continue
        if debug: print "2",pr_final_folder + pr_type_subref + "/"

        # case cycle -> the folder is replaced by its sub folders
        if "cycle" in l_pr_sub :
            l_pr_sub.remove ("cycle")
            l_second_sub = listdir (pr_final_folder + pr_type_subref + "/cycle/")
            for second_sub in l_second_sub :
                l_pr_sub.append ("cycle/" + second_sub)

        for pr_sub in l_pr_sub :
            # entries that cannot be listed (files) are skipped
            try :
                l_pr_PDBref = listdir(pr_final_folder + pr_type_subref + "/" + pr_sub + "/")
            except :
                continue
            if debug : print "3", pr_final_folder + pr_type_subref, pr_sub

            for pr_PDBref in l_pr_PDBref :
                # folder name is split on "_" and "-" to retrieve the fields
                # NOTE(review): PDB_ref is not used in the rest of the function
                PDB_ref = pr_PDBref.split ("_")[-1]
                family_ref = pr_PDBref.split ("-")[0]
                group_ref = pr_PDBref.split ("_")[0].split ("-")[-1]
                pr_LGD = pr_final_folder + pr_type_subref + "/" + pr_sub + "/" + pr_PDBref + "/LGD/"
                pr_LSR = pr_final_folder + pr_type_subref + "/" + pr_sub + "/" + pr_PDBref + "/LSR/"
                pr_BS = pr_final_folder + pr_type_subref + "/" + pr_sub + "/" + pr_PDBref + "/BS/"
                if debug :
                    print "4",pr_LGD
                    print "4", pr_BS
                    print "4", pr_LSR

                ################
                #  folder LSR  #
                ################
                l_file_LSR = listdir (pr_LSR)
                for file_LSR in l_file_LSR :
                    # LSR that is not a reference -> count by sub folder
                    if search ("LSR_", file_LSR) and file_LSR.split ("_")[1] != "REF" :
                        ligand_sub = file_LSR.split ("_")[1]
                        if debug : print "5", file_LSR
                        if not ligand_sub in d_count.keys () :
                            d_count[ligand_sub] = {}
                        if not pr_sub in d_count[ligand_sub].keys () :
                            d_count[ligand_sub][pr_sub] = 0
                        d_count[ligand_sub][pr_sub] = d_count[ligand_sub][pr_sub] + 1

                    # other LSR files, only the references are counted
                    elif search ("LSR", file_LSR):
                        # case LSR reference -> count by reference ligand and type
                        if search ("REF_", file_LSR) :
                            lig_ref = file_LSR.split ("_")[2][:3]
                            if not lig_ref in d_by_ref.keys () :
                                d_by_ref[lig_ref] = {}
                            type_ref = pr_type_subref.split ("_")[0]
                            if not type_ref in d_by_ref[lig_ref].keys () :
                                d_by_ref[lig_ref][type_ref] = 0
                            d_by_ref[lig_ref][type_ref] = d_by_ref[lig_ref][type_ref] + 1

                #################
                #  folder LGD   #
                #################
                l_file_LGD = listdir(pr_LGD)
                for file_LGD in l_file_LGD :
                    if search ("LGD", file_LGD):
                        ligand = file_LGD.split ("_")[1]
                        # reference ligands are not counted here
                        if ligand == "REF" :
                            continue
                        if not ligand in d_lig.keys () :
                            d_lig[ligand] = {}
                            d_lig[ligand]["count"] = 0
                            d_lig[ligand]["group"] = []
                            d_lig[ligand]["family"] = []
                        d_lig[ligand]["count"] = d_lig[ligand]["count"] + 1
                        d_lig[ligand]["family"].append (str(family_ref))
                        d_lig[ligand]["group"].append (str(group_ref))

                ###############
                #  folder BS  #
                ###############
                l_file_BS = listdir(pr_BS)
                # BS -> reference
                for file_BS in l_file_BS :
                    if search ("BS_REF", file_BS):
                        lig_ref = file_BS.split ("_")[2]
                        pr_ref = file_BS.split ("_")[3].split (".")[0]
                        print lig_ref, pr_ref, "*****"
                        if not lig_ref in d_count_pr.keys () :
                            d_count_pr[lig_ref] = {}
                            d_count_pr[lig_ref]["pr ref"] = []
                            d_count_pr[lig_ref]["pr queries"] = []
                            d_count_pr[lig_ref]["lig queries"] = []
                        # a reference protein is counted one time by ligand
                        if not pr_ref in d_count_pr[lig_ref]["pr ref"] :
                            d_count_pr[lig_ref]["pr ref"].append (pr_ref)
                            # family counters share the dictionary of the ligand
                            # with the three list keys above; a failure of the
                            # family search is ignored
                            try:
                                family = analysis.findFamily (pr_ref, pathManage.dataset (lig_ref) + "family_PDB.txt")
                                if not family in d_count_pr[lig_ref].keys () :
                                    d_count_pr[lig_ref][family] = 0
                                d_count_pr[lig_ref][family] = d_count_pr[lig_ref][family] + 1
                            except:
                                pass

                # BS -> query
                for file_BS in l_file_BS :
                    # for not reference BS
                    if not search ("BS_REF", file_BS) :
                        lig_querie = file_BS.split ("_")[1]
                        prot_querie = file_BS.split ("_")[2][0:4]
                        print prot_querie, lig_querie, "*******"
                        # the ligand reference is the one defined in the previous loop
                        # NOTE(review): if the folder contains no BS_REF file,
                        # lig_ref still holds the value of an earlier step (LSR
                        # section or previous folder) -- confirm this cannot happen
                        d_count_pr[lig_ref]["pr queries"].append (prot_querie)
                        d_count_pr[lig_ref]["lig queries"].append (lig_querie)

    ##################
    # write and plot #
    ##################
    pr_result = pathManage.generatePath(pr_final_folder + "counting/")

    # one file by key of d_count: header with the sub folders, then the counts
    for ligand_sub in d_count.keys () :
        p_filout = pr_result + ligand_sub
        filout = open (p_filout, "w")
        filout.write ("\t".join(d_count[ligand_sub].keys ()) + "\n")
        l_value = [str(x) for x in d_count[ligand_sub].values ()]
        filout.write ("\t".join(l_value) + "\n")
        filout.close ()
        runOtherSoft.piePlot(p_filout)

    # count by query ligand
    filout_lig = open (pr_result + "count_ligand", "w")
    filout_lig.write ("Ligand ID\tNumber of occurences in the dataset\tNumber of different clusters\tList of clusters\tList of protein families\n")
    for lig in d_lig.keys () :
        # NOTE(review): the dictionary of the ligand itself is compared with 1,
        # the occurrence count is not read by this test; presumably
        # d_lig[lig]["count"] > 1 was meant -- confirm before changing
        if d_lig[lig] > 1 :
            filout_lig.write (str (lig) + "\t" + str (d_lig[lig]["count"]) + "\t" + str(len (list (set(d_lig[lig]["group"])))) + "\t" + " ".join (d_lig[lig]["group"]) + "\t" + " ".join (d_lig[lig]["family"]) + "\n")
    filout_lig.close ()

    # count of LSR references by reference ligand
    filout_LSR_lig = open (pr_result + "CountByLigandRef", "w")
    for lig_ref in d_by_ref.keys () :
        filout_LSR_lig.write ("====" + str (lig_ref) + "====\n")
        for sub_ref in d_by_ref[lig_ref].keys () :
            filout_LSR_lig.write (str (sub_ref) + ": " + str (d_by_ref[lig_ref][sub_ref]) + "\n")
    filout_LSR_lig.close ()

    # count of proteins by reference ligand
    filout_pr_count = open (pr_result + "count_pr", "w")
    for lig in d_count_pr.keys () :
        filout_pr_count.write ("====" + str (lig) + "====\n")
        filout_pr_count.write ("nb ref pr: " + str (len (d_count_pr[lig]["pr ref"])) + "\n")
        filout_pr_count.write ("nb querie pr: " + str (len (d_count_pr[lig]["pr queries"])) + "\n")
        filout_pr_count.write ("nb ligand queries: " + str (len (d_count_pr[lig]["lig queries"])) + "\n")
        # every other key of the dictionary is a family counter
        for family in d_count_pr[lig].keys () :
            if family != "pr ref" and family != "pr queries" and family != "lig queries" :
                filout_pr_count.write ("Ref " + str (family) + ": " + str (d_count_pr[lig][family]) + "\n")
    filout_pr_count.close ()

    runOtherSoft.barplot(pr_result + "count_ligand")
def globalArrangement (pr_orgin, p_smile, p_family, name_ligand, l_ligand_out):
    """
    Read a file of smile codes and copy, for every (query, reference, ligand)
    entry of each line, the files of the entry in the tree

        pr_orgin/<first folder>/<replacement>/<family>-<group>_<PDB ref>/{LGD,BS,LSR}/

    The type of substructure is the third field from the end of the name of
    p_smile split on "_". Each line is split on tabulations: first field is
    the smile code, the three last fields are the lists (separated by spaces)
    of PDB queries, PDB references and ligands.

    args: -> pr_orgin: root of the tree built, used as string prefix
          -> p_smile: path of the smile file
          -> p_family: not used by the function
          -> name_ligand: name of the reference ligand
          -> l_ligand_out: entries with a ligand of this list are ignored
    return: 1
    """

    subst = p_smile.split ("_")[-3]

    with open (p_smile, "r") as filin :
        l_line_smile = filin.readlines ()

    for line_smile in l_line_smile :
        # a blank line has no field to parse
        if line_smile.strip () == "" :
            continue

        l_elem = line_smile.split ("\t")
        smile = l_elem[0]
        l_PDB_query = l_elem[-3].split (" ")
        l_PDB_ref = l_elem[-2].split (" ")
        l_ligand = line_smile.strip().split ("\t")[-1].split (" ")

        # first folder: type of substructure, with suffix when smallLSR returns 1
        small_LSR = smileAnalysis.smallLSR (smile)
        if subst == "ribose" :
            first_folder = "ribose"
        else :
            first_folder = "Pi"
        if small_LSR == 1 :
            first_folder = first_folder + "_small"

        print (" ".join ([str (x) for x in (smile, l_PDB_query, l_PDB_ref, l_ligand, subst, small_LSR)]))

        # search replacement
        replacement, metal = smileAnalysis.searchReplacement (smile, l_PDB_query[0], l_PDB_ref[0], name_ligand)

        # case with cycle -> second search, used as sub folder
        if replacement == "cycle" :
            replacement2, metal = smileAnalysis.searchReplacement (smile, l_PDB_query[0], l_PDB_ref[0], name_ligand, in_cycle = 1)
            replacement = replacement + "/" + replacement2

        # case metal
        if replacement == "metal" :
            print (" ".join ([str (x) for x in (metal, l_PDB_query, l_PDB_ref, name_ligand)]))

        for i in range (len (l_PDB_ref)) :

            # exclusion of ligand out
            if l_ligand[i] in l_ligand_out :
                continue

            group, family = analysis.findFamilyAndGroup(l_PDB_ref[i])

            # folder reference
            pr_dataset = pathManage.dataset(name_ligand + "/" + l_PDB_ref[i])
            PDB_ref = pathManage.findPDBRef(pr_dataset)
            p_ligand_ref = pathManage.findligandRef(pr_dataset , name_ligand)

            # substructure of the reference; reset for each entry, so that the
            # file of a previous entry is never copied by mistake
            p_frag_ref = None
            for f_ref in pathManage.findSubstructRef(pr_dataset, name_ligand) :
                if search (subst, f_ref) :
                    p_frag_ref = f_ref
                    break
            if p_frag_ref is None :
                print ("[WARNING] no substructure " + str (subst) + " for the reference " + str (l_PDB_ref[i]) + " -> entry ignored")
                continue

            # folder query
            pr_result = pathManage.result(name_ligand + "/" + l_PDB_ref[i])

            if replacement != "metal" :
                p_lig_query = pathManage.findligandQuery(pr_dataset , l_ligand[i], l_PDB_query[i])
            else :
                p_lig_query = pathManage.findligandQuery(pr_dataset ,metal, l_PDB_query[i])

            # need apply transloc matrix, entry ignored if the matrix cannot be applied
            matrix_transloc = pathManage.findMatrix(p_ligand_ref, p_lig_query, name_ligand)
            lig_query_parsed = parsePDB.loadCoordSectionPDB(p_lig_query)
            try :
                superposeStructure.applyMatrixLigand(lig_query_parsed, matrix_transloc)
            except Exception :
                continue

            p_lig_substituate = pathManage.findSubstructFind(pr_result, l_ligand[i], l_PDB_query[i], subst)

            # binding site of the query, same reset than for the substructure
            p_BS = None
            for BS in pathManage.findFileBS(pr_result, l_PDB_query[i]) :
                if search (l_ligand[i], BS) :
                    p_BS = BS
                    break
            if p_BS is None :
                print ("[WARNING] no binding site file for " + str (l_ligand[i]) + " " + str (l_PDB_query[i]) + " -> entry ignored")
                continue

            # folders of the entry: family + group + PDB reference
            pr_final = pr_orgin + first_folder + "/" + replacement + "/" + str (family) + "-" + str (group) + "_" + l_PDB_ref[i] + "/"
            pr_ligand = pr_final + "LGD/"
            pr_BS = pr_final + "BS/"
            pr_sust = pr_final + "LSR/"
            for pr_dir in (pr_final, pr_ligand, pr_BS, pr_sust) :
                if not path.isdir(pr_dir):
                    makedirs (pr_dir)

            # list of smile, mode "a" creates the file when it does not exist
            with open (pr_sust + "list.smile", "a") as file_smile_queries :
                file_smile_queries.write (str(smile) + "\n")

            name_lig_query = p_lig_query.split ("/")[-1]
            name_lig_ref = p_ligand_ref.split ("/")[-1]

            # ligand of the query (coordinates after transformation) + smile
            writePDBfile.coordinateSection(pr_ligand + "LGD_" + name_lig_query, lig_query_parsed, recorder = "HETATM", header = "LCG_" + name_lig_query, connect_matrix = 1)
            runOtherSoft.babelConvertPDBtoSMILE(pr_ligand + "LGD_" + name_lig_query, clean_smi = 1)

            # ligand of reference + smile
            copy2(p_ligand_ref, pr_ligand + "LGD_REF_" + name_lig_ref)
            runOtherSoft.babelConvertPDBtoSMILE(pr_ligand + "LGD_REF_" + name_lig_ref)

            # LSR of reference
            copy2(p_frag_ref, pr_sust + "LSR_REF_" + name_ligand + "_" + l_PDB_ref[i] + ".pdb")

            # LSR query -> the name of the query ligand is only used to name the file
            copy2(p_lig_substituate, pr_sust + "LSR_" + subst + "_" + name_lig_query)

            # BS query
            copy2(p_BS, pr_BS)

            # BS from reference
            l_atom_BS = parsePDB.computeBS (PDB_ref, p_ligand_ref, thresold = 4.50, option_onlyATOM = 0)
            writePDBfile.coordinateSection(pr_BS + "BS_REF_" + name_ligand + "_" + PDB_ref.split ("/")[-1], l_atom_BS, recorder = "ATOM", header = "BS_REF_" + name_ligand + "_" + PDB_ref, connect_matrix = 0)

    return 1
def main (name_database, max_distance = 5.0, RX = 3.00, RFree = 0.25, option_superimpose = 0, option_on_complexes_by_ligand = 0, option_bond = 0, option_stat = 0, option_stat_dataset = 0, option_merge = 0, verbose = 1):
    """
    Run the analysis for one database: search of the ligands, building of
    the datasets, search of the neighbors for every dataset, then the
    optional steps selected with the option_* arguments (value 1 = run).

    args: -> name_database: name of the database, also name of the result folder
          -> max_distance: distance given to the neighbor search, cast to float
          -> RX, RFree: values given to datasetFinal.Builder and used to
             name the folder of the merged datasets
          -> option_superimpose: superimpose neighbors by type of substructure
          -> option_on_complexes_by_ligand: given to datasetFinal.Builder,
             select also the folder "_uniquePDB" for the merge
          -> option_bond: planarity and length of bond analysis
          -> option_stat: global statistics
          -> option_stat_dataset: parse every dataset before the run
          -> option_merge: merge dataset_1.50 and dataset_3.00
          -> verbose: if 1, print the paths used
    return: NONE
    """

    # format input
    max_distance = float (max_distance)

    # folder of the database and ligands in the PDB
    pr_database = pathManage.result (name_database)
    searchPDB.ligands(name_database, pr_database)

    # list of datasets, one by resolution
    l_p_dataset = datasetFinal.Builder(name_database, RX, RFree, option_on_complexes_by_ligand)

    # parsing of the datasets
    if option_stat_dataset == 1 :
        for p_dataset in l_p_dataset :
            statistic.ParseDataSet(p_dataset)

    # run for every dataset -> one result directory by dataset
    for p_dataset in l_p_dataset :
        # path of the dataset without extension
        pr_dataset = p_dataset[:-4]
        pr_result = pathManage.CreatePathDir(pr_dataset + "/")
        pr_hetion = pathManage.CreatePathDir(pr_dataset + "/HET/")

        if verbose == 1 :
            for line_control in ("== control path Main ==", pr_result, pr_hetion, "=======================") :
                print (line_control)

        # build the structure of neighbors, without filter
        d_sub_neighbor = searchPDB.globalSearch(max_distance, p_dataset, pr_result)

        # remove iron close: the call is done between the two control prints
        # of the size of d_sub_neighbor["I"]; its return value is not used
        print ("control-1" + " " + str (len(d_sub_neighbor["I"])))
        hetCloseAnalysis.removeNeighborIron (d_sub_neighbor, pr_hetion + "ionSummarySubstruct.txt")
        print ("control-2" + " " + str (len(d_sub_neighbor["I"])))

        # superimpose neighbors by type of substructure
        if option_superimpose == 1 :
            for type_substruct in ("I", "II", "III", "IMD", "GAI", "COO") :
                superimpose.globalNeighbor (d_sub_neighbor, type_substruct, pr_result)

        # planarity imidazole + guanidium, then length of bonds
        if option_bond == 1 :
            statistic.planarityImidazole (d_sub_neighbor, pr_result)
            statistic.planarityGuanidium (d_sub_neighbor, pr_result)
            for type_substruct in ("I", "II", "III") :
                statistic.lenBondAnalysis(d_sub_neighbor, type_substruct, pr_result)

        # statistic
        if option_stat == 1:
            statistic.globalRunStatistic(d_sub_neighbor, max_distance, pr_result)

    # merge of the two datasets
    if option_merge == 1:
        name_merge = name_database + "/" + str (RX) + "_" + str (RFree)
        if option_on_complexes_by_ligand == 1:
            statistic.MergeDataSet(pathManage.result (name_merge + "_uniquePDB"), "dataset_1.50", "dataset_3.00")
            statistic.MergeDataSet(pathManage.result (name_merge + "_uniquePDB"), "dataset_1.50", "dataset_3.00", arom = 1)
        else :
            statistic.MergeDataSet(pathManage.result (name_merge), "dataset_1.50", "dataset_3.00")
def analysisBS (name_lig, ID_seq = '0.0', debug = 1):
    """
    Compare the binding site of every reference of a ligand with the binding
    sites of its queries and write the RMSD tables in the result folder
    "<name_lig>/sameBS".

    One table is written for the whole ligand ("global") and one by type of
    substructure; summary.txt contains the counters and log.txt the
    alignment files that cannot be parsed. A histogram is run on every file
    opened in d_file_BS.

    args: -> name_lig: name of the reference ligand
          -> ID_seq: threshold compared with the "IDseq" value of the alignment
          -> debug: if true, print the folders and files visited
    return: 1
    """

    header_table = "name_bs\tRMSD_prot\tRMSD_BS_ca\tRMSD_BS_all\tD_max\tl_at_BS\tidentic\n"

    def lineTable (p_struct_ref, p_prot_query, RMSD_prot, l_RMSD) :
        # name built with both file names, without path and extension
        name_bs = p_struct_ref.split("/")[-1][0:-4] + "_*_" + p_prot_query.split ("/")[-1][0:-4]
        l_col = [RMSD_prot, l_RMSD[1], l_RMSD[0], l_RMSD[2], l_RMSD[-2], l_RMSD[-1]]
        return name_bs + "\t" + "\t".join ([str (col) for col in l_col]) + "\n"

    pr_result = pathManage.result(name_lig)
    pr_out = pathManage.result(name_lig + "/sameBS")

    # log file
    filout_log = open (pr_out + "log.txt", "w")

    # dictionary with the output files
    d_file_BS = {}
    d_file_BS["global"] = open (pr_out + name_lig + "_", "w")
    d_file_BS["global"].write (header_table)
    d_file_BS["summary"] = open (pr_out + "summary.txt", "w")

    pr_dataset = pathManage.dataset(name_lig)

    nb_BS = 0
    nb_BS_filtered = 0
    nb_same_BS = 0

    for PDB_ref in listdir(pr_result) :
        if debug : print (PDB_ref)
        # reference folders are named with 4 characters
        if len (PDB_ref) != 4 :
            continue

        pr_dataset_ref = pr_dataset + PDB_ref + "/"
        p_pdb_ref = pathManage.findPDBRef(pr_dataset_ref)
        l_p_query = pathManage.findPDBQueryTransloc (pathManage.result(name_lig) + PDB_ref + "/")
        if debug : print (l_p_query)

        for p_query in l_p_query :
            name_query = p_query.split ("/")[-1]
            if debug : print (name_query[7:-4])

            # read TM Align
            p_TMalign = pathManage.alignmentOutput(name_lig) + p_pdb_ref.split ("/")[-1][0:-4] + "__" + name_query[7:-4] + "/RMSD"
            try :
                score_align = parseTMalign.parseOutputTMalign(p_TMalign)
            except :
                filout_log.write ("ERROR TM align " + p_TMalign + "\n")
                continue

            nb_BS = nb_BS + 1

            # NOTE(review): the default value of ID_seq is a string, the type
            # of score_align["IDseq"] is not visible here -- confirm that both
            # sides of the comparison are numbers
            if not score_align["IDseq"] >= ID_seq :
                continue
            nb_BS_filtered = nb_BS_filtered + 1

            # BS by substructure
            for p_substruct_ref in pathManage.findSubstructRef (pr_dataset_ref, name_lig) :
                struct_substitued = p_substruct_ref.split ("_")[-2]

                # first time the type is met -> open file and write header
                if struct_substitued not in d_file_BS :
                    d_file_BS[struct_substitued] = open (pr_out + name_lig + "_" + struct_substitued + "_", "w")
                    d_file_BS[struct_substitued].write (header_table)

                RMSD_bs = analysis.computeRMSDBS (p_pdb_ref, p_query, p_substruct_ref, pr_out)
                if RMSD_bs != [] :
                    d_file_BS[struct_substitued].write (lineTable (p_substruct_ref, p_query, score_align["RMSD"], RMSD_bs))

            # BS of the complete ligand
            p_ligand_ref = pathManage.findligandRef(pr_dataset_ref, name_lig)
            RMSD_bs_lig = analysis.computeRMSDBS (p_pdb_ref, p_query, p_ligand_ref, pr_out)
            if RMSD_bs_lig != [] :
                d_file_BS["global"].write (lineTable (p_ligand_ref, p_query, score_align["RMSD"], RMSD_bs_lig))
                if RMSD_bs_lig [-1] == 1 :
                    nb_same_BS = nb_same_BS + 1

    # write summary
    filout_summary = d_file_BS["summary"]
    filout_summary.write ("BS global: " + str (nb_BS) + "\n")
    filout_summary.write ("BS - IDseq " + str (ID_seq) + "%: " + str (nb_BS_filtered) + "\n")
    filout_summary.write ("BS - same atom number: " + str (nb_same_BS) + "\n")

    filout_log.close ()

    # maximum of RMSD of the histograms, by ligand
    d_max_RMSD = {"ATP": 5.0, "ADP": 4.0, "AMP": 4.0}
    max_RMSD_lig = d_max_RMSD.get (name_lig, 3.5)

    # close files and run histograms
    for filout_BS in d_file_BS.values () :
        p_file = filout_BS.name
        filout_BS.close ()
        runOtherSoft.RhistogramRMSD(p_file, max_RMSD = max_RMSD_lig)

    return 1
else : i = i + 1 if len (l_ref) != 0 : filout_smile.write (str(l_elem1[0]) + "\t" + str (len (l_queries)) + "\t" + " ".join(l_queries) + "\t" + " ".join (l_ref) + "\t" + " ".join (l_lig) + "\n") filout_smile.close () #################### ### MAIN ##### #################### # constante thresold_RX = 2.7 thresold_BS = 4.5 thresold_blast = 1e-100 thresold_superimposed_ribose = 2.5 thresold_superimposed_pi = 3 thresold_IDseq = 100 thresold_shaep = 0.2 l_ligand_out = ["AMP", "ADP", "ATP", "TTP", "DCP", "DGT", "DTP", "DUP", "ACP", "AD9", "NAD", "AGS", "UDP", "POP", "APC", "CTP", "AOV"] # main # ######## pr_result = pathManage.result() #cleanResultFolder (thresold_shaep, l_ligand_out, pr_result) cleanSmileFile (thresold_shaep, l_ligand_out, pr_result)
def SheapScoreToClass(prclassif):
    """
    Write, for every LSR of a classification folder, its class and the ShaEP
    similarity scores, then plot the table.

    The folder is organised as <class>/<reference folder>/LSR/, the class
    "cycle" having one more level (cycle/<sub type>/<reference folder>/LSR/).
    Scores are read in the .hit files of the result folder of the reference
    ligand; LSR without .hit file are ignored.

    args: -> prclassif: path of the classification folder, without final "/"
    return: NONE (table "PiLSRType" written in the result folder
            "<last part of prclassif>_SheapClassif" and given to
            runOtherSoft.plotClassifSheap)
    """

    nameREF = prclassif.split("/")[-1]
    prout = pathManage.result(nameREF + "_SheapClassif")
    pfilout = prout + "PiLSRType"

    # list of reference folders, with one more level for the class cycle
    lprref = []
    for foldergroup in listdir(prclassif):
        if foldergroup == "cycle":
            lprgroup = [prclassif + "/cycle/" + subtype for subtype in listdir(prclassif + "/cycle/")]
        else:
            lprgroup = [prclassif + "/" + foldergroup]
        for prgroup in lprgroup:
            # files stored beside the class folders cannot be listed
            if not path.isdir(prgroup):
                continue
            for refprot in listdir(prgroup):
                lprref.append(prgroup + "/" + refprot)

    # the table is closed before the plot, also in case of error
    with open(pfilout, "w") as filout:
        filout.write("ClassLSR\tESP\tshape\tName\n")

        for reffolder in lprref:
            lelemfolder = reffolder.split("/")
            if lelemfolder[-3] == "cycle":
                classif = lelemfolder[-3] + "-" + lelemfolder[-2]
            else:
                classif = lelemfolder[-2]

            # PDB reference -> last field of the folder name
            PDBref = lelemfolder[-1].split("_")[-1]

            lLSR = listdir(reffolder + "/LSR")

            # ligand of reference, read in the name of the LSR reference file
            lgdREF = ""
            for fileLSR in lLSR:
                if "LSR_REF" in fileLSR:
                    lgdREF = fileLSR.split("_")[2]
                    break
            if lgdREF == "":
                # without reference ligand the .hit files cannot be located
                print("Error reference l.49 classifResult.py")
                continue

            for fileLSR in lLSR:
                # plain substring test, "." is not a wildcard here
                if ".pdb" not in fileLSR:
                    continue
                lelemsplit = fileLSR.split("_")
                typeLSR = lelemsplit[1]
                if typeLSR == "REF":
                    continue
                lgd = lelemsplit[2]
                # NOTE(review): fourth field is used as it is in the file name,
                # extension possibly included -- confirm the name of the .hit files
                PDBLSR = lelemsplit[3]

                # file sheap in result folder
                nameLSR = lgd + "_" + PDBLSR + "_" + typeLSR
                psheap = pathManage.result() + lgdREF + "/" + PDBref + "/substituent_" + nameLSR + ".hit"
                if not path.exists(psheap):
                    continue

                dsheap = parseShaep.parseOutputShaep(psheap)
                filout.write(classif + "\t" + str(dsheap["ESP_similarity"]) + "\t" + str(dsheap["shape_similarity"]) + "\t" + nameLSR + "\n")

    # plot R
    runOtherSoft.plotClassifSheap(pfilout)
def analysisBS(name_lig, ID_seq='0.0', debug=1):
    """
    Compare the binding site of every reference of a ligand with the binding
    sites of its queries and write the RMSD tables in the result folder
    "<name_lig>/sameBS".

    One table is written for the whole ligand ("global") and one by type of
    substructure; summary.txt contains the counters and log.txt the
    alignment files that cannot be parsed. A histogram is run on every file
    opened in d_file_BS.

    args: -> name_lig: name of the reference ligand
          -> ID_seq: threshold compared with the "IDseq" value of the alignment
          -> debug: if true, print the folders and files visited
    return: 1
    """

    header_table = "name_bs\tRMSD_prot\tRMSD_BS_ca\tRMSD_BS_all\tD_max\tl_at_BS\tidentic\n"

    def lineTable(p_struct_ref, p_prot_query, RMSD_prot, l_RMSD):
        # name built with both file names, without path and extension
        name_bs = p_struct_ref.split("/")[-1][0:-4] + "_*_" + p_prot_query.split("/")[-1][0:-4]
        l_col = [RMSD_prot, l_RMSD[1], l_RMSD[0], l_RMSD[2], l_RMSD[-2], l_RMSD[-1]]
        return name_bs + "\t" + "\t".join([str(col) for col in l_col]) + "\n"

    pr_result = pathManage.result(name_lig)
    pr_out = pathManage.result(name_lig + "/sameBS")

    # log file
    filout_log = open(pr_out + "log.txt", "w")

    # dictionary with the output files
    d_file_BS = {}
    d_file_BS["global"] = open(pr_out + name_lig + "_", "w")
    d_file_BS["global"].write(header_table)
    d_file_BS["summary"] = open(pr_out + "summary.txt", "w")

    pr_dataset = pathManage.dataset(name_lig)

    nb_BS = 0
    nb_BS_filtered = 0
    nb_same_BS = 0

    for PDB_ref in listdir(pr_result):
        if debug: print(PDB_ref)
        # reference folders are named with 4 characters
        if len(PDB_ref) != 4:
            continue

        pr_dataset_ref = pr_dataset + PDB_ref + "/"
        p_pdb_ref = pathManage.findPDBRef(pr_dataset_ref)
        l_p_query = pathManage.findPDBQueryTransloc(pathManage.result(name_lig) + PDB_ref + "/")
        if debug: print(l_p_query)

        for p_query in l_p_query:
            name_query = p_query.split("/")[-1]
            if debug: print(name_query[7:-4])

            # read TM Align
            p_TMalign = pathManage.alignmentOutput(name_lig) + p_pdb_ref.split("/")[-1][0:-4] + "__" + name_query[7:-4] + "/RMSD"
            try:
                score_align = parseTMalign.parseOutputTMalign(p_TMalign)
            except:
                filout_log.write("ERROR TM align " + p_TMalign + "\n")
                continue

            nb_BS = nb_BS + 1

            # NOTE(review): the default value of ID_seq is a string, the type
            # of score_align["IDseq"] is not visible here -- confirm that both
            # sides of the comparison are numbers
            if not score_align["IDseq"] >= ID_seq:
                continue
            nb_BS_filtered = nb_BS_filtered + 1

            # BS by substructure
            for p_substruct_ref in pathManage.findSubstructRef(pr_dataset_ref, name_lig):
                struct_substitued = p_substruct_ref.split("_")[-2]

                # first time the type is met -> open file and write header
                if struct_substitued not in d_file_BS:
                    d_file_BS[struct_substitued] = open(pr_out + name_lig + "_" + struct_substitued + "_", "w")
                    d_file_BS[struct_substitued].write(header_table)

                RMSD_bs = analysis.computeRMSDBS(p_pdb_ref, p_query, p_substruct_ref, pr_out)
                if RMSD_bs != []:
                    d_file_BS[struct_substitued].write(lineTable(p_substruct_ref, p_query, score_align["RMSD"], RMSD_bs))

            # BS of the complete ligand
            p_ligand_ref = pathManage.findligandRef(pr_dataset_ref, name_lig)
            RMSD_bs_lig = analysis.computeRMSDBS(p_pdb_ref, p_query, p_ligand_ref, pr_out)
            if RMSD_bs_lig != []:
                d_file_BS["global"].write(lineTable(p_ligand_ref, p_query, score_align["RMSD"], RMSD_bs_lig))
                if RMSD_bs_lig[-1] == 1:
                    nb_same_BS = nb_same_BS + 1

    # write summary
    filout_summary = d_file_BS["summary"]
    filout_summary.write("BS global: " + str(nb_BS) + "\n")
    filout_summary.write("BS - IDseq " + str(ID_seq) + "%: " + str(nb_BS_filtered) + "\n")
    filout_summary.write("BS - same atom number: " + str(nb_same_BS) + "\n")

    filout_log.close()

    # maximum of RMSD of the histograms, by ligand
    d_max_RMSD = {"ATP": 5.0, "ADP": 4.0, "AMP": 4.0}
    max_RMSD_lig = d_max_RMSD.get(name_lig, 3.5)

    # close files and run histograms
    for filout_BS in d_file_BS.values():
        p_file = filout_BS.name
        filout_BS.close()
        runOtherSoft.RhistogramRMSD(p_file, max_RMSD=max_RMSD_lig)

    return 1
if len(l_ref) != 0: filout_smile.write( str(l_elem1[0]) + "\t" + str(len(l_queries)) + "\t" + " ".join(l_queries) + "\t" + " ".join(l_ref) + "\t" + " ".join(l_lig) + "\n") filout_smile.close() #################### ### MAIN ##### #################### # constante thresold_RX = 2.7 thresold_BS = 4.5 thresold_blast = 1e-100 thresold_superimposed_ribose = 2.5 thresold_superimposed_pi = 3 thresold_IDseq = 100 thresold_shaep = 0.2 l_ligand_out = [ "AMP", "ADP", "ATP", "TTP", "DCP", "DGT", "DTP", "DUP", "ACP", "AD9", "NAD", "AGS", "UDP", "POP", "APC", "CTP", "AOV" ] # main # ######## pr_result = pathManage.result() #cleanResultFolder (thresold_shaep, l_ligand_out, pr_result) cleanSmileFile(thresold_shaep, l_ligand_out, pr_result)
def SheapScoreToClass(prclassif):
    """
    Write, for every LSR of a classification folder, its class and the ShaEP
    similarity scores.

    The folder is organised as <class>/<reference folder>/LSR/, the class
    "cycle" having one more level (cycle/<sub type>/<reference folder>/LSR/).
    Scores are read in the .hit files of the result folder of the reference
    ligand; LSR without .hit file are ignored.

    args: -> prclassif: path of the classification folder, without final "/"
    return: NONE (table "PiLSRType" written in the result folder
            "<last part of prclassif>_SheapClassif")
    """

    nameREF = prclassif.split("/")[-1]
    prout = pathManage.result(nameREF + "_SheapClassif")
    pfilout = prout + "PiLSRType"

    # list of reference folders, with one more level for the class cycle
    lprref = []
    for foldergroup in listdir(prclassif):
        if foldergroup == "cycle":
            lprgroup = [prclassif + "/cycle/" + subtype for subtype in listdir(prclassif + "/cycle/")]
        else:
            lprgroup = [prclassif + "/" + foldergroup]
        for prgroup in lprgroup:
            # files stored beside the class folders cannot be listed
            if not path.isdir(prgroup):
                continue
            for refprot in listdir(prgroup):
                lprref.append(prgroup + "/" + refprot)

    # the table is closed also in case of error
    with open(pfilout, "w") as filout:
        filout.write("ClassLSR\tESP\tshape\tName\n")

        for reffolder in lprref:
            lelemfolder = reffolder.split("/")
            if lelemfolder[-3] == "cycle":
                classif = lelemfolder[-3] + "-" + lelemfolder[-2]
            else:
                classif = lelemfolder[-2]

            # PDB reference -> last field of the folder name
            PDBref = lelemfolder[-1].split("_")[-1]

            lLSR = listdir(reffolder + "/LSR")

            # ligand of reference, read in the name of the LSR reference file
            lgdREF = ""
            for fileLSR in lLSR:
                if "LSR_REF" in fileLSR:
                    lgdREF = fileLSR.split("_")[2]
                    break
            if lgdREF == "":
                # without reference ligand the .hit files cannot be located
                print("Error reference l.49 classifResult.py")
                continue

            for fileLSR in lLSR:
                # plain substring test, "." is not a wildcard here
                if ".pdb" not in fileLSR:
                    continue
                lelemsplit = fileLSR.split("_")
                typeLSR = lelemsplit[1]
                if typeLSR == "REF":
                    continue
                lgd = lelemsplit[2]
                # NOTE(review): fourth field is used as it is in the file name,
                # extension possibly included -- confirm the name of the .hit files
                PDBLSR = lelemsplit[3]

                # file sheap in result folder
                nameLSR = lgd + "_" + PDBLSR + "_" + typeLSR
                psheap = pathManage.result() + lgdREF + "/" + PDBref + "/substituent_" + nameLSR + ".hit"
                if not path.exists(psheap):
                    continue

                dsheap = parseShaep.parseOutputShaep(psheap)
                filout.write(classif + "\t" + str(dsheap["ESP_similarity"]) + "\t" + str(dsheap["shape_similarity"]) + "\t" + nameLSR + "\n")
# ####################### # CLASSIFICATION LSRs # ####################### name_folder_final = "withoutLig" # manageResult (["AMP", "ADP", "POP", "ATP"], name_folder_final, l_ligand_out) # arrangeResult.qualityExtraction (["AMP", "ADP", "POP", "ATP"], name_folder_final, p_list_ligand = "/home/borrel/Yue_project/resultLigandInPDB", thresold_sheap = thresold_shaep) #arrangeResult.countingSubstituent(name_folder_final) ################################################### # AFFINITY AND INTERACTIONS BY PROTEIN REFERENCE # ################################################### # folder final pr_classif = pathManage.result("final_" + name_folder_final) + "Pi_LSR" #ligandSimilarity.analyseLGDProximity(pr_classif) ######################################### # ANALYSE CLASSIFICATION BASED ON SHEAP # ######################################### classifResults.SheapScoreToClass(pr_classif) ###################### # ANALYSE REFERENCE # ###################### # analyse enantiomer # arrangeResult.enantiomer(["AMP", "ADP", "ATP"], name_folder_final) # analyse the superimposition of ligand references
def retrieveSubstructSuperimposed(name_lig, thresold_BS=4.5,
                                  thresold_superimposed_ribose=2.5,
                                  thresold_superimposed_pi=3,
                                  thresold_shaep=0.4):
    """Extract the substituents superimposed on the reference substructures.

    For every reference folder (4 character name) of the dataset of
    ``name_lig``, each query ligand file ``<LIG>_<PDB>.pdb`` is moved in
    the referential of the reference with its transformation matrix. For
    each reference substructure the neighbouring atoms of the query ligand
    are written in ``substituent_<LIG>_<PDB>_<type>.pdb``, scored against
    the substructure with ShaEP and, when the score reaches
    ``thresold_shaep``, the superimposed complex (``CX_*``), its binding
    site (``BS_*``) and the SMILES code of the substituent are produced.

    Files written in the result folder of the ligand: the log
    ``log_superimposed.txt``, the counters ``quality_extraction.txt``,
    the score tables ``shaep_global*.txt`` and one ``list_<type>_<thresold>
    _smile.txt`` per substructure type.

    :param name_lig: name of the reference ligand (dataset folder name)
    :param thresold_BS: maximal value of ``parsePDB.distanceTwoatoms``
        between an atom of the complex and an atom of the query ligand to
        put the atom in the binding site
    :param thresold_superimposed_ribose: forwarded to
        ``neighborSearch.searchNeighborAtom``
    :param thresold_superimposed_pi: forwarded to
        ``neighborSearch.searchNeighborAtom``
    :param thresold_shaep: minimal ShaEP ``best_similarity`` to keep a
        substituent
    :return: 1
    """
    header_sheap = "name\tbest_similarity\tshape_similarity\tESP_similarity\n"

    # ouput
    p_dir_dataset = pathManage.dataset(name_lig)
    p_dir_result = pathManage.result(name_lig)
    l_folder_ref = listdir(p_dir_dataset)

    # log control
    p_log = open(p_dir_result + "log_superimposed.txt", "w")

    # control extraction
    d_control = {}
    d_control["pr ref"] = 0
    d_control["lig query"] = 0
    d_control["subref"] = {}
    d_control["subref empty"] = {}
    d_control["out sheap"] = {}
    filout_control = open(p_dir_result + "quality_extraction.txt", "w")

    # stock smile code
    d_smile = {}

    # sheap control
    d_filout_sheap = {}
    d_filout_sheap["list"] = [p_dir_result + "shaep_global.txt"]
    d_filout_sheap["global"] = open(p_dir_result + "shaep_global.txt", "w")
    d_filout_sheap["global"].write(header_sheap)

    for ref_folder in l_folder_ref:
        # control folder reference name
        if len(ref_folder) != 4:
            p_log.write("[ERROR folder] -> " + ref_folder + "\n")
            continue

        # reference
        p_lig_ref = pathManage.findligandRef(
            p_dir_dataset + ref_folder + "/", name_lig)
        try:
            lig_ref_parsed = parsePDB.loadCoordSectionPDB(p_lig_ref, "HETATM")
        except Exception:
            # reference ligand not found or not readable -> skip the folder
            p_log.write("[ERROR ligand ref] -> " + str(p_lig_ref) + "\n")
            continue

        # control
        d_control["pr ref"] = d_control["pr ref"] + 1

        # output by reference
        p_dir_result_ref = pathManage.result(name_lig + "/" + ref_folder)
        d_filout_superimposed = {}
        d_filout_superimposed["global"] = open(
            p_dir_result_ref + "all_ligand_aligned.pdb", "w")
        d_filout_superimposed["sheap"] = open(
            p_dir_result_ref + "all_ligand_aligned_" + str(thresold_shaep) +
            ".pdb", "w")

        # write lig ref -> connect matrix correct in all reference and all sheap
        writePDBfile.coordinateSection(d_filout_superimposed["global"],
                                       lig_ref_parsed, "HETATM",
                                       connect_matrix=1)
        writePDBfile.coordinateSection(d_filout_superimposed["sheap"],
                                       lig_ref_parsed, "HETATM",
                                       connect_matrix=1)

        # inspect folder dataset
        for pdbfile in listdir(p_dir_dataset + ref_folder + "/"):
            # no ligand file
            if len(pdbfile.split("_")) == 1:
                continue
            pdbfile = pdbfile[:-4]  # remove extention

            # only files named <ligand 3 char>_<PDB 4 char> from another PDB
            l_elem_name = pdbfile.split("_")
            if len(l_elem_name) < 2:
                continue
            lig_query = l_elem_name[0]
            PDB_query = l_elem_name[1]
            if (len(lig_query) != 3 or len(PDB_query) != 4 or
                    PDB_query == ref_folder):
                continue

            p_lig = p_dir_dataset + ref_folder + "/" + pdbfile + ".pdb"
            if p_lig_ref == p_lig:
                continue

            # pass case where ligand replace same ligand -> does not need run
            if lig_query == name_lig:
                p_log.write("[REMOVE] -> same ligand substituate\n")
                continue

            name_file_lig = p_lig.split("/")[-1]

            # parsed ligand query
            lig_parsed = parsePDB.loadCoordSectionPDB(p_lig, "HETATM")

            # find matrix of rotation
            p_matrix = pathManage.findMatrix(p_lig_ref, p_lig, name_lig)
            # control file matrix exist
            if not path.exists(p_matrix):
                p_log.write("[ERROR] -> Matrix transloc " + p_lig_ref + " " +
                            p_lig + " " + name_lig + "\n")
                continue

            # control
            d_control["lig query"] = d_control["lig query"] + 1

            # find the path of complex used -> ligand file name without
            # its 4 first characters
            p_complex = p_dir_dataset + ref_folder + "/" + name_file_lig[4:]

            # ligand rotated -> change the referentiel
            superposeStructure.applyMatrixLigand(lig_parsed, p_matrix)

            # use substruct
            l_p_substruct_ref = pathManage.findSubstructRef(
                pathManage.dataset(name_lig) + ref_folder + "/", name_lig)
            for p_substruct_ref in l_p_substruct_ref:
                # ribose or phosphate
                struct_type = p_substruct_ref.split("_")[-2]
                substruct_parsed = parsePDB.loadCoordSectionPDB(
                    p_substruct_ref, "HETATM")

                l_atom_substituate = neighborSearch.searchNeighborAtom(
                    substruct_parsed, lig_parsed, struct_type, p_log,
                    thresold_superimposed_ribose=thresold_superimposed_ribose,
                    thresold_superimposed_pi=thresold_superimposed_pi)

                # control find
                if len(l_atom_substituate) == 0:
                    d_control["subref empty"][struct_type] = d_control[
                        "subref empty"].get(struct_type, 0) + 1
                    continue
                d_control["subref"][struct_type] = d_control["subref"].get(
                    struct_type, 0) + 1

                # write PDB file, convert smile
                p_substituate_pdb = (p_dir_result_ref + "substituent_" +
                                     lig_query + "_" + PDB_query + "_" +
                                     struct_type + ".pdb")
                writePDBfile.coordinateSection(p_substituate_pdb,
                                               l_atom_substituate,
                                               recorder="HETATM", header=0,
                                               connect_matrix=1)

                # sheap reference on part of ligand
                p_sheap = runOtherSoft.runShaep(
                    p_substruct_ref, p_substituate_pdb,
                    p_substituate_pdb[0:-4] + ".hit", clean=0)
                val_sheap = parseShaep.parseOutputShaep(p_sheap)
                if val_sheap == {}:
                    p_log.write("[ERROR] -> ShaEP " + p_substituate_pdb +
                                " " + p_substruct_ref + "\n")
                    d_control["out sheap"][struct_type] = d_control[
                        "out sheap"].get(struct_type, 0) + 1
                    continue

                best_similarity = str(val_sheap["best_similarity"])

                # one score table by type of substructure, open at first use
                if not struct_type in d_filout_sheap.keys():
                    p_filout_type = (p_dir_result + "shaep_global_" +
                                     struct_type + ".txt")
                    d_filout_sheap[struct_type] = open(p_filout_type, "w")
                    d_filout_sheap[struct_type].write(header_sheap)
                    d_filout_sheap["list"].append(p_filout_type)

                # write value in ShaEP control
                line_sheap = (ref_folder + "_" + str(PDB_query) + "_" +
                              struct_type + "_" + str(lig_query) + "\t" +
                              best_similarity + "\t" +
                              str(val_sheap["shape_similarity"]) + "\t" +
                              str(val_sheap["ESP_similarity"]) + "\n")
                d_filout_sheap[struct_type].write(line_sheap)
                d_filout_sheap["global"].write(line_sheap)

                # rename file substituent with shaEP value
                p_substituate_scored = (p_substituate_pdb[:-4] + "_" +
                                        best_similarity + ".pdb")
                rename(p_substituate_pdb, p_substituate_scored)
                p_substituate_pdb = p_substituate_scored

                # write all substruct in global file
                header_lig = str(name_file_lig) + "_" + best_similarity
                writePDBfile.coordinateSection(
                    d_filout_superimposed["global"], lig_parsed,
                    recorder="HETATM", header=header_lig, connect_matrix=1)

                # control sheap thresold
                if not float(val_sheap["best_similarity"]) >= thresold_shaep:
                    d_control["out sheap"][struct_type] = d_control[
                        "out sheap"].get(struct_type, 0) + 1
                    continue

                # write subligand superimposed selected in global files
                writePDBfile.coordinateSection(
                    d_filout_superimposed["sheap"], lig_parsed,
                    recorder="HETATM", header=header_lig, connect_matrix=1)

                ############
                # write BS #
                ############
                # not only protein superimposed -> also ion and water
                l_atom_complex = parsePDB.loadCoordSectionPDB(p_complex)
                superposeStructure.applyMatrixProt(l_atom_complex, p_matrix)
                p_file_cx = p_dir_result_ref + "CX_" + name_file_lig
                # write CX
                writePDBfile.coordinateSection(p_file_cx, l_atom_complex,
                                               recorder="ATOM",
                                               header=name_file_lig,
                                               connect_matrix=0)

                # search atom in BS
                l_atom_binding_site = []
                for atom_complex in l_atom_complex:
                    for atom_substruct in lig_parsed:
                        if parsePDB.distanceTwoatoms(
                                atom_substruct, atom_complex) <= thresold_BS:
                            if not atom_complex in l_atom_binding_site:
                                l_atom_binding_site.append(
                                    deepcopy(atom_complex))
                            # one close ligand atom is enough to select
                            # the atom, no need to test the other ones
                            break

                # retrieve complet residue
                l_atom_BS_res = parsePDB.getResidues(l_atom_binding_site,
                                                     l_atom_complex)

                # write binding site
                p_binding = p_dir_result_ref + "BS_" + name_file_lig
                writePDBfile.coordinateSection(p_binding, l_atom_BS_res,
                                               "ATOM", p_binding,
                                               connect_matrix=0)

                # smile code substituate analysis
                # Step smile -> not conversion if shaep not validate
                smile_find = runOtherSoft.babelConvertPDBtoSMILE(
                    p_substituate_pdb)
                d_smile_type = d_smile.setdefault(struct_type, {})
                if not smile_find in d_smile_type:
                    d_smile_type[smile_find] = {"count": 0, "PDB": [],
                                                "ligand": [], "ref": []}
                d_smile_type[smile_find]["count"] = d_smile_type[smile_find][
                    "count"] + 1
                d_smile_type[smile_find]["PDB"].append(PDB_query)
                d_smile_type[smile_find]["ligand"].append(lig_query)
                d_smile_type[smile_find]["ref"].append(ref_folder)

        tool.closeDicoFile(d_filout_superimposed)

    # sheap control
    tool.closeDicoFile(d_filout_sheap)
    for p_file_sheap in d_filout_sheap["list"]:
        runOtherSoft.RhistogramMultiple(p_file_sheap)

    # write list of smile
    for substruct in d_smile.keys():
        p_list_smile = (pathManage.result(name_lig) + "list_" + substruct +
                        "_" + str(thresold_shaep) + "_smile.txt")
        filout_smile = open(p_list_smile, "w")
        for smile_code in d_smile[substruct].keys():
            d_found = d_smile[substruct][smile_code]
            filout_smile.write(
                str(smile_code) + "\t" + str(d_found["count"]) + "\t" +
                " ".join(d_found["PDB"]) + "\t" + " ".join(d_found["ref"]) +
                "\t" + " ".join(d_found["ligand"]) + "\n")
        filout_smile.close()
    p_log.close()

    # control
    filout_control.write("NB ref: " + str(d_control["pr ref"]) + "\n")
    filout_control.write("Ligand query: " + str(d_control["lig query"]) +
                         "\n")
    for k in d_control["subref"].keys():
        filout_control.write("LSR " + str(k) + ": " +
                             str(d_control["subref"][k]) + "\n")
    for k in d_control["subref empty"].keys():
        filout_control.write("NB LSR empty " + str(k) + ": " +
                             str(d_control["subref empty"][k]) + "\n")
    for k in d_control["out sheap"].keys():
        filout_control.write("LSR out by sheap " + str(k) + ": " +
                             str(d_control["out sheap"][k]) + "\n")

    filout_control.write("**********************\n\n")
    for k in d_control["subref"].keys():
        # a type for which every substituent passed ShaEP has no entry in
        # "out sheap" -> count it as 0 instead of failing with a KeyError
        nb_keep = d_control["subref"][k] - d_control["out sheap"].get(k, 0)
        filout_control.write("LSR keep" + str(k) + ": " + str(nb_keep) +
                             "\n")

    filout_control.close()

    return 1
def Builder(name_database, RX=3.00, RFree=0.25, one_PDB_by_lig=0, debug=1):
    """Build the dataset of ligands with their associated PDB files.

    in : - file "resultLigandInPDB" (result of the ligand filter) found in
           the result folder of name_database
    out : - dataset files written by WriteDataset in the result folder
            <name_database>/<RX>_<RFree>_multiPDB (or _uniquePDB)

    A ligand is removed from the dataset when it cannot be loaded from its
    first PDB, when no substructure of interest is found in it or when no
    PDB passes the quality control.

    :param name_database: name of the database folder in the results
    :param RX: threshold forwarded to checkPDBfile.CheckComplexQuality
    :param RFree: threshold forwarded to checkPDBfile.CheckComplexQuality
    :param one_PDB_by_lig: 0 -> "multiPDB" dataset, other value ->
        "uniquePDB" dataset; also forwarded to the quality control
    :param debug: print the progression when true
    :return: list of dataset files given by pathManage.retriveDataSetFile;
        when the dataset already exists it is returned without rebuilding
    """
    if one_PDB_by_lig == 0:
        suffix_dataset = "_multiPDB"
    else:
        suffix_dataset = "_uniquePDB"
    name_dataset = (name_database + "/" + str(RX) + "_" + str(RFree) +
                    suffix_dataset)

    pr_database = pathManage.result(name_database)
    pr_result = pathManage.result(name_dataset)
    if debug:
        print("== Path result " + pr_result + "==\n")

    # check dataSet exist !!!!!!
    # short cut
    l_file_dataset = pathManage.retriveDataSetFile(pr_result)
    if len(l_file_dataset) != 0:
        return l_file_dataset

    # load structure
    d_lig_PDB = loadFile.LigandInPDB(pr_database + "resultLigandInPDB")

    nb_lig = len(d_lig_PDB.keys())
    print("NB ligand included database: " + str(nb_lig))

    # iterate on a copy of the keys -> ligands can be removed from the
    # dictionary during the loop; i counts the ligands kept so far
    i = 0
    for name_lig in list(d_lig_PDB.keys()):

        #######################################
        # step 1 search chemical substructure #
        #######################################
        PDB_ref = d_lig_PDB[name_lig][0]
        if debug:
            print("%s %s %s %s" % (PDB_ref, name_lig, i, nb_lig))

        # if not possible to load the ligand -> remove lig
        try:
            l_atom_lig_ref = loadFile.ligandInPDBConnectMatrixLigand(
                PDB_ref, name_lig)
        except Exception:
            if debug == 1:
                print("Exit => load ligand-l59")
            del d_lig_PDB[name_lig]
            nb_lig = nb_lig - 1
            continue

        # search substructure interest
        l_interest_sub = searchPDB.interestStructure(l_atom_lig_ref)
        if debug:
            print("Interest substructures in " + str(name_lig) + "-" +
                  str(PDB_ref) + " " + "-".join(l_interest_sub))

        if not l_interest_sub:
            if debug == 1:
                print("Exit => Not substructure-l68")
            del d_lig_PDB[name_lig]
            nb_lig = nb_lig - 1
            continue

        ##############################################################
        # Step 2 Control quality of PDB + ligand hooked + option one #
        ##############################################################
        if debug:
            print("List PDB checked ->  " + str(d_lig_PDB[name_lig]))
        l_PDB = checkPDBfile.CheckComplexQuality(
            d_lig_PDB[name_lig], name_lig, RX, RFree, one_PDB_by_lig)

        # remove the entrance key with the ligand
        if not l_PDB:
            if debug == 1:
                print("Exit => Not No PDB selected-l82")
            del d_lig_PDB[name_lig]
            nb_lig = nb_lig - 1
            continue

        d_lig_PDB[name_lig] = l_PDB
        i = i + 1

    if debug == 1:
        print("Number of ligand selected => " + str(nb_lig))

    # structure and file dataset and control RX + length bond
    WriteDataset(d_lig_PDB, pr_result)

    # read back the files just written; calling Builder again for that
    # never ended when no dataset file was produced
    return pathManage.retriveDataSetFile(pr_result)