Exemplo n.º 1
0
def countingSubstituent (name_final, debug = 1):
    """
    Scan the "final_<name_final>" result folder and write counting reports.

    The folder is walked as <type_subref>/<sub>/<PDBref>/{LSR,LGD,BS}/; the
    "cycle" entry of a type folder is expanded one level deeper. Entries that
    cannot be listed as folders are skipped at the two first levels.
    The reports are written in the "counting/" sub-folder:
    - one file per ligand substituent (occurrences by sub folder), each one
      given to runOtherSoft.piePlot
    - "count_ligand", given to runOtherSoft.barplot
    - "CountByLigandRef"
    - "count_pr"

    args: -> name_final: suffix of the result folder ("final_" + name_final)
          -> debug: when true, print the folders and files visited
    return: None, results are written on disk

    NOTE(review): differences with the previous version of this function
    - "count_ligand" keeps only ligands with a count > 1. The previous test
      compared the whole dictionary with 1, which is always true in
      Python 2, so every ligand was written.
    - the families are written for every ligand in "count_pr"; the loop was
      outside of the ligand loop and only the last ligand was considered.
    - BS queries are ignored when the BS folder has no BS_REF file instead
      of being attached to a ligand found in a previous folder.
    - the "*****" traces are printed only when debug is true.
    """

    pr_final_folder = pathManage.result("final_" + name_final)

    d_count = {}
    d_lig = {}
    d_by_ref = {}
    d_count_pr = {}

    if debug : print ("1 %s" % pr_final_folder)
    for pr_type_subref in listdir(pr_final_folder) :
        pr_type = pr_final_folder + pr_type_subref + "/"
        # case where pr type is a file not a folder
        try : l_pr_sub = listdir(pr_type)
        except OSError : continue
        if debug : print ("2 %s" % pr_type)

        # case cycle -> go down one more directory
        if "cycle" in l_pr_sub :
            l_pr_sub.remove ("cycle")
            for second_sub in listdir (pr_type + "cycle/") :
                l_pr_sub.append ("cycle/" + second_sub)

        for pr_sub in l_pr_sub :
            # case where the substituent entry is a file not a folder
            try : l_pr_PDBref = listdir(pr_type + pr_sub + "/")
            except OSError : continue
            if debug : print ("3 %s %s" % (pr_final_folder + pr_type_subref, pr_sub))

            for pr_PDBref in l_pr_PDBref :
                # folder name parsed as <family>-<group>_<...>
                family_ref = pr_PDBref.split ("-")[0]
                group_ref = pr_PDBref.split ("_")[0].split ("-")[-1]
                pr_PDB = pr_type + pr_sub + "/" + pr_PDBref
                pr_LGD = pr_PDB + "/LGD/"
                pr_LSR = pr_PDB + "/LSR/"
                pr_BS = pr_PDB + "/BS/"
                if debug :
                    print ("4 %s" % pr_LGD)
                    print ("4 %s" % pr_BS)
                    print ("4 %s" % pr_LSR)

                _countFolderLSR (pr_LSR, pr_sub, pr_type_subref, d_count, d_by_ref, debug)
                _countFolderLGD (pr_LGD, family_ref, group_ref, d_lig)
                _countFolderBS (pr_BS, d_count_pr, debug)


    # write and plot #
    ##################
    pr_result = pathManage.generatePath(pr_final_folder + "counting/")
    for ligand_sub in d_count.keys () :
        p_filout = pr_result + ligand_sub
        l_sub = list (d_count[ligand_sub].keys ())
        with open (p_filout, "w") as filout :
            filout.write ("\t".join(l_sub) + "\n")
            filout.write ("\t".join([str(d_count[ligand_sub][x]) for x in l_sub]) + "\n")
        runOtherSoft.piePlot(p_filout)

    with open (pr_result + "count_ligand", "w") as filout_lig :
        filout_lig.write ("Ligand ID\tNumber of occurences in the dataset\tNumber of different clusters\tList of clusters\tList of protein families\n")
        for lig in d_lig.keys () :
            # test on the counter, not on the dictionary itself
            if d_lig[lig]["count"] > 1 :
                filout_lig.write (str (lig) + "\t" + str (d_lig[lig]["count"]) + "\t" + str(len (set(d_lig[lig]["group"])))  + "\t" + " ".join (d_lig[lig]["group"]) + "\t" + " ".join (d_lig[lig]["family"]) + "\n")

    with open (pr_result + "CountByLigandRef", "w") as filout_LSR_lig :
        for lig_ref in d_by_ref.keys () :
            filout_LSR_lig.write ("====" + str (lig_ref) + "====\n")
            for sub_ref in d_by_ref[lig_ref].keys () :
                filout_LSR_lig.write (str (sub_ref) + ": " + str (d_by_ref[lig_ref][sub_ref]) + "\n")

    with open (pr_result + "count_pr", "w") as filout_pr_count :
        for lig in d_count_pr.keys () :
            d_pr = d_count_pr[lig]
            filout_pr_count.write ("====" + str (lig) + "====\n")
            filout_pr_count.write ("nb ref pr: " + str (len (d_pr["pr ref"])) + "\n")
            filout_pr_count.write ("nb querie pr: " + str (len (d_pr["pr queries"])) + "\n")
            filout_pr_count.write ("nb ligand queries: " + str (len (d_pr["lig queries"])) + "\n")

            # every other key of the ligand entry is a family counter
            for family in d_pr.keys () :
                if not family in ("pr ref", "pr queries", "lig queries") :
                    filout_pr_count.write ("Ref " + str (family) + ": " + str (d_pr[family]) + "\n")

    runOtherSoft.barplot(pr_result + "count_ligand")


def _countFolderLSR (pr_LSR, pr_sub, pr_type_subref, d_count, d_by_ref, debug):
    """
    Count the files of one LSR folder, d_count and d_by_ref are updated.

    - file with "LSR_" and a second field different of "REF": one more
      occurrence in d_count[second field][pr_sub]
    - other file with "LSR" and "REF_": one more occurrence in
      d_by_ref[3 first characters of the third field][type of the subref]
    """

    for file_LSR in listdir (pr_LSR) :
        # -> count by type sub reference
        if search ("LSR_", file_LSR) and file_LSR.split ("_")[1] != "REF" :
            ligand_sub = file_LSR.split ("_")[1]
            if debug : print ("5 %s" % file_LSR)
            d_sub = d_count.setdefault (ligand_sub, {})
            d_sub[pr_sub] = d_sub.get (pr_sub, 0) + 1

        # complete LSR, case LSR reference
        elif search ("LSR", file_LSR) and search ("REF_", file_LSR) :
            lig_ref = file_LSR.split ("_")[2][:3]
            type_ref = pr_type_subref.split ("_")[0]
            d_ref = d_by_ref.setdefault (lig_ref, {})
            d_ref[type_ref] = d_ref.get (type_ref, 0) + 1


def _countFolderLGD (pr_LGD, family_ref, group_ref, d_lig):
    """
    Count the ligands of one LGD folder, d_lig is updated.

    The ligand is the second field of every file name with "LGD", the "REF"
    files are not considered. The family and the group of the reference
    folder are appended once by file.
    """

    for file_LGD in listdir(pr_LGD) :
        if not search ("LGD", file_LGD) :
            continue
        ligand = file_LGD.split ("_")[1]
        if ligand == "REF" :
            continue
        d_ligand = d_lig.setdefault (ligand, {"count": 0, "group": [], "family": []})
        d_ligand["count"] = d_ligand["count"] + 1
        d_ligand["family"].append (str(family_ref))
        d_ligand["group"].append (str(group_ref))


def _countFolderBS (pr_BS, d_count_pr, debug):
    """
    Count the references and the queries of one BS folder, d_count_pr is updated.

    First pass on the "BS_REF" files: the ligand (third field) and the
    protein (fourth field without extension) of reference are stored.
    Second pass on the other files: the protein and the ligand of the query
    are attached to the reference ligand of this folder.
    """

    l_file_BS = listdir(pr_BS)

    # reference ligand of THIS folder, None until a BS_REF file is found
    lig_ref = None
    for file_BS in l_file_BS :
        if not search ("BS_REF", file_BS) :
            continue
        lig_ref = file_BS.split ("_")[2]
        pr_ref = file_BS.split ("_")[3].split (".")[0]
        if debug : print ("%s %s *****" % (lig_ref, pr_ref))
        d_ref = d_count_pr.setdefault (lig_ref, {"pr ref": [], "pr queries": [], "lig queries": []})

        if not pr_ref in d_ref["pr ref"] :
            d_ref["pr ref"].append (pr_ref)

        # best effort, a failure on the family must not stop the counting
        # NOTE(review): if analysis.findFamily returns a list, it cannot be
        # used as a key and the family is never counted -- to confirm
        try :
            family = analysis.findFamily (pr_ref, pathManage.dataset (lig_ref) + "family_PDB.txt")
            d_ref[family] = d_ref.get (family, 0) + 1
        except Exception as err :
            if debug : print ("family not counted for %s: %s" % (pr_ref, err))

    # BS -> query
    if lig_ref is None :
        # no reference in this folder, the queries cannot be attached
        return

    for file_BS in l_file_BS :
        # for not reference BS
        if search ("BS_REF", file_BS) :
            continue
        lig_querie = file_BS.split ("_")[1]
        prot_querie = file_BS.split ("_")[2][0:4]
        if debug : print ("%s %s *******" % (prot_querie, lig_querie))
        d_count_pr[lig_ref]["pr queries"].append (prot_querie)
        d_count_pr[lig_ref]["lig queries"].append (lig_querie)
Exemplo n.º 2
0
def qualityExtraction (l_ligand, name_folder, p_list_ligand, thresold_sheap) :
    """
    Write "quality_extraction.txt" in the "final_<name_folder>" result folder.

    The report has four sections:
    - number of PDB by ligand, from tool.parseLigandPDBList (p_list_ligand)
    - number of folders found in the result folder of each ligand
    - number of queries (files with "CX") by reference: sum, mean +/- std
      and reference with the maximum
    - number of substituent files by substituted atom, and number of them
      with a score larger than thresold_sheap
    The third section also writes, in pathManage.result (), the files
    "reference_family_<ligand>.txt", "count_family_<ligand>.txt" and
    "reference_family_all.txt"; the two count files are given to
    runOtherSoft.piePlot.

    args: -> l_ligand: list of ligand names
          -> name_folder: suffix of the result folder ("final_" + name_folder)
          -> p_list_ligand: path given to tool.parseLigandPDBList
          -> thresold_sheap: a substituent is counted in the "ShaEP" line only
             if the score read in its file name is strictly larger
    return: None, results are written on disk

    NOTE(review): a ligand without any reference folder does not raise
    anymore in the third section, "0" and "NA" are written instead.
    """

    pr_result = pathManage.result("final_" + name_folder)

    with open(pr_result + "quality_extraction.txt", "w") as filout :

        # number PDB by ligand, without filter
        filout.write ("Number PDB by ligand:\n")
        d_dataset = tool.parseLigandPDBList(p_list_ligand)
        for ligand in l_ligand :
            filout.write (str (ligand) + ": " + str (len (d_dataset[ligand])) + "\n")

        # number references
        filout.write ("\n*************\n\nNumber references by ligands:\n")
        for ligand in l_ligand :
            pr_result_ligand = pathManage.result(ligand)
            # NOTE(review): starts at -2, presumably two folders of the
            # ligand result are not references -- to confirm
            nb_ref = -2
            for f in listdir(pr_result_ligand) :
                if path.isdir (pr_result_ligand + "/" + f) :
                    nb_ref = nb_ref + 1
            filout.write (ligand + ": " + str (nb_ref) + "\n")

        # number of query by ref in means and max and min (after blast)
        filout.write ("\n*************\n\nNumber means queries by references:\n")
        _writeQueriesByReference (filout, l_ligand)

        # number subref by ligand
        filout.write ("\n*************\n\nNumber of subref considered:\n")
        _writeSubrefByLigand (filout, l_ligand, thresold_sheap)


def _writeFamilyCount (p_filout, d_family) :
    """Write the families (line 1) and their counts (line 2) in p_filout and plot it."""

    # keys fixed once, names and values are written in the same order
    l_family = list (d_family.keys ())
    with open (p_filout, "w") as filout_count :
        filout_count.write ("\t".join(l_family) + "\n")
        filout_count.write ("\t".join([str(d_family[x]) for x in l_family]) + "\n")
    runOtherSoft.piePlot(p_filout)


def _writeQueriesByReference (filout, l_ligand) :
    """
    Write in filout the query statistics of each ligand and the family files.

    A reference is a folder with a name of 4 characters in the result folder
    of the ligand; a query is a file of this folder with "CX" in its name.
    """

    p_family_all = pathManage.result() + "reference_family_all.txt"
    d_family_all = {}
    for ligand in l_ligand :
        d_nb_query = {}
        d_family = {}
        p_filout_family = pathManage.result() + "reference_family_" + ligand + ".txt"
        p_filout_family_count = pathManage.result () + "count_family_" + ligand + ".txt"
        pr_result_ligand = pathManage.result(ligand)

        with open (p_filout_family, "w") as filout_family :
            for f in listdir(pr_result_ligand) :
                if not (path.isdir (pr_result_ligand + "/" + f) and len (f) == 4) :
                    continue
                # count by family, the last element is used as family name
                family_ref = analysis.findFamily(f, pathManage.findFamilyFile (ligand))
                filout_family.write ("\t".join (family_ref) + "\n")
                family = family_ref[-1]
                d_family[family] = d_family.get (family, 0) + 1
                d_family_all[family] = d_family_all.get (family, 0) + 1

                # count number of queries
                d_nb_query[f] = 0
                for file_query in listdir(pr_result_ligand + "/" + f + "/") :
                    if search ("CX",file_query) :
                        d_nb_query[f] = d_nb_query[f] + 1

        l_nb_query = list (d_nb_query.values ())
        if l_nb_query :
            # first reference with the maximum, in the order of the dictionary
            ref_max = max (d_nb_query, key = d_nb_query.get)
            filout.write (ligand + ": " + str(np.sum(l_nb_query)) + "\n")
            filout.write (ligand + ": " + str(np.mean(l_nb_query)) + "+/-" + str(np.std (l_nb_query)) + "\n")
            filout.write ("MAX " + str (ligand) + ": " + str (d_nb_query[ref_max]) + " " + str (ref_max) +"\n")
        else :
            # no reference: mean and max are not defined
            filout.write (ligand + ": 0\n")
            filout.write (ligand + ": NA\n")
            filout.write ("MAX " + str (ligand) + ": NA\n")

        # family
        _writeFamilyCount (p_filout_family_count, d_family)

    # all family
    _writeFamilyCount (p_family_all, d_family_all)


def _writeSubrefByLigand (filout, l_ligand, thresold_sheap) :
    """
    Write in filout the number of substituent files by substituted atom.

    Only the files with "substituent" and ".pdb" in the reference folders
    (name of 4 characters) are read. The atom is the field before the last
    one of the file name and the score is the last field without its 4 last
    characters; files where the score is not a number are ignored.
    """

    for ligand in l_ligand :
        d_nb_sub = {}
        d_nb_sub_sheap = {}
        pr_result_ligand = pathManage.result(ligand)
        for ref in listdir(pr_result_ligand) :
            if not (path.isdir (pr_result_ligand + "/" + ref) and len (ref) == 4) :
                continue
            for file_query in listdir(pr_result_ligand + "/" + ref + "/") :
                if not (search ("substituent",file_query) and search (".pdb",file_query)) :
                    continue
                l_elem = file_query.split ("_")
                # name without atom and score fields
                if len (l_elem) < 2 :
                    continue
                atom_substituate = l_elem[-2]
                try : value_sheap = float(l_elem[-1][:-4])
                except ValueError : continue

                d_nb_sub[atom_substituate] = d_nb_sub.get (atom_substituate, 0) + 1
                if value_sheap > thresold_sheap :
                    d_nb_sub_sheap[atom_substituate] = d_nb_sub_sheap.get (atom_substituate, 0) + 1

        filout.write ("\n" + ligand + "\n")
        for atom_substituate in d_nb_sub.keys () :
            filout.write (atom_substituate + ": " + str (d_nb_sub[atom_substituate]) + "\n")
            filout.write (atom_substituate + " ShaEP: " + str (d_nb_sub_sheap.get (atom_substituate, 0)) + "\n")