예제 #1
0
def analyseBlastPrelim():
    """Print one summary row per ``*-Fmt6.blastp`` report found in ``repGenome``.

    Only the HSPs of the first subject listed in a report are used: their
    alignment lengths (column 4) are summed and their identities (column 3)
    are averaged, weighted by alignment length.  The e-value reported is the
    one of the first HSP (column 11).  A report without any hit yields a
    "No hits found" row.

    The row is emitted once the whole report has been read, so the output
    does not depend on the report ending with a newline, and exactly one row
    is printed per report.
    """
    repGenome = "/Users/afutil/Documents/Genolevures/Pifa/Annotation/ModelGenes/ProtJigsaw/BlastP/ContrePiso/Eq1n"
    allfile = glob.glob("%s/*-Fmt6.blastp" % repGenome)
    repPiso = "/Users/afutil/Documents/Genolevures/Piso/SeqFinales/FastaProt"
    repPab = "/Users/afutil/Documents/Genolevures/Pifa/Annotation/ModelGenes/ProtJigsaw"
    print("SeqPifa\tChromoPifa\tnumGene\tlgSeqPifa\tnbXSeqPifa\tSeqPist\tlgSeqPist\tlgAli\t%ageId\tBestEvalue")

    for ficBlast in allfile:
        # Query description, derived from the report file name.
        pab = files.get_name(ficBlast).replace("-Fmt6", "")
        chromo = pab[5:10]
        numGen = pab[12:]
        ficPab = "%s/PIFA.%s.tfa" % (repPab, pab[5:])
        seqPab = fasta.seqEnVar(ficPab)
        lgPab = len(seqPab)
        nbXPab = seqPab.count('X')

        with open(ficBlast, "r") as handle:
            lgResu = handle.read().split("\n")

        piso = ""        # first subject of the report (identifier minus its 8-char prefix)
        evalue = ""      # e-value of the first HSP
        lgSeqPiso = 0
        idT = 0.0        # sum of identity * alignment length
        lgAliT = 0       # sum of alignment lengths
        for resu in lgResu:
            if resu == "":
                continue
            elem = resu.split("\t")
            sujet = elem[1][8:]
            if piso == "":
                piso = sujet
                lgSeqPiso = len(fasta.seqEnVar("%s/%s.tfa" % (repPiso, piso)))
                evalue = elem[10]
            elif sujet != piso:
                # HSPs of any other subject are ignored.
                continue
            idT += float(elem[2]) * float(elem[3])
            lgAliT += int(elem[3])

        if piso == "":
            print("%s\t%s\t%s\t%s\t%s\tNo hits found" % (pab, chromo, numGen, lgPab, nbXPab))
            continue

        # Guard against a degenerate report whose alignment lengths are all 0.
        if lgAliT:
            ident = idT / lgAliT
        else:
            ident = 0.0
        print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%.2f\t%s" % (pab, chromo, numGen, lgPab, nbXPab, piso, lgSeqPiso, lgAliT, ident, evalue))
예제 #2
0
def defNouvelIdent(fic):
    """Print identity statistics for every pair of loci listed in ``fic``.

    ``fic`` is a tab-separated file whose 1st and 3rd columns hold the two
    locus names.  For each pair, the needle alignments
    ``FastaGene/<outf>`` and ``FastaProt/<outf>`` are read when they exist
    (``outf`` is built from the lower-cased names of the two gene FASTA
    files).  One row is printed per pair:

        idg, idp, loc1, loc2, nidg, nidp

    where ``id*`` is the value returned by ``alignement.extrait_id_needle``
    and ``nid*`` is ``alignement.extrait_nbid_needle`` expressed as a
    percentage of the shorter of the two sequences.  A missing alignment is
    shown as "-".  Rows with an empty locus only echo the two names.
    """
    with open(fic, "r") as handle:
        lines = handle.read().split("\n")

    for line in lines:
        if line == "":
            continue
        lis = line.split("\t")
        # Rows with fewer than 3 columns are treated as having empty loci
        # instead of raising IndexError.
        if len(lis) < 3:
            lis.extend([""] * (3 - len(lis)))
        loc1 = lis[0]
        loc2 = lis[2]

        if loc1 == "" or loc2 == "":
            print("\t\t%s\t%s" % (loc1, loc2))
            continue

        ficg1 = "FastaGene/%s.tfa" % loc1
        ficg2 = "FastaGene/%s.tfa" % loc2
        ficp1 = "FastaProt/%s.tfa" % loc1
        ficp2 = "FastaProt/%s.tfa" % loc2
        outf = "%s-%s.needle" % (files.get_name(ficg1).lower(), files.get_name(ficg2).lower())

        idg = nidg = "-"
        idp = nidp = "-"

        aliGene = "FastaGene/%s" % outf
        if os.path.isfile(aliGene):
            # Normalise by the shorter sequence of the pair.
            sizeg = min(len(fasta.seqEnVar(ficg1)), len(fasta.seqEnVar(ficg2)))
            idg = float(alignement.extrait_id_needle(aliGene))
            nidg = float(alignement.extrait_nbid_needle(aliGene)) / sizeg * 100

        aliProt = "FastaProt/%s" % outf
        if os.path.isfile(aliProt):
            sizep = min(len(fasta.seqEnVar(ficp1)), len(fasta.seqEnVar(ficp2)))
            idp = float(alignement.extrait_id_needle(aliProt))
            nidp = float(alignement.extrait_nbid_needle(aliProt)) / sizep * 100

        if idp != "-" and idg != "-":
            print("%.1f\t%.1f\t%s\t%s\t%.1f\t%.1f\t" % (idg, idp, loc1, loc2, nidg, nidp))
        else:
            # NOTE(review): when only one alignment exists, its values are
            # printed unrounded (historical output format, kept as is).
            print("%s\t%s\t%s\t%s\t%s\t%s" % (idg, idp, loc1, loc2, nidg, nidp))
예제 #3
0
def defSimilarite(fic):
    """Print similarity statistics for every pair of loci listed in ``fic``.

    ``fic`` is a tab-separated file whose 1st and 3rd columns hold the two
    locus names.  For each pair, the needle alignments
    ``FastaGene/<outf>`` and ``FastaProt/<outf>`` are read when they exist
    (``outf`` is built from the lower-cased names of the two gene FASTA
    files).  One row is printed per pair:

        simg, simp, loc1, loc2, nsimg, nsimp

    where ``sim*`` is the value returned by ``alignement.extrait_sim_needle``
    and ``nsim*`` is ``alignement.extrait_nbsim_needle`` expressed as a
    percentage of the shorter of the two sequences.  A missing alignment is
    shown as "-".  Rows with an empty locus only echo the two names.
    """
    with open(fic, "r") as handle:
        lines = handle.read().split("\n")

    for line in lines:
        if line == "":
            continue
        lis = line.split("\t")
        # Rows with fewer than 3 columns are treated as having empty loci
        # instead of raising IndexError.
        if len(lis) < 3:
            lis.extend([""] * (3 - len(lis)))
        loc1 = lis[0]
        loc2 = lis[2]

        if loc1 == "" or loc2 == "":
            print("\t\t%s\t%s" % (loc1, loc2))
            continue

        ficg1 = "FastaGene/%s.tfa" % loc1
        ficg2 = "FastaGene/%s.tfa" % loc2
        ficp1 = "FastaProt/%s.tfa" % loc1
        ficp2 = "FastaProt/%s.tfa" % loc2
        outf = "%s-%s.needle" % (files.get_name(ficg1).lower(), files.get_name(ficg2).lower())

        simg = nsimg = "-"
        simp = nsimp = "-"

        aliGene = "FastaGene/%s" % outf
        if os.path.isfile(aliGene):
            # Normalise by the shorter sequence of the pair.
            sizeg = min(len(fasta.seqEnVar(ficg1)), len(fasta.seqEnVar(ficg2)))
            simg = float(alignement.extrait_sim_needle(aliGene))
            nsimg = float(alignement.extrait_nbsim_needle(aliGene)) / sizeg * 100

        aliProt = "FastaProt/%s" % outf
        if os.path.isfile(aliProt):
            sizep = min(len(fasta.seqEnVar(ficp1)), len(fasta.seqEnVar(ficp2)))
            simp = float(alignement.extrait_sim_needle(aliProt))
            nsimp = float(alignement.extrait_nbsim_needle(aliProt)) / sizep * 100

        if simp != "-" and simg != "-":
            print("%.1f\t%.1f\t%s\t%s\t%.1f\t%.1f\t" % (simg, simp, loc1, loc2, nsimg, nsimp))
        else:
            # NOTE(review): when only one alignment exists, its values are
            # printed unrounded (historical output format, kept as is).
            print("%s\t%s\t%s\t%s\t%s\t%s" % (simg, simp, loc1, loc2, nsimg, nsimp))
예제 #4
0
def lanceBlastxFromScaff():
    """Run blastx on every scaffold of the reference assembly.

    Scaffolds shorter than 2000 nt are searched whole.  For longer ones the
    first and last 1000 nt are written to ``<name>-deb.tfa`` and
    ``<name>-fin.tfa`` in ``repout`` and each extremity is searched
    separately (the query passed to blastx is the extremity file itself,
    not the full scaffold).
    """
    # Leftover trace message, kept so that stdout is unchanged.
    print("toto")
    allfile = glob.glob("/Users/afutil/Documents/Genolevures/PiFa/AssemblageGenome/AssemblagesFinaux/assemblageRef1.0/*.tfa")
    database = "/Users/afutil/Documents/Genolevures/PiFa/AssemblageGenome/AssemblagesFinaux/SynteniePistPiso/DBblast/pistPisoProtFmt6"
    repout = "/Users/afutil/Documents/Genolevures/PiFa/AssemblageGenome/AssemblagesFinaux/SynteniePistPiso/BlastxFmt6"

    for ficScaff in allfile:
        seq = fasta.seqEnVar(ficScaff)
        fname = files.get_name(ficScaff)
        print("%s\t%s" % (fname, len(seq)))

        if len(seq) < 2000:
            outfile = "%s/%s.blastx" % (repout, fname)
            alignement.run_blastxFmt(ficScaff, outfile, database)
            continue

        # Write both extremities first, then search each of them.
        extremites = []
        for suffixe, sousSeq in (("deb", seq[0:1000]), ("fin", seq[-1000:])):
            ficExt = "%s/%s-%s.tfa" % (repout, fname, suffixe)
            with open(ficExt, "w") as sortie:
                sortie.write(">%s\n%s\n" % (files.get_name(ficExt), sousSeq))
            extremites.append(ficExt)

        for ficExt in extremites:
            outfile = "%s/%s.blastx" % (repout, files.get_name(ficExt))
            alignement.run_blastxFmt(ficExt, outfile, database)
예제 #5
0
def verifLongEtExtrGene():
    """List the proposed genes that look complete but are shorter than 900 nt.

    A sequence is reported when it starts with ATG, ends with one of the
    three stop codons (TAA, TGA, TAG) and is less than 900 characters long.
    Output is a two-column table: gene name, sequence length.
    """
    codonsStop = ["TAA", "TGA", "TAG"]
    chemins = glob.glob("/Users/anfutil/Documents/Genolevures/Pifa/Annotation/PropositionGenesSelontBlastN/Combinaison/*.tfa")
    print("GeneName\tSeqLen")
    for chemin in chemins:
        seq = fasta.seqEnVar(chemin)
        if seq[0:3] != "ATG":
            continue
        if seq[-3:] not in codonsStop:
            continue
        if len(seq) >= 900:
            continue
        print("%s\t%s" % (files.get_name(chemin), len(seq)))
예제 #6
0
def statsAce(inrep):
    """Print name, sequence length and number of N for each file of ``inrep``.

    The name is the one returned by ``files.get_name`` with every "_L13"
    removed; N are counted case-insensitively.  Each row ends with a tab.
    """
    for chemin in glob.glob("%s/*" % inrep):
        nom = files.get_name(chemin).replace("_L13", "")
        seq = fasta.seqEnVar(chemin)
        nbN = seq.lower().count("n")

        print("%s\t%s\t%s\t" % (nom, len(seq), nbN))
예제 #7
0
def creeSequenceDeGene(scaff,deb,fin):
    """Extract a gene from ``ScaffTfa/<scaff>.tfa`` and save it as FASTA.

    ``deb`` and ``fin`` are the coordinates, given as strings, passed (as
    integers) to ``extraitSeqGene``.  The output file is
    ``<scaff>-<deb>-<fin>.tfa``; when that file already exists, "-2" is
    appended to the file name.
    """
    seqScaff = fasta.seqEnVar("ScaffTfa/%s.tfa" % scaff)
    seqGen = extraitSeqGene(seqScaff, int(deb), int(fin))

    entete = "%s %s %s" % (scaff, deb, fin)
    sortie = "%s-%s-%s.tfa" % (scaff, deb, fin)
    if os.path.isfile(sortie):
        sortie = "%s-2" % sortie
    fasta.fromSeqToFasta(seqGen, entete, sortie)
예제 #8
0
def creeSequenceDeGene(scaff,deb,fin,geneN,repGene,repSeq):
    """Extract a gene from ``<repSeq>/<scaff>.tfa`` and save it in ``repGene``.

    ``deb`` and ``fin`` are the coordinates, given as strings, passed (as
    integers) to ``extraitSeqGene``.  The output file is
    ``<repGene>/<geneN>_<scaff>-<deb>-<fin>.tfa``; an existing file is never
    overwritten, a message is printed instead.
    """
    seqScaff = fasta.seqEnVar("%s/%s.tfa" % (repSeq, scaff))
    seqGen = extraitSeqGene(seqScaff, int(deb), int(fin))

    entete = "%s-%s-%s" % (scaff, deb, fin)
    sortie = "%s/%s_%s-%s-%s.tfa" % (repGene, geneN, scaff, deb, fin)
    if os.path.isfile(sortie):
        print("il existe deja")
    else:
        fasta.fromSeqToFasta(seqGen, entete, sortie)
예제 #9
0
def lanceMicroPipe():
    """Run the blast / extraction / alignment pipeline on each ``*fasta`` file.

    For every file matching ``*fasta`` in the current directory:

    1. blast it against ``blastDB`` twice (default report and ``-outfmt 6``);
    2. hand the tabular report to ``lanceRecupSeq``;
    3. build and align (mafft) the multi-sequence and pairwise gene files;
    4. derive the protein sequences with ``creeSeqProtSelonSeqGene`` and
       repeat the pairwise and multi-sequence alignments on them.

    The six working directories are created when missing.  Intermediate
    concatenated files are deleted, as are pairwise files whose alignment
    came out as a single line.
    """
    def supprime(chemin):
        # Delete an intermediate file if it is there.
        if os.path.isfile(chemin):
            os.remove(chemin)

    def alignementVide(chemin):
        # True when the alignment file holds a single line (no "\n" at all).
        with open(chemin, "r") as handle:
            return len(handle.read().split("\n")) == 1

    for rep in ("SeqGene", "SeqProt", "AliGene", "AliProt", "PairwiseGene", "PairwiseProt"):
        if not os.path.isdir(rep):
            os.mkdir(rep)

    blastDB = "/BlastDB/Nucleic/Sace/1002Genomes.nt"

    for gene in glob.glob("*fasta"):
        # Three times the query length; presumably a protein -> nucleotide
        # length conversion -- TODO confirm against lanceRecupSeq.
        lgRef = len(fasta.seqEnVar("%s" % gene)) * 3
        geneN = files.get_name(gene)
        ficOut = "%s.blastn" % (geneN)
        # NOTE(review): "blast" is not a BLAST+ executable name (the second
        # command uses "blastn"), and "-seg" is not a blastn option as far as
        # the BLAST+ manual goes -- both left unchanged pending confirmation.
        cmdBl1 = "blast -query %s -db %s -out %s -seg no -soft_masking false" % (gene, blastDB, ficOut)
        # Fixed option typo: "m-soft_masking" -> "-soft_masking".
        cmdBl2 = "blastn -query %s -db %s -out %s-6 -outfmt 6 -seg no -soft_masking false" % (gene, blastDB, ficOut)
        os.system(cmdBl1)
        os.system(cmdBl2)

        lanceRecupSeq("%s-6" % ficOut, lgRef, geneN)

        # Multiple alignment of the gene sequences.
        concatSeqAAligner(geneN, "SeqGene", "AliGene/%s_all.fasta" % geneN)
        alignement.run_mafft("AliGene/%s_all.fasta" % geneN, "AliGene/%s_all-ali.tfa" % geneN)
        supprime("AliGene/%s_all.fasta" % geneN)

        # Pairwise alignment of the gene sequences.
        concatPaireSeqAAligner(geneN, "SeqGene", "PairwiseGene/%s.fasta" % geneN)
        alignement.run_mafft("PairwiseGene/%s.fasta" % geneN, "PairwiseGene/%s-ali.fasta" % geneN)
        if alignementVide("PairwiseGene/%s-ali.fasta" % geneN):
            supprime("PairwiseGene/%s-ali.fasta" % geneN)
            supprime("PairwiseGene/%s.fasta" % geneN)

        # Same two steps on the protein sequences.
        creeSeqProtSelonSeqGene("%s*" % (geneN), "SeqGene", "SeqProt")
        concatPaireSeqAAligner(geneN, "SeqProt", "PairwiseProt/%s.fasta" % geneN)
        alignement.run_mafft("PairwiseProt/%s.fasta" % geneN, "PairwiseProt/%s-ali.fasta" % geneN)
        if alignementVide("PairwiseProt/%s-ali.fasta" % geneN):
            supprime("PairwiseProt/%s-ali.fasta" % geneN)
            supprime("PairwiseProt/%s.fasta" % geneN)

        concatSeqAAligner(geneN, "SeqProt", "AliProt/%s_all.fasta" % geneN)
        alignement.run_mafft("AliProt/%s_all.fasta" % geneN, "AliProt/%s_all-ali.tfa" % geneN)
        supprime("AliProt/%s_all.fasta" % geneN)
예제 #10
0
def defSimilaritePseudo(fic):
    """Print gene-level similarity for every pair of loci listed in ``fic``.

    ``fic`` is a tab-separated file whose 1st and 3rd columns hold the two
    locus names.  For each pair the needle alignment
    ``Genes+Pseudos/<outf>`` is produced with ``alignement.ali_needle`` when
    it does not exist yet, then read.  One row is printed per pair:

        simg, loc1, loc2, nsimg

    where ``simg`` is the value returned by
    ``alignement.extrait_sim_needle`` and ``nsimg`` is
    ``alignement.extrait_nbsim_needle`` expressed as a percentage of the
    shorter of the two sequences.

    The statistics are computed whether the alignment was just created or
    was already on disk (previously an existing alignment produced "-").
    """
    with open(fic, "r") as handle:
        lines = handle.read().split("\n")

    for line in lines:
        if line == "":
            continue
        lis = line.split("\t")
        # Rows with fewer than 3 columns are treated as having empty loci
        # instead of raising IndexError.
        if len(lis) < 3:
            lis.extend([""] * (3 - len(lis)))
        loc1 = lis[0]
        loc2 = lis[2]

        if loc1 == "" or loc2 == "":
            # NOTE(review): two leading tabs although regular rows have a
            # single column before loc1 -- looks inherited from the
            # gene+protein variant; kept as is.
            print("\t\t%s\t%s" % (loc1, loc2))
            continue

        ficg1 = "Genes+Pseudos/%s.tfa" % loc1
        ficg2 = "Genes+Pseudos/%s.tfa" % loc2
        outf = "%s-%s.needle" % (files.get_name(ficg1).lower(), files.get_name(ficg2).lower())
        ficAli = "Genes+Pseudos/%s" % outf

        # Align only when needed; presumably ali_needle writes ficAli
        # (the extraction below relies on it) -- TODO confirm.
        if not os.path.isfile(ficAli):
            alignement.ali_needle(ficg1, ficg2)

        # Normalise by the shorter sequence of the pair.
        sizeg = min(len(fasta.seqEnVar(ficg1)), len(fasta.seqEnVar(ficg2)))
        simg = float(alignement.extrait_sim_needle(ficAli))
        nsimg = float(alignement.extrait_nbsim_needle(ficAli)) / sizeg * 100

        print("%.1f\t%s\t%s\t%.1f\t" % (simg, loc1, loc2, nsimg))
예제 #11
0
def extraitInfoDispersionSevScaffDansBlast():
    """Scan the tabular tblastn reports and print hits spread over several subjects.

    For each ``*.tblastn`` report under ``rep/tBlastNFmt6-Pist`` the length
    of the matching sequence in ``rep/Proteome-Pist`` is loaded, then the
    report lines are read in order.  A line is printed when it passes the
    e-value, identity and coverage filters, its subject (column 2) differs
    from the previously retained one, and its interval (columns 7-8) is not
    strictly inside the interval accumulated so far.

    NOTE(review): ``nbHit`` is never incremented, so the per-report summary
    at the end of the loop is never printed; it also references ``mmSc``,
    which is not defined in this function.
    """
    print "scaff\tseq length\tnb hits\tdiffScaff" 
    rep = "/Users/anfutil/Documents/Genolevures/Pifa/Annotation"
    #lignes = open(fic,"r").read().split("\n")   
   
    allfile = glob.glob("%s/tBlastNFmt6-Pist/*.tblastn" % rep)
        
    for file in allfile:
        nbHit = 0
        fname = files.get_name(file)
        # Length of the sequence the report was computed for.
        sqLen = len(fasta.seqEnVar("%s/Proteome-Pist/%s.tfa" % (rep,fname)))
        lines = open(file,"r").read().split("\n")   
        expect = 0
        scaff = ""      # subject of the last retained hit
        difSc = 0       # number of retained hits on a new subject
        # Bounds of the interval (columns 7-8) covered by the retained hits.
        min = 1000000000
        max = 0
        for line in lines:
            if line != "":
                lis = line.split("\t")
                expect = lis[10]
                # Keep only e-values written in exponent notation, or exactly "0.0".
                if expect.find("e") != -1 or expect =="0.0":
                    # if the hit covers more than 90% of the starting sequence, the ORF is considered complete
                    if (string.atof(lis[7])-string.atoi(lis[6]))/sqLen>0.9:
                        break
                    # with more than 30% identity over at least 10% of the submitted sequence
                    # the hit is considered potentially interesting
                    if string.atof(lis[2]) > 30 and (string.atof(lis[7])-string.atoi(lis[6]))/sqLen>0.1:
                        
                        if lis[1] == scaff:
                            # Same subject as the previous retained hit: only widen the interval.
                            # NOTE(review): both branches below are identical.
                            if string.atof(lis[7]) > string.atoi(lis[6]):
                                min = minimum(min,string.atoi(lis[6]))
                                max = maximum(max,string.atoi(lis[7]))
                            else:
                                min = minimum(min,string.atoi(lis[6]))
                                max = maximum(max,string.atoi(lis[7]))
                            continue
                        else : 
                            maxN = maximum(string.atoi(lis[6]),string.atoi(lis[7]))
                            minN = minimum(string.atoi(lis[6]),string.atoi(lis[7]))
                            # A hit strictly inside the interval already covered stops the scan of this report.
                            if maxN < max and minN > min:
                                break
                            else:
                                difSc += 1
                                print line
                                max = maximum(max,maxN)
                                min = minimum(min,minN)
                        scaff = lis[1]
        # NOTE(review): unreachable as written (nbHit stays 0); would raise NameError on mmSc otherwise.
        if nbHit > 1:
            print "%s\t%s\t%s hits\t%s\t%s" % (fname,sqLen,nbHit,mmSc,difSc)
예제 #12
0
def identifieRecouvrement100():
    """Report the tblastn reports whose first hit spans the whole query.

    For each ``*.tblastn`` report, the length of the matching sequence in
    ``Proteome-Pist`` is compared with the span of the first hit
    (column 8 - column 7 + 1).  When both are equal, a row is printed with
    the report name, a file name built from columns 2, 9 and 10, and the
    value of column 3.

    Only the first line of each report is read.
    """
    allfile = glob.glob("/Users/anfutil/Documents/Genolevures/Pifa/Annotation/tBlastNFmt6-Pist/*.tblastn")
    for ficBlast in allfile:
        # Length of the sequence the report was computed for.
        name = files.get_name(ficBlast)
        ficSeq = "/Users/anfutil/Documents/Genolevures/Pifa/Annotation/Proteome-Pist/%s.tfa" % name
        sqLen = len(fasta.seqEnVar(ficSeq))

        with open(ficBlast, "r") as handle:
            premiere = handle.readline().rstrip("\n")
        if premiere == "":
            continue

        # Compare with the length of the aligned region.
        el = premiere.split("\t")
        lgAli = int(el[7]) - int(el[6]) + 1
        if sqLen == lgAli:
            print("%s\t%s-%s-%s.tfa\t%s" % (name, el[1], el[8], el[9], el[2]))
예제 #13
0
def recapGeneProt():
    """Print per-sequence and overall statistics for ``./FastaProt/*.tfa``.

    One row per file (name, sequence length, number of "N", trailing tab),
    followed by a final summary: number of sequences, number of sequences
    holding at least one "N", cumulated length and cumulated "N" count.

    NOTE(review): the counted character is an upper-case "N" although the
    files come from a protein directory -- confirm this is intended.
    """
    nbSeq = 0
    nbSeqAvecN = 0
    totalN = 0
    totalLg = 0
    for chemin in glob.glob("./FastaProt/*.tfa"):
        nom = files.get_name(chemin)
        seq = fasta.seqEnVar(chemin)
        lgSeq = len(seq)
        nbN = seq.count('N')

        nbSeq += 1
        totalLg += lgSeq
        if nbN > 0:
            nbSeqAvecN += 1
            totalN += nbN
        print("%s\t%s\t%s\t" % (nom, lgSeq, nbN))

    print("Au final\n%s prot dont %s contiennent au moins 1 \"N\", taille cumulee : %s pb dont %s N" % (nbSeq, nbSeqAvecN, totalLg, totalN))
예제 #14
0
def creeSequenceDeGene(scaff, deb, fin, geneN, repGene, repSeq, long):
    """Extract a gene from ``<repSeq>/<scaff>.tfa`` and save it in ``repGene``.

    ``deb`` and ``fin`` are the coordinates, given as strings, passed (as
    integers) to ``extraitSeqGene``; the extracted sequence is upper-cased.
    The output file is ``<repGene>/<geneN>_<scaff>-<deb>-<fin>.tfa`` with
    ``scaff`` as header.  An existing file is never overwritten, a message
    is printed instead.  Before writing, the letters X, S, W, R, Y, K and M
    are replaced by N.

    ``long`` is currently unused: the length filter it served (sequence
    longer than 85% of ``long`` or than 500, and a multiple of 3) is
    disabled.
    """
    sortie = "%s/%s_%s-%s-%s.tfa" % (repGene, geneN, scaff, deb, fin)
    entete = "%s" % (scaff)

    seqScaff = fasta.seqEnVar("%s/%s.tfa" % (repSeq, scaff))
    seqGen = extraitSeqGene(seqScaff, int(deb), int(fin)).upper()

    if os.path.isfile(sortie):
        print("il existe deja")
        return

    for lettre in "XSWRYKM":
        seqGen = seqGen.replace(lettre, "N")
    fasta.fromSeqToFasta(seqGen, entete, sortie)
예제 #15
0
def _ligneRecapWater(eltType, nom, lgRef, fichiersAli):
    """Build the tab-separated summary row for the water alignments of one element."""
    tId = 0.0     # sum of identity * alignment length
    tLen = 0.0    # sum of alignment lengths
    tSnp = 0      # sum of (alignment length - identical positions)
    detail = ""
    for ali in fichiersAli:
        longAli = alignement.extrait_lg_water(ali)
        ident = alignement.extrait_id_water(ali)
        nbId = alignement.extrait_nbid_water(ali)
        snp = int(longAli) - int(nbId)
        tId += float(ident) * float(longAli)
        tLen += float(longAli)
        tSnp += snp
        detail += "\t%s\t%s" % (snp, longAli)
    meanId = tId / tLen
    meanCov = tLen / lgRef * 100
    # The row is assembled directly, so names containing any particular
    # marker text cannot be altered by a later substitution.
    return "%s\t%s\t%s\t%s\t%.0f\t%.2f\t%.0f%s" % (eltType, nom, lgRef, tSnp, tLen, meanId, meanCov, detail)


def recapIdentiteFYIL():
    """Summarise the water alignments of each FY gene of the chr8 region.

    For every ``*.tfa`` gene file, the alignment files whose name starts
    with the gene name are looked up in the gene and in the protein
    alignment directories.  For each non-empty set one row is printed:

        type, name, reference length, total SNPs, total aligned length,
        length-weighted mean identity, coverage (%), then one
        (SNPs, aligned length) pair per alignment.

    The protein reference length is the gene length divided by 3 (integer
    division).
    """
    workdir = "/Users/afutil/Documents/DataJoseph/Incompatibilites/FY4-IL01/MappedRegions/"
    allfile = glob.glob("%sGeneFY/RegionChr8/*.tfa" % workdir)
    repAliG = "%sGeneIL01/RegionChr8/Alignements/" % workdir
    repAliP = "%sProtIL01/RegionChr8/Alignements/" % workdir
    print("eltType\tgeneN\tgeneLength\tnbSnp\tAliLen\t%ageIdMoyen\tcouverture\tdetail/ali")
    for ficGene in allfile:
        nom = files.get_name(ficGene)
        seqLen = len(fasta.seqEnVar(ficGene))

        allAli = glob.glob("%s%s*" % (repAliG, nom))
        if allAli != []:
            print(_ligneRecapWater("Gene", nom, seqLen, allAli))

        allAliP = glob.glob("%s%s*" % (repAliP, nom))
        if allAliP != []:
            print(_ligneRecapWater("Prot", nom, seqLen // 3, allAliP))
예제 #16
0
def trouveTelom(fic):
    """Load the sequence stored in ``fic`` and print it."""
    print(fasta.seqEnVar(fic))
예제 #17
0
def extraitInfoBlastBIS(fic):
    """Summarise, per query protein, the blastp hits found in nine species.

    ``fic`` is a tab-separated file whose 2nd column holds the query names
    (one per line).  For each name the report ``<rep>/<name>.blastp``
    (``-outfmt 6``) is read until the first e-value that is neither in
    exponent notation nor "0.0".  Each hit is assigned to a species from its
    accession (column 2); for every species the function keeps

    * a "label,query length,%identity,alignment length" string, refreshed
      until the first hit with identity > 30 and (col 8 - col 7) / query
      length > 0.3 has been seen;
    * a 0/1 flag telling whether such a hit exists;
    * the number of such hits.

    Two files are written in the current directory: ``outputAnaBlast``
    (labels row then flags row) and ``outTest.tab`` (flags concatenated and
    counts joined by ";").  Names that are empty or end with "r", and names
    lacking a protein FASTA or a blast report, produce placeholder rows.

    Lines of ``fic`` without a 2nd column are handled like an empty name,
    and the empty string left by a trailing newline is ignored.
    """
    rep = "/Users/afutil/Documents/Genolevures/PiSo/AnalyseGenome/CladeCTG/RechercheSimFmt6"
    repProt = "/Users/afutil/Documents/Genolevures/PiSo/SeqFinales/FastaProt"

    # Species keys, in the column order used by both output files.
    ordre = ("debha", "picst", "picgu", "canlu", "canal", "candu", "cantr", "canpa", "lodel")
    # Accession suffix (characters 17 onwards) -> species key.
    especeParSuffixe = {
        "DEBHA": "debha",
        "CANAL": "canal",
        "CANTT": "cantr", "CANTR": "cantr",
        "PICGU": "picgu",
        "PICST": "picst",
        "CLALS": "canlu", "CLAL4": "canlu",
        "LODEL": "lodel",
        "CANDU": "candu", "CANDC": "candu",
    }

    with open(fic, "r") as handle:
        lignes = handle.read().split("\n")
    # split("\n") leaves an empty last item when the file ends with a newline.
    if lignes and lignes[-1] == "":
        lignes.pop()

    with open("outputAnaBlast", "w") as f, open("outTest.tab", "w") as f2:
        for ligne in lignes:
            el = ligne.split("\t")
            if len(el) > 1:
                fname = el[1].lower()
            else:
                fname = ""

            if fname == "" or fname[-1] == "r":
                print("")
                f.write("\n\n")
                f2.write("%s\n" % fname)
                continue

            ficBlast = "%s/%s.blastp" % (rep, fname)
            print(fname)

            ficProt = "%s/%s.tfa" % (repProt, fname.upper())
            if not (os.path.isfile(ficProt) and os.path.isfile(ficBlast)):
                f.write("\n\n")
                f2.write("%s\n" % fname)
                continue
            sqLen = len(fasta.seqEnVar(ficProt))
            print(sqLen)

            trouve = dict.fromkeys(ordre, 0)     # 0/1 flag per species
            nbHits = dict.fromkeys(ordre, 0)     # number of qualifying hits
            resume = dict.fromkeys(ordre, "")    # label string per species

            with open(ficBlast, "r") as handle:
                lines = handle.read().split("\n")
            for line in lines:
                if line == "":
                    continue
                lis = line.split("\t")
                expect = lis[10]
                print("expect est %s" % expect)
                if expect.find("e") == -1 and expect != "0.0":
                    print("a suis pas rentre")
                    break
                print("a suis rentre")

                acc = lis[1]
                if acc[9:13] == "CPAG":
                    espece = "canpa"
                    etiquette = acc[3:13]
                else:
                    espece = especeParSuffixe.get(acc[17:])
                    etiquette = acc[10:]
                if espece is None:
                    continue

                if trouve[espece] == 0:
                    resume[espece] = "%s,%s,%s,%s" % (etiquette, sqLen, lis[2], lis[3])
                if espece == "debha":
                    # Trace output historically emitted for DEBHA hits only.
                    print("%s - %s " % (float(lis[2]), (float(lis[7]) - int(lis[6])) / sqLen))
                if float(lis[2]) > 30 and (float(lis[7]) - int(lis[6])) / sqLen > 0.3:
                    if espece == "debha":
                        print("+")
                    trouve[espece] = 1
                    nbHits[espece] += 1

            f.write("%s\n" % "\t".join([fname] + [resume[e] for e in ordre]))
            f.write("%s\n" % "\t".join([fname] + [str(trouve[e]) for e in ordre]))
            codes = "%s\t'%s'\t'%s'" % (
                fname,
                "".join([str(trouve[e]) for e in ordre]),
                ";".join([str(nbHits[e]) for e in ordre]),
            )
            f2.write("%s\n" % codes)
            print(codes)
예제 #18
0
def tailleGenesPiso():
    """Print the name and sequence length of every PiSo gene FASTA file."""
    for chemin in glob.glob("/Users/afutil/Documents/Genolevures/PiSo/SeqFinales/FastaGene/*tfa"):
        longueur = len(fasta.seqEnVar(chemin))
        print("%s\t%s" % (files.get_name(chemin), longueur))