示例#1
0
def creeSequenceDeGene(scaff,deb,fin):
    """Extract one gene region from a scaffold and write it as a FASTA file.

    The scaffold sequence is loaded from ``ScaffTfa/<scaff>.tfa`` with
    ``fasta.seqEnVar`` and the region is cut out by ``extraitSeqGene``.
    The result is written with ``fasta.fromSeqToFasta`` to
    ``<scaff>-<deb>-<fin>.tfa`` (relative to the current directory) under
    the header ``"<scaff> <deb> <fin>"``.

    Args:
        scaff: scaffold name; also the base name of the scaffold file.
        deb: start coordinate, as a decimal string or an integer.
        fin: end coordinate, as a decimal string or an integer.
    """
    header = "%s %s %s" % (scaff,deb,fin)
    filename = "%s-%s-%s.tfa" % (scaff,deb,fin)

    ficScaff = "ScaffTfa/%s.tfa" % scaff
    seq = fasta.seqEnVar(ficScaff)
    # int() replaces the deprecated string.atoi() and also accepts
    # coordinates that are already integers.
    seqGen = extraitSeqGene(seq,int(deb),int(fin))
    # If the target name is already taken, write to "<name>-2" instead.
    # NOTE(review): the "-2" name itself is not checked for existence.
    if os.path.isfile(filename):
        filename = "%s-2" % filename
    fasta.fromSeqToFasta(seqGen,header,filename)
def creeSequenceDeGene(scaff,deb,fin,geneN,repGene,repSeq):
    """Extract one gene region from a scaffold into ``repGene``, once.

    The scaffold sequence is loaded from ``<repSeq>/<scaff>.tfa`` and the
    region is cut out by ``extraitSeqGene``.  It is written to
    ``<repGene>/<geneN>_<scaff>-<deb>-<fin>.tfa`` with the header
    ``"<scaff>-<deb>-<fin>"``.  If that file already exists nothing is
    written and "il existe deja" is printed.

    Args:
        scaff: scaffold name; also the base name of the scaffold file.
        deb: start coordinate, as a decimal string or an integer.
        fin: end coordinate, as a decimal string or an integer.
        geneN: gene name used as prefix of the output file name.
        repGene: directory receiving the gene file.
        repSeq: directory holding the scaffold ``.tfa`` files.
    """
    header = "%s-%s-%s" % (scaff,deb,fin)
    filename = "%s/%s_%s-%s-%s.tfa" % (repGene,geneN,scaff,deb,fin)

    ficScaff = "%s/%s.tfa" % (repSeq,scaff)
    seq = fasta.seqEnVar(ficScaff)
    # int() replaces the deprecated string.atoi() and also accepts
    # coordinates that are already integers.
    seqGen = extraitSeqGene(seq,int(deb),int(fin))
    # A single existence test drives both branches, so they can never
    # disagree (the file system used to be queried twice).
    if os.path.isfile(filename):
        print("il existe deja")
    else:
        fasta.fromSeqToFasta(seqGen,header,filename)
示例#3
0
def GC_Cleft():
    """Write the leading part of sequence "Sakl0C" for every strain and run GC.txGC on it.

    For each strain in the hard-coded list, the "Sakl0C" entry is read
    from ``<rep>/<strain>/cons<strain>.fasta``.  Its first 989693 positions
    are written to ``<rep>/<strain>/cleft_<strain>.fasta`` under the header
    ``cleft_<strain>``, the strain name is printed and ``GC.txGC`` is called
    on the same slice (its return value, if any, is ignored).
    """
    liStrains = ['55-86_1','62-1041','CBS3082a','CBS3082b','77-1003','NCYC543','62-196','CBS6545','CBS6546','CBS6547','CBS6626','NRBC1892','CBS10367','CBS10368','CBS4104','68917-2','DBVPG4002','67-588','NRBC1811','NRBC10572','NRBC10955','NRBC101999','CBS10369','CBS5828','dd281a','CBS2861','CBS4568','DBVPG3452','DBVPG3108']

    rep = "/Volumes/BioSan/Users/friedrich/GB-3G/BWA/Nuclear/CleanPE"
    # Number of leading positions of Sakl0C that are kept.
    # NOTE(review): presumably the end of the left arm of chromosome C -- confirm.
    cleftLen = 989693
    for strain in liStrains:
        repStrain = "%s/%s" % (rep,strain)
        seqStrain = "%s/cons%s.fasta" % (repStrain,strain)
        seqC = fasta.multiSeqEnVar(seqStrain,"Sakl0C")
        # Slice once and reuse it for both the FASTA output and the GC call.
        cleft = seqC[:cleftLen]
        header = "cleft_%s" % strain
        fout = "%s/%s.fasta" % (repStrain,header)
        fasta.fromSeqToFasta(cleft,header,fout)
        print(strain)
        GC.txGC(cleft)
示例#4
0
def creeSequenceDeGene(scaff, deb, fin, geneN, repGene, repSeq, long):
    """Extract one gene region, mask ambiguity codes and write it as FASTA.

    The scaffold sequence is loaded from ``<repSeq>/<scaff>.tfa``, the
    region is cut out by ``extraitSeqGene`` and upper-cased.  Unless
    ``<repGene>/<geneN>_<scaff>-<deb>-<fin>.tfa`` already exists (in which
    case "il existe deja" is printed and nothing is written), the letters
    X, S, W, R, Y, K and M are replaced by N and the sequence is written
    to that file with the scaffold name as header.

    Args:
        scaff: scaffold name; also the FASTA header and scaffold file name.
        deb: start coordinate, as a decimal string or an integer.
        fin: end coordinate, as a decimal string or an integer.
        geneN: gene name used as prefix of the output file name.
        repGene: directory receiving the gene file.
        repSeq: directory holding the scaffold ``.tfa`` files.
        long: reference length; currently unused because the length filter
            below is disabled.
    """
    header = "%s" % (scaff)
    filename = "%s/%s_%s-%s-%s.tfa" % (repGene, geneN, scaff, deb, fin)

    ficScaff = "%s/%s.tfa" % (repSeq, scaff)
    seq = fasta.seqEnVar(ficScaff)
    # int() replaces the deprecated string.atoi() and also accepts
    # coordinates that are already integers.
    seqGen = extraitSeqGene(seq, int(deb), int(fin)).upper()

    # Disabled filter: the sequence was meant to be created only if it is
    # long enough compared with the reference and its length is a multiple
    # of 3.  The original note says 98% while the disabled code used 0.85:
    #   (len(seqGen) > long * 0.85 or len(seqGen) > 500) and len(seqGen) % 3 == 0
    if os.path.isfile(filename):
        print("il existe deja")
    else:
        # Collapse the listed ambiguity letters to N, one letter at a time.
        for code in "XSWRYKM":
            seqGen = seqGen.replace(code, "N")
        fasta.fromSeqToFasta(seqGen, header, filename)
def identifyScaffChimereBU(species,infile,rep = "."):
    """Experimental variant of the chimeric-scaffold detection.

    Author's notes (translated): this was an attempt to improve the
    chimera detection routine, with unconvincing results.  When several
    proteins match the same position on a scaffold the idea was to keep
    only the best-scoring one, but scores are often identical, or higher
    for genes coming from different chromosomes, so the results are if
    anything worse.  Post-processing the results of the first routine may
    be the better option.

    Every scaffold of ``infile`` longer than 1000 is written to
    ``<scaff>.fasta`` and passed to ``alignement.run_blastxFmt`` against
    the protein database of ``species``.  For each query start position
    the best-scoring accepted hit is kept; if the kept proteins carry more
    than one chromosome prefix, the scaffold is printed, logged to
    ``infile`` with ".fasta" replaced by "_chimere.log", and
    ``alignement.run_blastx`` is run on it.  Otherwise the blast output
    is deleted.  The per-scaffold FASTA file is always deleted.

    Args:
        species: "Sace", "Sakl" or "Lakl" (case-insensitive).  Any other
            value prints a message and calls ``sys.exit()``.
        infile: multi-FASTA file of scaffolds, relative to ``rep``.
        rep: working directory; the process changes into it.
    """
    os.chdir(rep)
    allScaff = fasta.fromFastaToDico(infile)
    outfile = infile.replace(".fasta","_chimere.log")
    of = open(outfile,"w")
    # try/finally closes the log even when sys.exit() or an error
    # interrupts the run.
    try:
        print(species)
        spec = species.lower()
        # prefixLen: number of leading characters of the protein name that
        # are compared to tell chromosomes apart.
        if spec == "sace":
            db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sace/s288c.pep"
            prefixLen = 2
        elif spec == "lakl" or spec == "sakl":
            db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sakl/saklRef.pep"
            prefixLen = 6
        else:
            print("Species name not understood. Should be Sace, Sakl or Lakl")
            sys.exit()

        for scaff in allScaff.keys():
            seq = allScaff[scaff]
            if len(seq) <= 1000:
                continue
            fileN = "%s.fasta" % scaff
            fasta.fromSeqToFasta(seq,scaff,fileN)
            fileO = fileN.replace("fasta","blastx")
            alignement.run_blastxFmt(fileN,fileO,db,1000)
            if os.path.isfile(fileO):
                blastOut = open(fileO,"r")
                try:
                    lines = blastOut.read().split("\n")
                finally:
                    blastOut.close()

                # bestHit: query start -> (protein name, score).
                # The score is stored per position.  It used to be stored
                # per protein name, so a later hit of the same protein at
                # another position overwrote the score used for comparison.
                # NOTE(review): the columns are presumably those of BLAST
                # tabular output (2: % identity, 3: alignment length,
                # 6: query start, 11: bit score) -- confirm against
                # alignement.run_blastxFmt.
                bestHit = {}
                for line in lines:
                    if line == "":
                        continue
                    el = line.split("\t")
                    if float(el[2]) > 95 and float(el[3]) > 70:
                        start = int(el[6])
                        score = float(el[11])
                        # On a tie the first hit seen is kept.
                        if start not in bestHit or bestHit[start][1] < score:
                            bestHit[start] = (el[1],score)

                dicProt = {}
                for start in bestHit:
                    dicProt[start] = bestHit[start][0]

                lChr = []
                for prot in dicProt.values():
                    chrom = prot[0:prefixLen]
                    if chrom not in lChr:
                        lChr.append(chrom)
                # more than one chromosome in the list means the scaffold
                # is chimeric
                if len(lChr) > 1:
                    lProt = sorted(dicProt.items())
                    print("%s - %s bp" % (scaff,len(seq)))
                    of.write("%s - %s bp\n" % (scaff,len(seq)))
                    print(lProt)
                    of.write("%s\n" % lProt)
                    fileO2 = fileO.replace("blastx","blastx-std")
                    alignement.run_blastx(fileN,fileO2,db,1000)
                else:
                    os.remove(fileO)
            os.remove(fileN)
    finally:
        of.close()
def identifyScaffChimere(species,infile,rep = "."):
    """Report scaffolds whose blastx hits point to more than one chromosome.

    Changes into ``rep``, loads every scaffold of ``infile`` and, for each
    one longer than 1000, writes it to ``<scaff>.fasta`` and runs
    ``alignement.run_blastxFmt`` against the protein database selected by
    ``species`` ("sace", "sakl" or "lakl", case-insensitive; anything else
    prints a message and calls ``sys.exit()``).

    A scaffold that is finally considered chimeric is printed and written,
    together with its sorted (protein, position) list, to the log file
    (``infile`` with ".scafSeq" replaced by "_chimere.log"), and
    ``alignement.run_blastx`` is run on it; its ``.fasta`` and ``.blastx``
    files are kept.  For every other scaffold that produced a blast output
    file both files are removed.

    NOTE(review): when no blast output file is produced, the per-scaffold
    ``.fasta`` file is left on disk (all removals sit inside the
    ``os.path.isfile(fileO)`` branch).
    """

    os.chdir(rep)
    allScaff = fasta.fromFastaToDico(infile)
        
    #outfile = infile.replace(".fasta","_chimere.log")
    
    ## RUN 25 Strains ##
    outfile = infile.replace(".scafSeq","_chimere.log")
    ##
    
    of = open(outfile,"w")
    print species
    if species.lower() == "sace":
        db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sace/s288c.pep"
    elif species.lower() == "lakl" or species.lower() == "sakl":
        db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sakl/saklRef.pep"
    else:
        print "Species name not understood. Should be Sace, Sakl or Lakl"
        # NOTE(review): exits without closing `of`, and with exit status 0.
        sys.exit()
        
    for scaff in allScaff.keys():
        if len(allScaff[scaff]) > 1000:
            fileN = "%s.fasta" % scaff
            fasta.fromSeqToFasta(allScaff[scaff],scaff,fileN)
            fileO = fileN.replace("fasta","blastx")
            alignement.run_blastxFmt(fileN,fileO,db,1000)
            if os.path.isfile(fileO):
                lines = open(fileO,"r").read().split("\n")
                # chr: chromosome prefix of the first accepted hit (the reference)
                chr = ""
                chimere = 0
                # dicProt: protein name -> position (column 6) of its last accepted hit
                dicProt = {}
                # lP1 / lP2: positions collected for the reference chromosome and
                # for the other chromosomes (see the review note below)
                lP1 = []
                lP2 = []
                for line in lines:
                    if line != "":
                        el = line.split("\t")
                        # NOTE(review): columns 2 and 3 are presumably percent identity and
                        # alignment length of BLAST tabular output -- confirm against
                        # alignement.run_blastxFmt.
                        #if string.atof(el[2]) > 90 and string.atof(el[3]) > 50:
                        if string.atof(el[2]) > 95 and string.atof(el[3]) > 70:
                            dicProt[el[1]] = string.atoi(el[6])
                            if chr == "":
                                # chromosome prefix: 2 characters for "sace", 6 otherwise
                                if species.lower() == "sace":
                                    chr = el[1][0:2]
                                else:
                                    chr = el[1][0:6]
                                if string.atoi(el[6]) not in lP1:
                                    lP1.append(string.atoi(el[6]))
                            # hit whose 2- and 6-character prefixes both differ from the reference
                            elif el[1][0:2] != chr and el[1][0:6] != chr:
                                chimere = 1
                                if string.atoi(el[6]) not in lP2:
                                    lP2.append(string.atoi(el[6]))
                                # NOTE(review): this else pairs with the `not in lP2` test above,
                                # so it runs for a foreign-chromosome position already in lP2.
                                # Later hits on the reference chromosome are never added to lP1.
                                # Verify this is the intended pairing.
                                else:
                                    if string.atoi(el[6]) not in lP1:
                                        lP1.append(string.atoi(el[6]))
                           
                              
                if chimere == 1:
                    # ignore scaffolds whose apparent chimerism is due to the presence of a paralog
                    chimere = 0
                    if len(lP1) < len(lP2):
                        for p1 in lP1:
                            if p1 not in lP2:
                                chimere = 1
                        # NOTE(review): this else is aligned with the `for`, so it is a
                        # for/else: it runs after the loop above finishes (there is no break).
                        # When len(lP1) >= len(lP2) neither loop runs and chimere stays 0.
                        # Confirm whether an if/else was intended.
                        else:
                            for p2 in lP2:
                                if p2 not in lP1:
                                    chimere = 1

                if chimere == 1:
                    lProt = dicProt.items()
                    # cmpval is defined outside this block; presumably it orders the
                    # (protein, position) pairs by position -- TODO confirm.
                    lProt.sort(cmpval)
                    pos = 0
                    lchrom = []
                    # toR tells whether the last value of the list may be removed or not
                    #print "%s - %s bp" % (scaff,len(allScaff[scaff]))
                    #print lProt
                    toR = 0
                    for prot in lProt:
                        #print prot
                        if prot[1] > pos:
                            if species.lower() == "sace":
                                if prot[0][0:2] not in lchrom:
                                    lchrom.append(prot[0][0:2])
                                    toR = 1
                            else:
                                if prot[0][0:6] not in lchrom:
                                    lchrom.append(prot[0][0:6])
                                    toR = 1
                            pos = prot[1]
                            #print lchrom
                        # the data being sorted,
                        # if the value is not greater, it is equal
                        else:
                            if toR == 1:
                                # remove the last chromosome put in the list
                                # the list then holds no information about paralogs located at the same position
                                lchrom.pop()
                                # removal must be disabled for the next turn, in case more than 2 paralogs are identified at the same position
                                toR = 0
                            
                    # NOTE(review): the result of this expression is discarded, so it has no effect.
                    list(set(lchrom))
                    
                    # firstChrom / lastChrom are only used by the commented-out condition below
                    if species.lower() == "sace":
                        firstChrom = lProt[0][0][0:2]
                        lastChrom = lProt[-1][0][0:2]
                    else:
                        firstChrom = lProt[0][0][0:6]
                        lastChrom = lProt[-1][0][0:6]

                    #if len(lchrom) > 1 and firstChrom != lastChrom:                        
                    if len(lchrom) > 1:
                        print "%s - %s bp" % (scaff,len(allScaff[scaff]))
                        of.write("%s - %s bp\n" % (scaff,len(allScaff[scaff])))
                    
                        print lProt
                        of.write("%s\n" % lProt)
                        fileO2 = fileO.replace("blastx","blastx-std")
                        alignement.run_blastx(fileN,fileO2,db,1000)
                    else:
                        os.remove(fileO)
                        os.remove(fileN)
                else:
                    os.remove(fileO)
                    os.remove(fileN)
    of.close()
示例#7
0
def decomposeChromos():
    """Split each strain's consensus FASTA into one file per sequence Sakl0A..Sakl0H.

    For every strain in the hard-coded list, the eight entries "Sakl0A" to
    "Sakl0H" are read from ``<rep>/<strain>/cons<strain>.fasta`` and each is
    written to ``<rep>/<strain>/<entry>_<strain>.fasta`` with the header
    ``<entry>_<strain>``.
    """
    liStrains = ['55-86_1','62-1041','CBS3082a','CBS3082b','77-1003','NCYC543','62-196','CBS6545','CBS6546','CBS6547','CBS6626','NRBC1892','CBS10367','CBS10368','CBS4104','68917-2','DBVPG4002','67-588','NRBC1811','NRBC10572','NRBC10955','NRBC101999','CBS10369','CBS5828','dd281a','CBS2861','CBS4568','DBVPG3452','DBVPG3108']
    rep = "/Volumes/BioSan/Users/friedrich/GB-3G/BWA/Nuclear/CleanPE"
    chromos = ["Sakl0A","Sakl0B","Sakl0C","Sakl0D","Sakl0E","Sakl0F","Sakl0G","Sakl0H"]
    for strain in liStrains:
        repStrain = "%s/%s" % (rep,strain)
        seqStrain = "%s/cons%s.fasta" % (repStrain,strain)
        # Read all eight sequences before writing any file, so a failed
        # read leaves no partial output for the strain (same order of
        # operations as the unrolled version).
        seqs = [fasta.multiSeqEnVar(seqStrain,chromo) for chromo in chromos]
        for chromo,seq in zip(chromos,seqs):
            header = "%s_%s" % (chromo,strain)
            fout = "%s/%s.fasta" % (repStrain,header)
            fasta.fromSeqToFasta(seq,header,fout)