Пример #1
0
def lanceBlastxFromScaff(scaff_glob=None, database=None, repout=None):
    """Run blastx on every scaffold FASTA file matched by a glob pattern.

    Scaffolds whose sequence is shorter than 2000 are aligned whole.  For
    longer scaffolds only the two ends are aligned: the first and the last
    1000 positions are written to ``<name>-deb.tfa`` and ``<name>-fin.tfa``
    in *repout*, and each of those files is aligned on its own.

    All parameters are optional; when omitted, the historical hard-coded
    locations are used, so existing zero-argument calls keep reading from and
    writing to the same places.

    Args:
        scaff_glob: glob pattern selecting the scaffold FASTA files.
        database: database path handed to ``alignement.run_blastxFmt``.
        repout: directory receiving the ``.blastx`` results and the extracted
            scaffold-end FASTA files.
    """
    if scaff_glob is None:
        scaff_glob = "/Users/afutil/Documents/Genolevures/PiFa/AssemblageGenome/AssemblagesFinaux/assemblageRef1.0/*.tfa"
    if database is None:
        database = "/Users/afutil/Documents/Genolevures/PiFa/AssemblageGenome/AssemblagesFinaux/SynteniePistPiso/DBblast/pistPisoProtFmt6"
    if repout is None:
        repout = "/Users/afutil/Documents/Genolevures/PiFa/AssemblageGenome/AssemblagesFinaux/SynteniePistPiso/BlastxFmt6"

    print("toto")

    for path in glob.glob(scaff_glob):
        seq = fasta.seqEnVar(path)
        fname = files.get_name(path)
        print("%s\t%s" % (fname, len(seq)))

        if len(seq) < 2000:
            outfile = "%s/%s.blastx" % (repout, fname)
            alignement.run_blastxFmt(path, outfile, database)
            continue

        # Long scaffold: extract both ends into their own FASTA files first.
        ends = (
            ("%s/%s-deb.tfa" % (repout, fname), seq[0:1000]),
            ("%s/%s-fin.tfa" % (repout, fname), seq[-1000:]),
        )
        for fic, chunk in ends:
            # "with" guarantees the file is flushed and closed before blastx
            # reads it, even if the write raises.
            with open(fic, "w") as handle:
                handle.write(">%s\n%s\n" % (files.get_name(fic), chunk))

        for fic, _chunk in ends:
            outfile = "%s/%s.blastx" % (repout, files.get_name(fic))
            # Align the extracted end itself.  The previous code passed the
            # complete scaffold file here, so both outputs were alignments of
            # the whole scaffold and the -deb/-fin files were never used.
            alignement.run_blastxFmt(fic, outfile, database)
Пример #2
0
def identifyScaffChimereBU(species, infile, rep="."):
    """Experimental variant of the chimeric-scaffold detection routine.

    Author's notes (translated): this was an attempt to improve the
    chimeric-scaffold detection.  The results were not convincing: when
    several hits start at the same scaffold position the idea was to keep
    only the best-scoring one, but scores are often identical, or even higher
    for genes coming from different chromosomes, so the results are arguably
    worse.  Post-processing the output of the first routine may be a better
    option.

    For every scaffold longer than 1000, the sequence is written to
    ``<scaff>.fasta``, aligned with ``alignement.run_blastxFmt`` and the hits
    are scanned.  A scaffold whose retained hits carry more than one
    chromosome prefix is reported (stdout and log file) and re-aligned with
    ``alignement.run_blastx`` into ``<scaff>.blastx-std``.

    Args:
        species: "Sace", "Sakl" or "Lakl" (case-insensitive); selects the
            protein database and the length of the chromosome prefix taken
            from the hit identifier (2 characters for Sace, 6 otherwise).
        infile: multi-FASTA file of scaffolds, read with
            ``fasta.fromFastaToDico``.  The log is written next to it, with
            ".fasta" replaced by "_chimere.log".
        rep: working directory; the process changes into it (``os.chdir``).

    Raises:
        SystemExit: if *species* is not recognised.
    """
    print(species)
    species_key = species.lower()
    # Validate the species before any side effect, so that a typo does not
    # change directory, parse the whole FASTA or create an empty log file.
    if species_key == "sace":
        db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sace/s288c.pep"
        prefix_len = 2
    elif species_key in ("lakl", "sakl"):
        db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sakl/saklRef.pep"
        prefix_len = 6
    else:
        print("Species name not understood. Should be Sace, Sakl or Lakl")
        sys.exit()

    os.chdir(rep)
    allScaff = fasta.fromFastaToDico(infile)

    outfile = infile.replace(".fasta", "_chimere.log")
    if outfile == infile:
        # No ".fasta" in the name: opening this path for writing would
        # truncate the input file itself, so append the suffix instead.
        outfile = infile + "_chimere.log"

    with open(outfile, "w") as of:
        for scaff in allScaff.keys():
            seq = allScaff[scaff]
            if len(seq) <= 1000:
                continue

            fileN = "%s.fasta" % scaff
            fasta.fromSeqToFasta(seq, scaff, fileN)
            # Built from the scaffold name directly: a substring replace of
            # "fasta" would also rewrite scaffold names containing "fasta".
            fileO = "%s.blastx" % scaff
            alignement.run_blastxFmt(fileN, fileO, db, 1000)

            if os.path.isfile(fileO):
                with open(fileO, "r") as handle:
                    lines = handle.read().split("\n")

                # position -> (hit identifier, score) of the best hit there.
                # The score is stored per position: keying it by protein id
                # (as before) let a protein that hits several positions
                # overwrite its own score and corrupt later comparisons.
                # NOTE(review): column indices presumably follow the BLAST
                # tabular layout (1 = subject id, 2 = % identity,
                # 3 = alignment length, 6 = query start, 11 = bit score) --
                # confirm against alignement.run_blastxFmt.
                best = {}
                for line in lines:
                    if line == "":
                        continue
                    el = line.split("\t")
                    if not (float(el[2]) > 95 and float(el[3]) > 70):
                        continue
                    start = int(el[6])
                    score = float(el[11])
                    if start not in best or best[start][1] < score:
                        best[start] = (el[1], score)

                chroms = set(hit[0][0:prefix_len] for hit in best.values())

                # More than one chromosome prefix among the retained hits:
                # the scaffold is considered chimeric.
                if len(chroms) > 1:
                    lProt = sorted((pos, hit[0]) for pos, hit in best.items())
                    header = "%s - %s bp" % (scaff, len(seq))
                    print(header)
                    of.write("%s\n" % header)
                    print(lProt)
                    of.write("%s\n" % lProt)
                    fileO2 = "%s.blastx-std" % scaff
                    alignement.run_blastx(fileN, fileO2, db, 1000)
                else:
                    os.remove(fileO)

            # The per-scaffold FASTA file is always discarded in this variant.
            os.remove(fileN)
Пример #3
0
def identifyScaffChimere(species, infile, rep="."):
    """Detect scaffolds whose protein hits come from several chromosomes.

    For every scaffold longer than 1000, the sequence is written to
    ``<scaff>.fasta`` and aligned with ``alignement.run_blastxFmt`` into
    ``<scaff>.blastx``.  Hits passing the identity / length thresholds are
    grouped by the chromosome prefix of the hit identifier.  A scaffold is
    reported (stdout and log file) and re-aligned with
    ``alignement.run_blastx`` into ``<scaff>.blastx-std`` when

    * hits on a chromosome other than that of the first retained hit exist,
    * those hits are not explained by paralogues at the same positions, and
    * the position-ordered walk still yields more than one chromosome.

    Otherwise the per-scaffold ``.fasta`` and ``.blastx`` files are removed.

    Args:
        species: "Sace", "Sakl" or "Lakl" (case-insensitive); selects the
            protein database and the length of the chromosome prefix taken
            from the hit identifier (2 characters for Sace, 6 otherwise).
        infile: multi-FASTA file of scaffolds, read with
            ``fasta.fromFastaToDico``.  The log is written next to it, with
            ".scafSeq" replaced by "_chimere.log".
        rep: working directory; the process changes into it (``os.chdir``).

    Raises:
        SystemExit: if *species* is not recognised.
    """
    print(species)
    species_key = species.lower()
    # Validate the species before any side effect, so that a typo does not
    # change directory, parse the whole FASTA or create an empty log file.
    if species_key == "sace":
        db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sace/s288c.pep"
        prefix_len = 2
    elif species_key in ("lakl", "sakl"):
        db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sakl/saklRef.pep"
        prefix_len = 6
    else:
        print("Species name not understood. Should be Sace, Sakl or Lakl")
        sys.exit()

    os.chdir(rep)
    allScaff = fasta.fromFastaToDico(infile)

    # "RUN 25 Strains" naming: input files end in ".scafSeq".
    outfile = infile.replace(".scafSeq", "_chimere.log")
    if outfile == infile:
        # No ".scafSeq" in the name: opening this path for writing would
        # truncate the input file itself, so append the suffix instead.
        outfile = infile + "_chimere.log"

    with open(outfile, "w") as of:
        for scaff in allScaff.keys():
            seq = allScaff[scaff]
            if len(seq) <= 1000:
                continue

            fileN = "%s.fasta" % scaff
            fasta.fromSeqToFasta(seq, scaff, fileN)
            # Built from the scaffold name directly: a substring replace of
            # "fasta" would also rewrite scaffold names containing "fasta".
            fileO = "%s.blastx" % scaff
            alignement.run_blastxFmt(fileN, fileO, db, 1000)
            if not os.path.isfile(fileO):
                continue

            with open(fileO, "r") as handle:
                lines = handle.read().split("\n")

            ref_chrom = None   # chromosome prefix of the first retained hit
            dicProt = {}       # hit identifier -> start position
            lP1 = []           # start positions of hits on ref_chrom
            lP2 = []           # start positions of hits on other chromosomes
            # NOTE(review): column indices presumably follow the BLAST
            # tabular layout (1 = subject id, 2 = % identity,
            # 3 = alignment length, 6 = query start) -- confirm against
            # alignement.run_blastxFmt.
            for line in lines:
                if line == "":
                    continue
                el = line.split("\t")
                if not (float(el[2]) > 95 and float(el[3]) > 70):
                    continue
                start = int(el[6])
                dicProt[el[1]] = start
                chrom = el[1][0:prefix_len]
                if ref_chrom is None:
                    ref_chrom = chrom
                    if start not in lP1:
                        lP1.append(start)
                elif chrom != ref_chrom:
                    if start not in lP2:
                        lP2.append(start)
                else:
                    # Same chromosome as the first hit.  This branch used to
                    # be nested under the lP2 membership test, so these
                    # positions never reached lP1.
                    if start not in lP1:
                        lP1.append(start)

            # Candidate chimera as soon as one hit lies on another chromosome.
            is_chimera = bool(lP2)

            if is_chimera:
                # Ignore scaffolds whose apparent chimerism is only due to a
                # paralogue: every position of the shorter list is then also
                # present in the other list.  The original "else" was bound
                # to the "for" loop instead of the length test, so the
                # comparison never ran when lP1 was at least as long as lP2.
                if len(lP1) < len(lP2):
                    smaller, larger = lP1, lP2
                else:
                    smaller, larger = lP2, lP1
                is_chimera = any(p not in larger for p in smaller)

            confirmed = False
            if is_chimera:
                lProt = list(dicProt.items())
                # NOTE(review): cmpval is defined elsewhere; the walk below
                # assumes it orders the pairs by increasing position.
                lProt.sort(cmpval)
                pos = 0
                lchrom = []
                # toR tells whether the last chromosome appended to lchrom
                # may still be removed.
                # NOTE(review): toR is not reset when the position advances
                # without a new chromosome being appended, so a later tie can
                # pop an unrelated chromosome -- confirm this is intended.
                toR = 0
                for prot in lProt:
                    if prot[1] > pos:
                        chrom = prot[0][0:prefix_len]
                        if chrom not in lchrom:
                            lchrom.append(chrom)
                            toR = 1
                        pos = prot[1]
                    elif toR == 1:
                        # Data are sorted, so a position that is not greater
                        # is equal: paralogues at the same position.  Drop
                        # the chromosome appended last so the list carries no
                        # information about them, and forbid a second removal
                        # in case more than two paralogues share the position.
                        lchrom.pop()
                        toR = 0
                confirmed = len(lchrom) > 1

            if confirmed:
                header = "%s - %s bp" % (scaff, len(seq))
                print(header)
                of.write("%s\n" % header)
                print(lProt)
                of.write("%s\n" % lProt)
                fileO2 = "%s.blastx-std" % scaff
                alignement.run_blastx(fileN, fileO2, db, 1000)
            else:
                os.remove(fileO)
                os.remove(fileN)