Пример #1
0
def _confirm_rebuild(found_path, prompt):
    """Ask on the terminal whether an existing index file should be rebuilt.

    Prints ``found_path`` and ``prompt``, then reads single keys with
    ``getch()`` until Y or N is pressed (either case).

    Returns:
        True to rebuild the file, False to reuse it.
    """
    print("find:", found_path)
    print(prompt)

    while True:
        char = getch()
        answer = char.lower()

        if answer in ("y", "n"):
            print(char)
            # Decide on the lower-cased key: comparing the raw key meant an
            # upper-case "N" was accepted but still triggered a rebuild.
            return answer == "y"


def main():
    """Command-line entry point: design probes for ``args.input``.

    Stages, in order:

    1. Build (or reuse) the Jellyfish k-mer file and the BWA index of
       ``args.genome`` inside ``args.saved``.
    2. Cut every input sequence into blocks of at most ``spsize`` bases and
       run ``jellyfish.kmerfilterprobe`` on each block in a process pool.
    3. Write the surviving probes to ``<input>_tmp_probe.fa`` and pass them
       through ``bwa.bwafilter``.
    4. Run ``probefilter`` on every remaining probe in a second pool.
    5. Write ``<input>.len``, ``<input>_all.bed`` (every kept probe) and
       ``<input>.bed`` (probes thinned by a sliding window) to ``args.saved``.

    Exits with status 1 when the Jellyfish count reports failure.
    """
    args = check_options(get_options())

    # Genome size in units of 1e6 bytes, taken from the file size on disk.
    genomesize = int(os.path.getsize(args.genome) / 1e6)

    # k = int(log4(genomesize) + 1), never below 17.  A genome file smaller
    # than 1e6 bytes gives genomesize == 0, for which log() raises a math
    # domain error, so the minimum is used directly in that case.
    kmer = 17
    if genomesize > 0:
        kmer = max(kmer, int(log(genomesize, 4) + 1))

    # Values handed to jellyfish.jfcount as ``lowercount`` and ``size``.
    lowercount = 2
    jfsize = '100M'

    # Sequences of this length or longer are processed in blocks of this size.
    spsize = 10000000

    step = args.step

    maxkmerscore = int(args.length * args.homology / 100) - kmer

    # Decide whether the Jellyfish k-mer file has to be (re)built.  With
    # args.docker set no question is asked and the file is always rebuilt.
    jfkmerfile = os.path.join(args.saved, (os.path.basename(args.genome) + '_' + str(kmer) + 'mer.jf'))
    kmerbuild = True

    if os.path.isfile(jfkmerfile) and not args.docker:
        kmmess = "Found kmerfile "+jfkmerfile+". Do you want rebuild it?  Press Y or N to continue:"
        kmerbuild = _confirm_rebuild(jfkmerfile, kmmess)

    # Same decision for the BWA index; the '.sa' file is used as the marker
    # that an index already exists.
    bwaindexfile = os.path.basename(args.genome)
    bwatestindex = os.path.join(args.saved, bwaindexfile + '.sa')
    bwaindex = os.path.join(args.saved, bwaindexfile)
    bwabuild = True

    if os.path.isfile(bwatestindex) and not args.docker:
        bwamess = "Found bwa index file " + bwatestindex + ". Do you want rebuild it? Press Y or N to continue:"
        bwabuild = _confirm_rebuild(bwatestindex, bwamess)

    print("genomesize:", genomesize, "kmer:", kmer, "jfkmerfile:",
          jfkmerfile, "kmerbuild:", kmerbuild, "bwabuild:", bwabuild, "threads:", args.threads)

    # Build Jellyfish index
    if kmerbuild:
        jfcount = jellyfish.jfcount(jfpath=args.jellyfish, mer=kmer, infile=args.genome, output=jfkmerfile,
                                    threads=args.threads, lowercount=lowercount, size=jfsize)

        if jfcount:
            print("JellyFish Count finished ...")
        else:
            print("JellyFish Count Error!!!")
            sys.exit(1)
    else:
        print("Use ", jfkmerfile)

    # Build BWA index
    if bwabuild:
        bwa.bwaindex(args.bwa, args.genome, args.saved)
        print("bwa index build finished ...")
    else:
        print("Use", bwatestindex)

    # One JFfpbruner work item per block of every input sequence.
    fastain = Fasta(args.input)
    jffpbrunerlist = list()

    for seqname in fastain.keys():
        chrlen = len(fastain[seqname])

        if chrlen < spsize:
            blocks = [(0, chrlen - 1)]
        else:
            # Inclusive (start, end) pairs.  Stepping with range() avoids the
            # inverted trailing block (start == chrlen, end == chrlen - 1)
            # that int(chrlen / spsize) + 1 produced whenever chrlen was an
            # exact multiple of spsize.
            blocks = [(start, min(start + spsize, chrlen) - 1)
                      for start in range(0, chrlen, spsize)]

        for start, end in blocks:
            jffpbruner = jellyfish.JFfpbruner(jfpath=args.jellyfish, jfkmerfile=jfkmerfile, mer=kmer,
                                              pyfasta=fastain, seqname=seqname, pblength=args.length,
                                              maxkmerscore=maxkmerscore, start=start,
                                              end=end, step=step)
            jffpbrunerlist.append(jffpbruner)

    # Jellyfish k-mer filter, one task per block.  The pool is created only
    # now and released by the with-block even if a worker raises.
    jffilteredprobe = list()

    with Pool(args.threads) as jfpool:
        finished_blocks = enumerate(
            jfpool.imap_unordered(jellyfish.kmerfilterprobe, jffpbrunerlist), start=1)

        for jffinished, curpblist in finished_blocks:
            jffilteredprobe.extend(curpblist)
            print("Jellyfish filter: ", jffinished, '/', len(jffpbrunerlist), sep='')

    print('Jellyfish filter finished!!')

    # Dump the surviving probes as FASTA (records named seq0, seq1, ...) so
    # they can be handed to bwa.bwafilter as a file.
    tmppbfa = os.path.join(args.saved, os.path.basename(args.input) + '_tmp_probe.fa')

    with open(tmppbfa, 'w') as tmppbfaio:
        for seqnum, tmppb in enumerate(jffilteredprobe):
            print('>', 'seq', seqnum, sep='', file=tmppbfaio)
            print(tmppb, file=tmppbfaio)

    # The list can be large; drop it before the next stage allocates.
    del jffilteredprobe

    bwafiltedpb = bwa.bwafilter(bwabin=args.bwa, reffile=bwaindex, inputfile=tmppbfa, minas=args.length,
                                maxxs=int(args.length * args.homology / 100), threadnumber=args.threads)

    tmpbwaftlist = os.path.join(args.saved, os.path.basename(args.input) + '.bed')
    alltmpbwaftlist = os.path.join(args.saved, os.path.basename(args.input) + '_all.bed')
    seqlenfile = os.path.join(args.saved, os.path.basename(args.input) + '.len')

    # Tab-separated "name<TAB>length" for every sequence bwa.bwareflength reports.
    seqlength = bwa.bwareflength(bwabin=args.bwa, reffile=bwaindex)

    with open(seqlenfile, 'w') as seqlenio:
        for seqname in seqlength:
            print(seqname, seqlength[seqname], sep='\t', file=seqlenio)

    # probefilter takes one dict per probe.
    oligobefortmf = [{'seq': pbtmp, 'dTm': args.dtm, 'rprimer': args.primer}
                     for pbtmp in bwafiltedpb]

    keepedprobe = list()
    oligobefortmflen = len(oligobefortmf)

    print("oligobefortmflen:", oligobefortmflen)

    # NOTE(review): this pool uses the default worker count rather than
    # args.threads - confirm whether that is intended.
    with Pool() as pbftpool:
        checked_probes = enumerate(
            pbftpool.imap_unordered(probefilter, oligobefortmf), start=1)

        for ctedpb, (pb, keep) in checked_probes:
            if keep:
                keepedprobe.append(pb)

            if ctedpb % 10000 == 0:
                print(ctedpb, '/', oligobefortmflen)

    # Group kept probes as {chromosome: {start: sequence}}; each probe record
    # is "sequence<TAB>chromosome<TAB>start".
    pbdictbychr = dict()

    for pb in keepedprobe:
        seq, chro, start = pb.split('\t')
        pbdictbychr.setdefault(chro, dict())[int(start)] = seq

    # Minimum spacing between probes in the thinned BED: primer length (5
    # when no primer is given) plus probe length.
    lenrprimer = len(args.primer)

    if lenrprimer == 0:
        lenrprimer = 5

    slidwindow = lenrprimer + args.length

    with open(alltmpbwaftlist, 'w') as allbwaftlistio, open(tmpbwaftlist, 'w') as tmpbwaftlistio:
        for chro in pbdictbychr:
            # Start of the last probe written to the thinned BED.  Because it
            # begins at 0, probes starting at or before slidwindow are never
            # written to the thinned file.
            startn = 0

            for startnow in sorted(pbdictbychr[chro]):
                endnow = startnow + args.length - 1

                print(chro, startnow, endnow, pbdictbychr[chro][startnow], file=allbwaftlistio, sep='\t')

                if startnow > startn + slidwindow:
                    startn = startnow
                    print(chro, startnow, endnow, pbdictbychr[chro][startnow], file=tmpbwaftlistio, sep='\t')

    print("Job finshed!!")
Пример #2
0
    def run(self):
        """Run the probe-design pipeline for ``self.inputfile``.

        Progress is reported through ``self.notifyProgress`` (a number up to
        100) and ``self.notifyMessage`` (status text).  Stages, in order:

        1. Build (or reuse) the Jellyfish k-mer file and the aligner index.
        2. Cut every input sequence into blocks of at most ``spsize`` bases
           and run ``jellyfish.kmerfilterprobe`` on each block via
           ``self.pool``.
        3. Write the surviving probes to ``<input>_tmp_probes.fa`` and pass
           them through ``bwa.bwafilter``.
        4. Run ``probefilter`` on every remaining probe via ``self.pool``.
        5. Write ``<input>.len``, ``<input>_all.bed`` (every kept probe) and
           ``<input>.bed`` (probes thinned by a sliding window) to
           ``self.samplefolder``.

        Returns early, after emitting an error message, when the Jellyfish
        count reports failure.
        """
        if self.kmerbuild:
            jfcounter = jellyfish.jfcount(jfpath=self.jellyfishpath, mer=self.kmer,
                                          infile=self.genomefile, output=self.jfkmerfile, threads=self.threadsnumber,
                                          lowercount=self.lowercount, size=self.size)

            # Check that the Jellyfish count ran correctly.
            if not jfcounter:
                self.notifyMessage.emit("JellyFish Count Error!!!")
                # The k-mer filter below reads self.jfkmerfile, so stop here
                # instead of carrying on after a failed count.
                return

            self.progressnumber = self.progressnumber + 5
            self.notifyProgress.emit(self.progressnumber)
            self.notifyMessage.emit("JellyFish Count finished...")
        else:
            jfcountmess = "Use " + self.jfkmerfile
            self.progressnumber = self.progressnumber + 5
            self.notifyProgress.emit(self.progressnumber)
            self.notifyMessage.emit(jfcountmess)

        if self.indexbuild:
            if self.aligner == 'BWA':
                bwa.bwaindex(self.alnpath, self.genomefile, self.samplefolder)

                self.notifyMessage.emit("BWA Index build finished...")

                self.progressnumber = self.progressnumber + 5
                self.notifyProgress.emit(self.progressnumber)
            elif self.aligner == 'BLAT':
                # TODO: add code for BLAT.  Nothing is built and no progress
                # is reported for this aligner; the filtering below still
                # goes through the bwa module.
                pass
        else:
            self.progressnumber = self.progressnumber + 5
            self.notifyProgress.emit(self.progressnumber)

        # Load and split the input file.
        # Sequences of this length or longer are processed in blocks of this size.
        spsize = 10000000

        maxkmerscore = int(self.pblength * self.homology / 100) - self.kmer

        jffilteredprobe = list()
        fastain = Fasta(self.inputfile)
        jffpbrunerlist = list()

        for seqname in fastain.keys():
            chrlen = len(fastain[seqname])

            if chrlen < spsize:
                blocks = [(0, chrlen - 1)]
            else:
                # Inclusive (start, end) pairs.  Stepping with range() avoids
                # the inverted trailing block (start == chrlen,
                # end == chrlen - 1) that int(chrlen / spsize) + 1 produced
                # whenever chrlen was an exact multiple of spsize.
                blocks = [(start, min(start + spsize, chrlen) - 1)
                          for start in range(0, chrlen, spsize)]

            for start, end in blocks:
                jffpbruner = jellyfish.JFfpbruner(jfpath=self.jellyfishpath, jfkmerfile=self.jfkmerfile, mer=self.kmer,
                                                  pyfasta=fastain, seqname=seqname, pblength=self.pblength,
                                                  maxkmerscore=maxkmerscore, start=start,
                                                  end=end, step=self.step)
                jffpbrunerlist.append(jffpbruner)

        # Jellyfish k-mer filter: this stage accounts for 40 progress points.
        jffinished = 0

        for curpblist in self.pool.imap_unordered(jellyfish.kmerfilterprobe, jffpbrunerlist):
            jffilteredprobe.extend(curpblist)

            # Count the block first so the reported progress includes it.
            jffinished += 1

            tmpprogress = float(format(self.progressnumber + (jffinished / len(jffpbrunerlist) * 40), ".2f"))
            self.notifyProgress.emit(tmpprogress)

            if self.isRunning():
                print("running")
            else:
                print("not running")

        self.notifyMessage.emit('jelly fish finished!!')

        self.progressnumber = 50.0
        self.notifyProgress.emit(self.progressnumber)

        # Dump the surviving probes as FASTA (records named seq0, seq1, ...)
        # so they can be handed to bwa.bwafilter as a file.
        tmppbfa = os.path.join(self.samplefolder, os.path.basename(self.inputfile) + '_tmp_probes.fa')

        with open(tmppbfa, 'w') as tmppbfaio:
            for seqnum, tmppb in enumerate(jffilteredprobe):
                print('>', 'seq', seqnum, sep='', file=tmppbfaio)
                print(tmppb, file=tmppbfaio)

        # Delete jffilteredprobe to release memory before the next stage.
        del jffilteredprobe

        bwaindexfile = os.path.join(self.samplefolder, os.path.basename(self.genomefile))

        bwafiltedpb = bwa.bwafilter(bwabin=self.alnpath, reffile=bwaindexfile, inputfile=tmppbfa, minas=self.pblength,
                                    maxxs=int(self.pblength * self.homology / 100), threadnumber=self.threadsnumber)

        tmpbwaftlist = os.path.join(self.samplefolder, os.path.basename(self.inputfile) + '.bed')
        alltmpbwaftlist = os.path.join(self.samplefolder, os.path.basename(self.inputfile) + '_all.bed')
        seqlenfile = os.path.join(self.samplefolder, os.path.basename(self.inputfile)) + '.len'

        # Tab-separated "name<TAB>length" for every sequence
        # bwa.bwareflength reports.
        seqlength = bwa.bwareflength(bwabin=self.alnpath, reffile=bwaindexfile)

        with open(seqlenfile, 'w') as seqlenio:
            for seqname in seqlength:
                print(seqname, seqlength[seqname], sep='\t', file=seqlenio)

        # probefilter takes one dict per probe.
        oligobefortmf = [{'seq': pbtmp, 'dTm': self.dTm, 'rprimer': self.rprimer}
                         for pbtmp in bwafiltedpb]

        keepedprobe = list()

        self.progressnumber = 55
        self.notifyProgress.emit(self.progressnumber)

        oligobefortmflen = len(oligobefortmf)

        # Probe filter: this stage accounts for 30 progress points, reported
        # once every 10000 probes.
        checked_probes = enumerate(self.pool.imap_unordered(probefilter, oligobefortmf), start=1)

        for ctedpb, (pb, keep) in checked_probes:
            if keep:
                keepedprobe.append(pb)

            if ctedpb % 10000 == 0:
                tmpprogress = float(format(self.progressnumber + (ctedpb / oligobefortmflen * 30), ".2f"))
                self.notifyProgress.emit(tmpprogress)

        self.notifyProgress.emit(90)

        # Group kept probes as {chromosome: {start: sequence}}; each probe
        # record is "sequence<TAB>chromosome<TAB>start".
        pbdictbychr = dict()

        for pb in keepedprobe:
            seq, chro, start = pb.split('\t')
            pbdictbychr.setdefault(chro, dict())[int(start)] = seq

        # Minimum spacing between probes in the thinned BED: primer length
        # (5 when no primer is given) plus probe length.
        lenrprimer = len(self.rprimer)

        if lenrprimer == 0:
            lenrprimer = 5

        slidwindow = lenrprimer + self.pblength

        with open(alltmpbwaftlist, 'w') as allbwaftlistio, open(tmpbwaftlist, 'w') as tmpbwaftlistio:
            for chro in pbdictbychr:
                # Start of the last probe written to the thinned BED.
                # Because it begins at 0, probes starting at or before
                # slidwindow are never written to the thinned file.
                startn = 0

                for startnow in sorted(pbdictbychr[chro]):
                    endnow = startnow + self.pblength - 1

                    print(chro, startnow, endnow, pbdictbychr[chro][startnow], file=allbwaftlistio, sep='\t')

                    if startnow > startn + slidwindow:
                        startn = startnow
                        print(chro, startnow, endnow, pbdictbychr[chro][startnow], file=tmpbwaftlistio, sep='\t')

        # The temporary FASTA file (tmppbfa) is intentionally left on disk.

        self.notifyProgress.emit(100)

        self.notifyMessage.emit('all finished!!')