Пример #1
0
def blasr(query, target, format, nproc = 1, outname = "out.m5", consensus=True):
    """
    Map query against target with blasr, keeping only the best hit.

    format is placed verbatim on the command line after the target.
    BLASRPARAMS is appended to the command when consensus is true,
    EEBLASRPARAMS otherwise. The three values returned by exe are written
    to the debug log; the function itself returns nothing.
    """
    baseCmd = "blasr %s %s %s -nproc %d -bestn 1 -out %s " % (
        query, target, format, nproc, outname)
    #need to figure out how to m5-pie it...maybe
    extraParams = BLASRPARAMS if consensus else EEBLASRPARAMS
    retCode, stdOut, stdErr = exe(baseCmd + extraParams)
    logging.debug("blasr - %d - %s - %s" % (retCode, stdOut, stdErr))
Пример #2
0
def blasr(query, target, format, nproc=1, outname="out.m5", consensus=True):
    """
    Run a best-hit blasr mapping of query onto target (double-dash options).

    format is inserted verbatim after the target on the command line. The
    command is completed with BLASRPARAMS when consensus is true and with
    EEBLASRPARAMS when it is false. Whatever exe reports back is sent to
    the debug log; nothing is returned.
    """
    command = "blasr %s %s %s --nproc %d --bestn 1 --out %s " % (
        query, target, format, nproc, outname)
    #need to figure out how to m5-pie it...maybe
    if not consensus:
        command = command + EEBLASRPARAMS
    else:
        command = command + BLASRPARAMS
    status, out, err = exe(command)
    logging.debug("blasr - %d - %s - %s" % (status, out, err))
Пример #3
0
def samToFastq( inSam, outFq ):
    """
    Write the records of a SAM file as FASTQ.

    grep drops the lines starting with "@"; awk then prints columns 1, 10
    and 11 of every remaining line as a four-line record, redirected into
    outFq. Returns whatever exe returns for that shell pipeline.
    """
    pipeline = ('grep -v "^@" %s | '
                'awk \'{print "@"  $1  "\\n"  $10 "\\n+\\n"  $11}\' '
                '> %s')
    shellCmd = pipeline % (inSam, outFq)
    return exe(shellCmd)
Пример #4
0
    def parseArgs(self):
        """
        Parse the command line: Jelly.py <stage> <protocol>

        Sets self.options (the parsed option values), self.executeStage
        (the stage name, which must be in STAGES) and self.protocolName
        (the absolute path of the protocol argument).

        With -h/--help and exactly one argument that names a known stage,
        the stdout of that stage's help command is printed; with -h/--help
        and any other number of arguments the general help is printed.
        Both cases exit with status 0. Anything else that is not
        "<known stage> <protocol>" is reported through parser.error(),
        which terminates the program itself.
        """
        parser = OptionParser(USAGE)
        # Swap the stock -h for a plain flag so help can be routed to a stage
        parser.remove_option("-h")
        parser.add_option("-h", "--help", action="store_true", default=False)

        parser.add_option("--debug", action="store_true", default=False)
        parser.add_option("-x", dest="extras", type="string", default="",
                help="-x \"<options>\" are options to pass into the stage you're running")

        self.options, args = parser.parse_args()

        if self.options.help:
            if len(args) != 1:
                print(parser.format_help())
                sys.exit(0)
            if args[0] in STAGES:
                print(exe(Stages.PRINT_HELPS[args[0]])[1])
                sys.exit(0)
            # Unknown stage name: drop down to the argument error below

        if len(args) != 2 or args[0] not in STAGES:
            # parser.error() prints the usage and exits; no explicit
            # sys.exit() is needed (or reachable) after it.
            parser.error("Invalid Arguments. Expected one of\n'%s'" %
                         "', '".join(STAGES))
        self.executeStage = args[0]
        self.protocolName = os.path.abspath(args[1])
Пример #5
0
def blasr(query, target, nproc=1, outname="out.m5"):
    """
    Overlap query against target with blasr.

    Runs blasr with -m 5 output, up to 200 hits/candidates per read and
    the fixed tuning options in the command below, writing to outname.
    The values returned by exe are unpacked but not used; nothing is
    returned.
    """
    template = ("blasr %s %s -m 5 --bestn 200 --nCandidates 200 --minMatch 12 "
                "--affineExtend 3 --nproc %d --noSplitSubreads --out %s --maxScore -1000")
    command = template % (query, target, nproc, outname)
    retCode, stdOut, stdErr = exe(command)
Пример #6
0
def remapReads(reads, outName):
    """
    Map reads with blasr against the module-level `reference` and write
    SAM output to outName (the original author notes this is only set up
    for hg19).

    The suffix array is taken from reference + ".sa"; four processes are
    used and only the best hit is kept. Returns whatever exe returns.
    """
    template = "blasr {0} {1} -sa {1}.sa -nproc 4 -out {2} -sam -bestn 1"
    command = template.format(reads, reference, outName)
    return exe(command)
Пример #7
0
def callBlasr(inFile, refFile, params, nproc=1, outFile="map.sam"):
    """
    Map inFile against refFile with blasr and write SAM output to outFile.

    If refFile + ".sa" exists it is handed to blasr through -sa. params is
    appended verbatim to the end of the command line.

    When blasr returns a non-zero code, the return code, stdout and stderr
    are logged as errors and the process exits with that code. On success
    the same three values are logged at info level. Returns nothing.
    """
    # Function-scope import: the built-in exit() used previously is
    # injected by the site module for interactive sessions and is not
    # guaranteed to exist (e.g. under python -S).
    import sys

    if os.path.exists(refFile+".sa"):
        sa = "-sa " + refFile + ".sa"
    else:
        sa = ""
    logging.info("Running Blasr")
    cmd = ("blasr %s %s %s -nproc %d -bestn 1 "
           "-sam -clipping subread -out %s ") \
           % (inFile, refFile, sa, nproc, outFile)
    r, o, e = exe(cmd + params)

    if r != 0:
        logging.error("blasr mapping failed!")
        logging.error("RETCODE %d" % (r))
        logging.error("STDOUT %s" % (str(o)))
        logging.error("STDERR %s" % (str(e)))
        logging.error("Exiting")
        sys.exit(r)

    logging.info(str([r, o, e]))
Пример #8
0
def remapReads( reads, outName):
    """
    Re-map reads onto the module-level `reference` with blasr, producing
    SAM output in outName (per the original note, only set up for hg19).

    Uses reference + ".sa" as the suffix array, 4 processes and best hit
    only. The result of exe is returned unchanged.
    """
    blasrCmd = "blasr {0} {1} -sa {1}.sa -nproc 4 -out {2} -sam -bestn 1".format(
        reads, reference, outName)
    result = exe(blasrCmd)
    return result
Пример #9
0
def samToFastq(inSam, outFq):
    """
    Turn a SAM file into a FASTQ file through a grep | awk pipeline.

    Lines beginning with "@" are removed, and columns 1, 10 and 11 of
    each remaining line are printed as a four-line record into outFq.
    The return value of exe is passed straight back to the caller.
    """
    dropHeader = 'grep -v "^@" %s | '
    toFastq = 'awk \'{print "@"  $1  "\\n"  $10 "\\n+\\n"  $11}\' '
    redirect = '> %s'
    return exe((dropHeader + toFastq + redirect) % (inSam, outFq))
Пример #10
0
def sam2bam( fn ):
    """
    Convert a SAM file into a sorted, indexed BAM using samtools and the
    module-level `reference` (per the original note, only set up for hg19).

    The output prefix is fn with its last four characters removed; the
    sorted file is renamed to <prefix>.bam and then indexed. Returns
    whatever exe returns for the chained shell command.
    """
    prefix = fn[:-4]
    template = ("samtools view -bt {0} {1} | samtools sort - {2}.sort && "
                "mv {2}.sort.bam {2}.bam && "
                "samtools index {2}.bam")
    return exe(template.format(reference, fn, prefix))
Пример #11
0
def blasr(query, target, format, nproc=1, outname="out.m5", consensus=True):
    """
    Map query against target with blasr, keeping only the best hit.

    format is placed verbatim after the target on the command line. One
    of two fixed option sets is appended: the first when consensus is
    true, the second (affine alignment, insertion/deletion cost 10) when
    it is false. The values returned by exe go to the debug log; nothing
    is returned.
    """
    baseCmd = "blasr %s %s %s -nproc %d -bestn 1 -out %s " % (
        query, target, format, nproc, outname)
    #need to figure out how to m5-pie it...maybe
    if consensus:
        tuning = " -noSplitSubreads -minMatch 5 " + \
                 "-nCandidates 20 -sdpTupleSize 6 -insertion 1 -deletion 1 -bestn 1"
    else:
        tuning = (" -maxAnchorsPerPosition 100 "
                  "-affineAlign -affineOpen 100 -affineExtend 0 "
                  "-insertion 10 -deletion 10 "
                  "-noSplitSubreads -nCandidates 20 ")
    retCode, stdOut, stdErr = exe(baseCmd + tuning)

    logging.debug("blasr - %d - %s - %s" % (retCode, stdOut, stdErr))
Пример #12
0
def sam2bam(fn):
    """
    Build a sorted and indexed BAM from a SAM file with samtools, using
    the module-level `reference` (per the original note, hg19 only).

    The last four characters of fn are stripped to form the output
    prefix. Returns the result of exe for the whole command chain.
    """
    stem = fn[:-4]
    steps = "samtools view -bt {0} {1} | samtools sort - {2}.sort && " \
            "mv {2}.sort.bam {2}.bam && " \
            "samtools index {2}.bam"
    command = steps.format(reference, fn, stem)
    return exe(command)
Пример #13
0
def blasr(query, target, format, nproc = 1, outname = "out.m5", consensus=True):
    """
    Best-hit blasr mapping of query onto target.

    format goes verbatim onto the command line after the target. The
    consensus flag picks which of the two fixed option strings below is
    appended to the command. The three values returned by exe are written
    to the debug log and nothing is returned.
    """
    command = "blasr %s %s %s -nproc %d -bestn 1 -out %s " % (
        query, target, format, nproc, outname)
    #need to figure out how to m5-pie it...maybe
    consensusOpts = " -noSplitSubreads -minMatch 5 " + \
                    "-nCandidates 20 -sdpTupleSize 6 -insertion 1 -deletion 1 -bestn 1"
    affineOpts = (" -maxAnchorsPerPosition 100 "
                  "-affineAlign -affineOpen 100 -affineExtend 0 "
                  "-insertion 10 -deletion 10 "
                  "-noSplitSubreads -nCandidates 20 ")
    chosen = consensusOpts if consensus else affineOpts
    status, out, err = exe(command + chosen)

    logging.debug("blasr - %d - %s - %s" % (status, out, err))
Пример #14
0
def mapTails(fq, ref, nproc=1, out="tailmap.sam", useSa=True):
    """
    Map the reads in fq against ref with blasr (-m 4 output written to out).

    ref + ".sa" is passed through --sa when that file exists and useSa is
    true. The assembled command is logged at debug level before it runs.

    When blasr returns a non-zero code, the return code, stdout and stderr
    are logged as errors and the process exits with that code. On success
    the same three values are logged at info level. Returns nothing.
    """
    # Function-scope import: the built-in exit() used previously is
    # injected by the site module for interactive sessions and is not
    # guaranteed to exist (e.g. under python -S).
    import sys

    if os.path.exists(ref + ".sa") and useSa:
        sa = "--sa " + ref + ".sa"
    else:
        sa = ""
    cmd = ("blasr %s %s %s --nproc %d -m 4 --bestn 1 --nCandidates 20 --out %s"
           " --minPctIdentity 75 --sdpTupleSize 6 --noSplitSubreads") \
           % (fq, ref, sa, nproc, out)

    logging.debug(cmd)
    r, o, e = exe(cmd)
    if r != 0:
        logging.error("blasr mapping failed!")
        logging.error("RETCODE %d" % (r))
        logging.error("STDOUT %s" % (str(o)))
        logging.error("STDERR %s" % (str(e)))
        logging.error("Exiting")
        sys.exit(r)

    logging.info(str([r, o, e]))
Пример #15
0
def mapTails(fq, ref, nproc=1, out="tailmap.sam", useSa=True):
    """
    Run blasr on fq against ref and write the -m 4 alignments to out.

    The suffix array ref + ".sa" is used (via --sa) only when it exists
    and useSa is true. The command is logged at debug level first.

    A non-zero return code from blasr is treated as fatal: the return
    code, stdout and stderr are logged as errors and the process exits
    with that code. Otherwise the three values are logged at info level.
    Returns nothing.
    """
    # sys.exit replaces the site-provided exit(), which is meant for the
    # interactive interpreter and may be absent (e.g. under python -S).
    import sys

    if os.path.exists(ref+".sa") and useSa:
        sa = "--sa " + ref + ".sa"
    else:
        sa = ""
    cmd = ("blasr %s %s %s --nproc %d -m 4 --bestn 1 --nCandidates 20 --out %s"
           " --minPctIdentity 75 --sdpTupleSize 6 --noSplitSubreads") \
           % (fq, ref, sa, nproc, out)

    logging.debug(cmd)
    r, o, e = exe(cmd)
    if r != 0:
        logging.error("blasr mapping failed!")
        logging.error("RETCODE %d" % (r))
        logging.error("STDOUT %s" % (str(o)))
        logging.error("STDERR %s" % (str(e)))
        logging.error("Exiting")
        sys.exit(r)

    logging.info(str([r, o, e]))
Пример #16
0
 def consensusCalling(self, spot, bam, reference, args):
     """
     Assemble the variant-supporting reads around spot into a consensus and
     re-call the SVs of spot's type from that consensus.

     Steps, as implemented below:
       1. Fetch reads overlapping the spot (padded by args.buffer and
          SPANBUFFER), keep those named in spot.varReads and trim them
          with self.readTrim.
       2. Write up to MAXNUMREADS of them (spanning reads first, longest
          trimmed sequence first) to a temporary .fasta plus .qual file
          and run phrap on the fasta.
       3. Align the reads to the phrap contigs with blasr (-m 5) and call
          a consensus with pbbanana or pbdagcon, chosen by args.polish.
       4. Map the consensus onto the reference window with blasr (SAM
          output); every INS/DEL yielded by expandCigar whose type equals
          spot.svtype becomes a genotyped deep copy of spot.

     Returns [spot] (tagged "noSpan") when no read spans the region, an
     empty list when phrap fails or yields no contigs, otherwise the list
     of new spots (which may be empty).
     """
     #
     MAXNUMREADS = 100 #I don't think we'll need more than this many reads
     # NOTE(review): MAXATTEMPTS is only referenced by the commented-out
     # retry loop further down
     MAXATTEMPTS = MAXNUMREADS/2 #I don't feel like trying 100 times
     SPANBUFFER = 100 #number of bases I want a read to span
     
     chrom, start, end = spot.chrom, spot.start, spot.end
     buffer = args.buffer
     
     supportReads = []
     spanReads = []
     #Fetch reads and trim
     totCnt = 0
     for read in bam.fetch(chrom, max(0, start-buffer-SPANBUFFER), end+buffer+SPANBUFFER):
         if read.qname not in spot.varReads:
             continue
         seq, qual = self.readTrim(read, start-buffer, end+buffer)
         # a read "spans" when it starts and ends more than SPANBUFFER
         # outside the spot
         if read.pos < start-SPANBUFFER and read.aend > end+SPANBUFFER:
             spanReads.append((len(seq), seq, qual))
         else:
             supportReads.append((seq, qual))
         totCnt += 1
         
     if len(spanReads) == 0:
         logging.debug("noone spans - consensus aborted. %s" % (str(spot)))
         spot.tags["noSpan"] = True
         return [spot]
         
     # tuples start with len(seq), so this puts the longest trimmed reads first
     spanReads.sort(reverse=True)
     # cap the read set at MAXNUMREADS, preferring spanning reads
     if len(spanReads) > MAXNUMREADS:
         origSupportReads = [(x[1], x[2]) for x in spanReads[:MAXNUMREADS]]
     elif len(spanReads) + len(supportReads) > MAXNUMREADS:
         origSupportReads = [(x[1], x[2]) for x in spanReads] + supportReads[:MAXNUMREADS-len(spanReads)]
     else:
         origSupportReads = [(x[1], x[2]) for x in spanReads] + supportReads
     logging.debug("Alt reads: %d total, %d extra support" % (totCnt, len(origSupportReads)))
     
     mySpots = []
     refReadId = 0
     haveVar = False
     
     #Attempt each spanRead until we get one that passes
     #while refReadId < len(spanReads) and not haveVar and refReadId < MAXATTEMPTS:
     #refread = spanReads[refReadId]
     #supportReads = origSupportReads[:refReadId] + origSupportReads[refReadId+1:] 
     # NOTE(review): leftover from the disabled retry loop; refReadId is
     # not read again in this function
     refReadId += 1
         
     #read that spans most of the region goes first
     #use the rest for cleaning
 
     #building consensus sequence
     foutreads = NamedTemporaryFile(suffix=".fasta")
     # qualities go to "<fasta name>.qual" as space-separated integers
     # (character code minus 33)
     # NOTE(review): qoutreads is flushed but never closed in this function
     qoutreads = open(foutreads.name + '.qual', 'w')
     for id, i in enumerate(origSupportReads):
         foutreads.write(">%d\n%s\n" % (id, i[0]))
         qoutreads.write(">%d\n%s\n" % (id, " ".join(str(ord(j)-33) for j in i[1])))
     foutreads.flush()
     qoutreads.flush()
     
     #foutref = NamedTemporaryFile(suffix=".fasta")
     #foutref.write(">%s:%d-%d\n%s" % (spot.chrom, start, end, refread[1]))
     #foutref.flush()
     
     logging.debug("Making the contig....")
     #run it through phrap
     #make out.fasta and out.fasta.qual
     #run phrap
     #if asm -- consensus only 
     r, o, e = exe("phrap %s -minmatch 6 -minscore 20" % (foutreads.name), timeout=3)
     
     if r != 0:#failed
         logging.warning('phrap failed ' + self.name)
         logging.warning(o)
         logging.warning(e)
         return []  #here is where I'd like to add just the no-consensus spot
     
     
     # contigs are read back from "<fasta name>.contigs" and
     # "<fasta name>.contigs.qual"
     results = mergeFastaQual(foutreads.name + ".contigs", foutreads.name + ".contigs.qual")
     if len(results) == 0:
         logging.warning('no asm made ' + self.name)
         return [] #here is where I'd like to add just the no-consensus spot
     logging.info('%d contigs made %s' % (len(results), self.name))
     
     #then run it through consensus
     logging.debug("Polishing contigs")
     
     alignOut = NamedTemporaryFile(suffix=".m5")
     blasr(foutreads.name, foutreads.name + ".contigs", format="-m 5", nproc=1, outname=alignOut.name)
     # elif no asm and consensus only (faster)
     
     # NOTE(review): con and conName are only assigned inside these two
     # branches; any other args.polish value would hit a NameError below
     if args.polish == "pbbanana":
         aligns = M5File(alignOut.name)
         con = ">con\n%s\n" % consensus(aligns).sequence
         conName = "pbbanana"
     elif args.polish == "pbdagcon":
         logging.debug("pbdagcon is running")
         #using minErrReads - 1 because one of them is already being used as seed!
         r, con, e = exe("pbdagcon -c %d -t 0 %s" % (max(0, args.minErrReads - 1), alignOut.name), timeout=1)
         #r, con, e = exe("pbdagcon %s" % (alignOut.name), timeout=2)
         logging.debug("back from pbdagcon")
         logging.debug((r,e))
         #raw_input("press ent")
         if con is not None:
             # drop everything up to and including the first newline
             con = con[con.index("\n")+1:]
         else:
             con = ""
         conName = "pbdagcon"
     alignOut.close()
     #foutref.close()
     foutreads.close()
     #we don't have a consensus - fall back to the first contig's sequence
     if len(con) == 0:
         logging.debug("Trying another seed read for consensus")
         con = results.values()[0].seq
     # NOTE(review): this indexes the second "\n"-separated piece of con;
     # confirm con always contains a newline at this point
     logging.debug("%s %d bp seq" % (conName, len(con.split('\n')[1])))
     
     #try improving consensus
     # NOTE(review): conOut and varSam (below) are never explicitly closed
     conOut = NamedTemporaryFile(suffix=".fasta")
     conOut.write(con)
     #conOut.close()
     conOut.flush()
     
     refOut = NamedTemporaryFile(suffix=".fasta")
     #j = reference.fetch(chrom, max(0, start-buffer), end+buffer)
     #fout = open("f****e.ref.fasta",'w')
     #fout.write(j)
     #fout.close()
     # reference window: the spot padded by buffer on both sides
     refOut.write(">%s:%d-%d\n%s\n" % (chrom, start, end, \
                 reference.fetch(chrom, max(0, start-buffer), end+buffer)))
     refOut.flush()
     
     #map consensus to refregion
     varSam = NamedTemporaryFile(suffix=".sam")
     blasr(conOut.name, refOut.name, format="-sam", outname=varSam.name)
         #consensus=False) -- would this help?
         #or what if I fed it through leftalign?
     
     sam = pysam.Samfile(varSam.name)
     
     # NOTE(review): matches and bases are only used by commented-out code
     matches = 0.0
     bases = 0.0
     nReads = 0
     mySpots = []
     for read in sam:
         nReads += 1
         spot.tags["consensusCreated"] = True
         for svstart, svsize, svtype, altseq in expandCigar(read, args.minIndelSize, CONFIRMCOLLAPSE, True):
             newspot = copy.deepcopy(spot)
             
             # svstart is shifted by (start - buffer), the nominal start of
             # the reference window written above
             if spot.svtype == svtype and svtype == "INS":
                 haveVar = True
                 newspot.start = svstart + start - buffer
                 newspot.end = svstart + start - buffer
                 newspot.tags["seq"] = altseq
                 newspot.size = svsize
                 gt, gq = genotype(newspot)
                 newspot.tags["GT"] = gt
                 newspot.tags["GQ"] = gq
                 mySpots.append(newspot)
             
             elif spot.svtype == svtype and svtype == "DEL":
                 haveVar = True
                 newspot.start = svstart + start - buffer
                 newspot.end = svstart + svsize + start - buffer
                 # deletions are stored with a negative size
                 newspot.size = -svsize
                 gt, gq = genotype(newspot)
                 newspot.tags["GT"] = gt
                 newspot.tags["GQ"] = gq
                 newspot.tags["seq"] = reference.fetch(chrom, newspot.start, newspot.end)
                 mySpots.append(newspot)
     #identity = matches/bases
     #If no var, nothing is returned.
     #for newspot in mySpots:
         #newspot.tags["alnIdentityEstimate"] = identity
         #Keep reporting the actual contigs out until we 
         #find a reason to need it (and also we can get quals...)
         #vbam.reset()
         #for id, read in enumerate(vbam):
             #newspot.tags["contigSeq%d" % (id)] = read.seq 
             #newspot.tags["contigQual%d" % (id)] = read.qual 
     
     #vbam.close()
     #varBam.close()
     refOut.close()
     
     logging.debug("%d consensus reads created %d spots" % (nReads, len(mySpots)))
         
     return mySpots
Пример #17
0
    def consensusCalling(self, spot, bam, reference, args):
        """
        Build a consensus around spot from its variant-supporting reads and
        re-call the SVs of spot's type from that consensus.

        Reads overlapping the spot (padded by args.buffer and SPANBUFFER)
        that are named in spot.varReads are trimmed with self.readTrim.
        Spanning reads are ranked by how close their recorded size
        (spot.varReadsSize) is to spot.tags["szMedian"]. Up to MAXATTEMPTS
        of them are then tried in turn as the seed: the other reads are
        aligned to the seed with blasr (-m 5), a consensus is called with
        pbbanana or pbdagcon (chosen by args.consensus), and the consensus
        is mapped with blasr (SAM output) onto the reference window padded
        by 2*buffer. Every INS/DEL yielded by expandCigar whose type equals
        spot.svtype becomes a genotyped deep copy of spot.

        Returns [spot] (tagged "noSpan") when no read spans the region, []
        when no attempt produced a matching variant, otherwise the spots of
        the (minVarDiff, spots) entry that sorts first.
        """
        #
        MAXNUMREADS = 100  #I don't think we'll need more than this many reads
        MAXATTEMPTS = 5  #MAXNUMREADS/2 #I don't feel like trying 100 times
        SPANBUFFER = 100  #number of bases I want a read to span

        chrom, start, end = spot.chrom, spot.start, spot.end
        buffer = args.buffer

        supportReads = []
        spanReads = []
        #Fetch reads and trim
        totCnt = 0
        for read in bam.fetch(chrom, max(0, start - buffer - SPANBUFFER),
                              end + buffer + SPANBUFFER):
            if read.qname not in spot.varReads:
                continue
            seq, qual = self.readTrim(read, start - buffer, end + buffer)
            # a read "spans" when it starts and ends more than SPANBUFFER
            # outside the spot
            if read.pos < start - SPANBUFFER and read.aend > end + SPANBUFFER:
                # sort key: distance of this read's size from the median size
                sz = spot.varReadsSize[spot.varReads.index(read.qname)]
                spanReads.append((abs(sz - spot.tags["szMedian"]), seq, qual))
            else:
                supportReads.append((seq, qual))
            totCnt += 1

        if len(spanReads) == 0:
            logging.debug("noone spans - consensus aborted. %s" % (str(spot)))
            spot.tags["noSpan"] = True
            return [spot]

        #spanReads.sort(reverse=True)
        # ascending: reads closest to the median size come first
        spanReads.sort()
        # cap the read set at MAXNUMREADS, preferring spanning reads
        if len(spanReads) > MAXNUMREADS:
            origSupportReads = [(x[1], x[2]) for x in spanReads[:MAXNUMREADS]]
        elif len(spanReads) + len(supportReads) > MAXNUMREADS:
            origSupportReads = [(x[1], x[2]) for x in spanReads
                                ] + supportReads[:MAXNUMREADS - len(spanReads)]
        else:
            origSupportReads = [(x[1], x[2]) for x in spanReads] + supportReads
        mySpots = []
        refReadId = 0
        haveVar = False
        #Attempt each spanRead until we get one that passes
        # NOTE(review): both "haveVar = True" assignments below are commented
        # out, so this loop only stops once the attempts are exhausted
        while refReadId < len(
                spanReads) and not haveVar and refReadId < MAXATTEMPTS:
            refread = spanReads[refReadId]
            # every read except the one at the seed's index supports the seed
            supportReads = origSupportReads[:refReadId] + origSupportReads[
                refReadId + 1:]
            refReadId += 1

            #read that spans most of the region goes first
            #use the rest for cleaning

            #building consensus sequence
            foutreads = NamedTemporaryFile(suffix=".fastq")
            for id, i in enumerate(supportReads):
                foutreads.write("@%d\n%s\n+\n%s\n" % (id, i[0], i[1]))
            foutreads.flush()
            # the seed read's sequence is written as a one-record fasta
            foutref = NamedTemporaryFile(suffix=".fasta")
            foutref.write(">%s:%d-%d\n%s" %
                          (spot.chrom, start, end, refread[1]))
            foutref.flush()

            alignOut = NamedTemporaryFile(suffix=".m5")
            logging.debug("making the contig....")
            #run it through phrap

            #then run it through consensus
            blasr(foutreads.name,
                  foutref.name,
                  format="-m 5",
                  nproc=1,
                  outname=alignOut.name)
            # NOTE(review): con and conName are only assigned inside these
            # two branches; any other args.consensus value would raise a
            # NameError below (or reuse a previous attempt's values)
            if args.consensus == "pbbanana":
                aligns = M5File(alignOut.name)
                con = ">con\n%s\n" % consensus(aligns).sequence
                conName = "pbbanana"
            elif args.consensus == "pbdagcon":
                logging.debug("pbdagcon is running")
                #using minErrReads - 1 because one of them is already being used as seed!
                #I want to be sure I get something out... so just require somebody on there
                #r, con, e = exe("pbdagcon -c %d -t 0 %s" % (1, alignOut.name), timeout=1)
                #r, con, e = exe("pbdagcon -m 100 -c %d -t 0 %s" % (max(args.minErrReads - 1, 0), alignOut.name), timeout=1)
                r, con, e = exe("pbdagcon -m 100 -c %d -t 0 %s" %
                                (3, alignOut.name),
                                timeout=1)
                logging.debug("back from pbdagcon")
                logging.debug((r, e))
                #raw_input("press ent")
                if con is not None:
                    # drop everything up to and including the first newline
                    con = con[con.index("\n") + 1:]
                else:
                    con = ""
                conName = "pbdagcon"
            alignOut.close()
            foutref.close()
            foutreads.close()
            #we don't have a consensus - retry
            if len(con) == 0:
                logging.debug("Trying another seed read for consensus")
                continue
            # NOTE(review): this indexes the second "\n"-separated piece of
            # con; confirm con always contains a newline at this point
            logging.debug("%s %d bp seq" % (conName, len(con.split('\n')[1])))

            #try improving consensus
            # NOTE(review): conOut and varSam (below) are never explicitly
            # closed inside the loop
            conOut = NamedTemporaryFile(suffix=".fasta")
            conOut.write(con)
            #conOut.close()
            conOut.flush()

            refOut = NamedTemporaryFile(suffix=".fasta")
            #j = reference.fetch(chrom, max(0, start-buffer), end+buffer)
            #fout = open("f****e.ref.fasta",'w')
            #fout.write(j)
            #fout.close()
            # reference window: the spot padded by 2*buffer on both sides
            refOut.write(">%s:%d-%d\n%s\n" % (chrom, start, end, \
                        reference.fetch(chrom, max(0, start-(buffer*2)), end+(buffer*2))))
            refOut.flush()

            #map consensus to refregion
            varSam = NamedTemporaryFile(suffix=".sam")
            blasr(conOut.name, refOut.name, format="-sam", outname=varSam.name,\
                consensus=False) #-- would this help?
            #or what if I fed it through leftalign?
            #os.system("cp %s ." % (refOut.name))
            #os.system("cp %s ." % (varSam.name))
            sam = pysam.Samfile(varSam.name)

            # NOTE(review): matches and bases are only used by commented-out
            # code
            matches = 0.0
            bases = 0.0
            nReads = 0
            # smallest |szMedian - size| seen so far in this attempt; it is
            # not reset between SAM records
            minVarDiff = 10000
            for read in sam:
                localSpots = []
                nReads += 1
                spot.tags["consensusCreated"] = True
                for svstart, svsize, svtype, altseq in expandCigar(
                        read, args.minIndelSize, CONFIRMCOLLAPSE, True):
                    newspot = copy.deepcopy(spot)

                    # svstart is shifted by (start - 2*buffer), the nominal
                    # start of the reference window written above
                    # NOTE(review): the window start is clamped to 0 by max()
                    # but this offset is not - confirm near contig starts
                    if spot.svtype == svtype and svtype == "INS":
                        #haveVar = True
                        newspot.start = svstart + start - (buffer * 2)
                        newspot.end = svstart + start - (buffer * 2)
                        newspot.tags["seq"] = altseq
                        newspot.size = svsize
                        gt, gq = genotype(newspot)
                        newspot.tags["GT"] = gt
                        newspot.tags["GQ"] = gq
                        if abs(spot.tags["szMedian"] -
                               newspot.size) < minVarDiff:
                            minVarDiff = abs(spot.tags["szMedian"] -
                                             newspot.size)
                        if args.reportContig:
                            newspot.tags["contigseq"] = read.seq
                            newspot.tags["contigqual"] = read.qual
                        localSpots.append(newspot)

                    elif spot.svtype == svtype and svtype == "DEL":
                        #haveVar = True
                        newspot.start = svstart + start - (buffer * 2)
                        newspot.end = svstart + svsize + start - (buffer * 2)
                        newspot.size = svsize
                        gt, gq = genotype(newspot)
                        newspot.tags["GT"] = gt
                        newspot.tags["GQ"] = gq
                        newspot.tags["seq"] = reference.fetch(
                            chrom, newspot.start, newspot.end)
                        if abs(spot.tags["szMedian"] -
                               newspot.size) < minVarDiff:
                            minVarDiff = abs(spot.tags["szMedian"] -
                                             newspot.size)
                        if args.reportContig:
                            newspot.tags["contigseq"] = read.seq
                            newspot.tags["contigqual"] = read.qual
                        localSpots.append(newspot)
                # keep this record's spots together with the best size
                # difference seen so far
                if len(localSpots) > 0:
                    mySpots.append((minVarDiff, localSpots))

            #identity = matches/bases
            #If no var, nothing is returned.
            #for newspot in mySpots:
            #newspot.tags["alnIdentityEstimate"] = identity
            #Keep reporting the actual contigs out until we
            #find a reason to need it (and also we can get quals...)
            #vbam.reset()
            #for id, read in enumerate(vbam):
            #newspot.tags["contigSeq%d" % (id)] = read.seq
            #newspot.tags["contigQual%d" % (id)] = read.qual

            #vbam.close()
            #varBam.close()
            refOut.close()

            #logging.debug("%d consensus reads created %d spots" % (nReads, len(localSpots)))

        if len(mySpots) == 0:
            return []

        # tuples sort on minVarDiff first; return the spots of the best entry
        # NOTE(review): on equal minVarDiff the sort falls back to comparing
        # the spot lists themselves - confirm that is acceptable
        mySpots.sort()
        return mySpots[0][1]
Пример #18
0
#!/usr/bin/env python

from pbsuite.utils.FileHandlers import FastqFile, M5File
from pbsuite.utils.CommandRunner import exe

"""
This can be run inside of an assembly folder and create our filling sequence into
polish.out.fasta
"""
if __name__ == '__main__':
    # Copy every record of input.fastq whose name starts with "ref" into
    # ref.fasta. The with-block closes the file even if a write fails.
    reads = FastqFile("input.fastq")
    with open("ref.fasta", 'w') as fout:
        for entry in reads.values():
            if entry.name.startswith("ref"):
                fout.write(">%s\n%s\n" % (entry.name, entry.seq))

    # print(...) with one parenthesised argument prints the same text under
    # Python 2 and is valid syntax under Python 3.
    # Map all reads against the extracted reference sequences.
    print(exe("blasr input.fastq ref.fasta  --bestn 2 -m 5 --noSplitSubreads > out.m5"))
    # Run the two helper scripts on the mapping; paths are site-specific.
    print(exe("python /stornext/snfs5/next-gen/scratch/english/Jelly/"
              "DevJelly/branches/consensusDev/GetSubs.py out.m5 input.fastq"))
    print(exe("python /stornext/snfs5/next-gen/scratch/english/Jelly/"
              "DevJelly/branches/sv/pbjPolish.py "
              "reads.fastq seed.fasta -n 4 -l"))

Пример #19
0
#!/usr/bin/python

from pbsuite.utils.FileHandlers import FastqFile, M5File
from pbsuite.utils.CommandRunner import exe

"""
This can be run inside of an assembly folder and create our filling sequence into
polish.out.fasta
"""
if __name__ == '__main__':
    # Write the "ref"-named records of input.fastq to ref.fasta; the
    # context manager guarantees the file is closed on any exit path.
    fastq = FastqFile("input.fastq")
    with open("ref.fasta", 'w') as refFasta:
        for record in fastq.values():
            if record.name.startswith("ref"):
                refFasta.write(">%s\n%s\n" % (record.name, record.seq))

    # Each step prints whatever exe returns. The parenthesised print form
    # gives the same output on Python 2 and also parses on Python 3.
    print(exe("blasr input.fastq ref.fasta  --bestn 2 -m 5 --noSplitSubreads > out.m5"))
    print(exe("python /stornext/snfs5/next-gen/scratch/english/Jelly/"
              "DevJelly/branches/consensusDev/GetSubs.py out.m5 input.fastq"))
    print(exe("python /stornext/snfs5/next-gen/scratch/english/Jelly/"
              "DevJelly/branches/sv/pbjPolish.py "
              "reads.fastq seed.fasta -n 4 -l"))

Пример #20
0
def blasr(query, target, nproc=1, bestn=1, outName="map.m5"):
    """
    Align query to target with blasr (affine alignment, -m 5 output).

    bestn and nproc are passed through to --bestn and --nproc, and the
    result is written to outName. The values returned by exe are unpacked
    but not used; nothing is returned.
    """
    template = "blasr %s %s --bestn %d --affineAlign -m 5 --nproc %d --out %s"
    command = template % (query, target, bestn, nproc, outName)
    retCode, stdOut, stdErr = exe(command)
Пример #21
0
def blasr(query, target, nproc=1, bestn=1, outName="map.m5"):
    """
    Run blasr on query versus target with affine alignment and -m 5 output.

    --bestn, --nproc and --out are filled from bestn, nproc and outName.
    The three values handed back by exe are discarded and the function
    returns nothing.
    """
    arguments = (query, target, bestn, nproc, outName)
    status, out, err = exe(
        "blasr %s %s --bestn %d --affineAlign -m 5 --nproc %d --out %s" % arguments)
Пример #22
0
def bam2sam(fn, outName):
    """
    Dump a BAM file as SAM text, header included (samtools view -h),
    redirecting the output into outName. Returns whatever exe returns.
    """
    viewCmd = "samtools view -h %s > %s " % (fn, outName)
    return exe(viewCmd)
Пример #23
0
    def consensusCalling(self, spot, bam, reference, args):
        """
        Make a consensus of all the reads in the region and identify all of the SVs in the region
        """
        #
        MAXNUMREADS = 100 #I don't think we'll need more than this many reads
        MAXATTEMPTS = 5   #MAXNUMREADS/2 #I don't feel like trying 100 times
        SPANBUFFER  = 100 #number of bases I want a read to span
        
        chrom, start, end = spot.chrom, spot.start, spot.end
        buffer = args.buffer
        
        supportReads = []
        spanReads = []
        #Fetch reads and trim
        totCnt = 0
        for read in bam.fetch(chrom, max(0, start-buffer-SPANBUFFER), end+buffer+SPANBUFFER):
            if read.qname not in spot.varReads:
                continue
            seq, qual = self.readTrim(read, start-buffer, end+buffer)
            if read.pos < start-SPANBUFFER and read.aend > end+SPANBUFFER:
                sz = spot.varReadsSize[spot.varReads.index(read.qname)]
                spanReads.append((abs(sz - spot.tags["szMedian"]), seq, qual))
            else:
                supportReads.append((seq, qual))
            totCnt += 1
            
        if len(spanReads) == 0:
            logging.debug("noone spans - consensus aborted. %s" % (str(spot)))
            spot.tags["noSpan"] = True
            return [spot]
            
        #spanReads.sort(reverse=True)
        spanReads.sort()
        if len(spanReads) > MAXNUMREADS:
            origSupportReads = [(x[1], x[2]) for x in spanReads[:MAXNUMREADS]]
        elif len(spanReads) + len(supportReads) > MAXNUMREADS:
            origSupportReads = [(x[1], x[2]) for x in spanReads] + supportReads[:MAXNUMREADS-len(spanReads)]
        else:
            origSupportReads = [(x[1], x[2]) for x in spanReads] + supportReads
        mySpots = []
        refReadId = 0
        haveVar = False
        #Attempt each spanRead until we get one that passes
        while refReadId < len(spanReads) and not haveVar and refReadId < MAXATTEMPTS:
            refread = spanReads[refReadId]
            supportReads = origSupportReads[:refReadId] + origSupportReads[refReadId+1:] 
            refReadId += 1
            
            #read that spans most of the region goes first
            #use the rest for cleaning
            
            #building consensus sequence
            foutreads = NamedTemporaryFile(suffix=".fastq")
            for id, i in enumerate(supportReads):
                foutreads.write("@%d\n%s\n+\n%s\n" % (id, i[0], i[1]))
            foutreads.flush()
            foutref = NamedTemporaryFile(suffix=".fasta")
            foutref.write(">%s:%d-%d\n%s" % (spot.chrom, start, end, refread[1]))
            foutref.flush()
            
            alignOut = NamedTemporaryFile(suffix=".m5")
            logging.debug("making the contig....")
            #run it through phrap

            #then run it through consensus
            blasr(foutreads.name, foutref.name, format="-m 5", nproc=1, outname=alignOut.name)
            if args.consensus == "pbbanana":
                aligns = M5File(alignOut.name)
                con = ">con\n%s\n" % consensus(aligns).sequence
                conName = "pbbanana"
            elif args.consensus == "pbdagcon":
                logging.debug("pbdagcon is running")
                #using minerreads - 1 because one f them is already being used as seed!
                #I want to be sure I get something out... so just require somebody on there
                #r, con, e = exe("pbdagcon -c %d -t 0 %s" % (1, alignOut.name), timeout=1)
                #r, con, e = exe("pbdagcon -m 100 -c %d -t 0 %s" % (max(args.minErrReads - 1, 0), alignOut.name), timeout=1)
                r, con, e = exe("pbdagcon -m 100 -c %d -t 0 %s" % (3, alignOut.name), timeout=1)
                logging.debug("back from pbdagcon")
                logging.debug((r,e))
                #raw_input("press ent")
                if con is not None:
                    con = con[con.index("\n")+1:]
                else:
                    con = ""
                conName = "pbdagcon"
            alignOut.close()
            foutref.close()
            foutreads.close()
            #we don't have a consensus - retry
            if len(con) == 0:
                logging.debug("Trying another seed read for consensus")
                continue
            logging.debug("%s %d bp seq" % (conName, len(con.split('\n')[1])))
            
            #try improving consensus
            conOut = NamedTemporaryFile(suffix=".fasta")
            conOut.write(con)
            #conOut.close()
            conOut.flush()
            
            refOut = NamedTemporaryFile(suffix=".fasta")
            #j = reference.fetch(chrom, max(0, start-buffer), end+buffer)
            #fout = open("f****e.ref.fasta",'w')
            #fout.write(j)
            #fout.close()
            refOut.write(">%s:%d-%d\n%s\n" % (chrom, start, end, \
                        reference.fetch(chrom, max(0, start-(buffer*2)), end+(buffer*2))))
            refOut.flush()
            
            #map consensus to refregion
            varSam = NamedTemporaryFile(suffix=".sam")
            blasr(conOut.name, refOut.name, format="-sam", outname=varSam.name,\
                consensus=False) #-- would this help?
                #or what if I fed it through leftalign?
            #os.system("cp %s ." % (refOut.name))
            #os.system("cp %s ." % (varSam.name))
            sam = pysam.Samfile(varSam.name)
            
            matches = 0.0
            bases = 0.0
            nReads = 0
            minVarDiff = 10000
            for read in sam:
                localSpots = []
                nReads += 1
                spot.tags["consensusCreated"] = True
                for svstart, svsize, svtype, altseq in expandCigar(read, args.minIndelSize, CONFIRMCOLLAPSE, True):
                    newspot = copy.deepcopy(spot)
                    
                    if spot.svtype == svtype and svtype == "INS":
                        #haveVar = True
                        newspot.start = svstart + start - (buffer*2)
                        newspot.end = svstart + start - (buffer*2)
                        newspot.tags["seq"] = altseq
                        newspot.size = svsize
                        gt, gq = genotype(newspot)
                        newspot.tags["GT"] = gt
                        newspot.tags["GQ"] = gq
                        if abs(spot.tags["szMedian"] - newspot.size) < minVarDiff:
                            minVarDiff = abs(spot.tags["szMedian"] - newspot.size)
                        if args.reportContig:
                            newspot.tags["contigseq"] = read.seq
                            newspot.tags["contigqual"] = read.qual
                        localSpots.append(newspot)
                    
                    elif spot.svtype == svtype and svtype == "DEL":
                        #haveVar = True
                        newspot.start = svstart + start - (buffer*2)
                        newspot.end = svstart + svsize + start - (buffer*2)
                        newspot.size = svsize
                        gt, gq = genotype(newspot)
                        newspot.tags["GT"] = gt
                        newspot.tags["GQ"] = gq
                        newspot.tags["seq"] = reference.fetch(chrom, newspot.start, newspot.end)
                        if abs(spot.tags["szMedian"] - newspot.size) < minVarDiff:
                            minVarDiff = abs(spot.tags["szMedian"] - newspot.size)
                        if args.reportContig:
                            newspot.tags["contigseq"] = read.seq
                            newspot.tags["contigqual"] = read.qual
                        localSpots.append(newspot)
                if len(localSpots) > 0:
                    mySpots.append((minVarDiff, localSpots))
            
            #identity = matches/bases
            #If no var, nothing is returned.
            #for newspot in mySpots:
                #newspot.tags["alnIdentityEstimate"] = identity
                #Keep reporting the actual contigs out until we 
                #find a reason to need it (and also we can get quals...)
                #vbam.reset()
                #for id, read in enumerate(vbam):
                    #newspot.tags["contigSeq%d" % (id)] = read.seq 
                    #newspot.tags["contigQual%d" % (id)] = read.qual 
            
            #vbam.close()
            #varBam.close()
            refOut.close()
            
            #logging.debug("%d consensus reads created %d spots" % (nReads, len(localSpots)))

        if len(mySpots) == 0:
            return []
        
        mySpots.sort()
        return mySpots[0][1]
Пример #24
0
 def assemble(inputFq, workDir):
     """
     Build the OLCAssembly.py command line for inputFq / workDir and run it.

     Returns whatever exe() returns for that command.
     """
     cmd = "OLCAssembly.py %s --nproc 4 --fqOut --workDir %s" % (inputFq, workDir)
     return exe(cmd)
Пример #25
0
def pileup(bam):
    """
    Run samtools mpileup on bam against the module-level reference,
    redirecting the output to "<bam>.plup".

    Returns whatever exe() returns for that command.
    """
    cmd = "samtools mpileup -f {0} {1} > {1}.plup".format(reference, bam)
    return exe(cmd)
Пример #26
0
def grabReads( inputBam, entry, outFn ):
    """
    Dump the reads (with header) that samtools view reports for
    entry.region in inputBam into outFn.

    Returns whatever exe() returns for that command.
    """
    cmd = "samtools view -h %s %s > %s" % (inputBam, entry.region, outFn)
    return exe(cmd)
Пример #27
0
def bam2sam( fn, outName):
    """
    Write fn out (header included) as SAM text into outName via samtools view.

    Returns whatever exe() returns for that command.
    """
    cmd = "samtools view -h %s > %s " % (fn, outName)
    return exe(cmd)
Пример #28
0
    def __assemble(self):
        """
        Assemble this group's reads with SPAdes.

        Writes self.leftReads, self.rightReads and self.pbReads to temporary
        FASTQ files in self.tmpDir, runs spades.py on them, turns the
        resulting contigs into FASTQ entries with placeholder '?' qualities,
        saves those to a new FASTQ in self.tmpDir and hands every
        intermediate file to cleanupTmp().

        Returns the path of the saved FASTQ (also stored in self.results),
        or a "Failure - Assembly Timeout ..." string when exe() returns 214.
        """
        self.myTmpFiles = []

        def writeReads(prefix, reads):
            #Dump (name, seq, qual) records into a kept temp FASTQ and
            #register it for cleanup; the handle is closed even if a write fails
            with tempfile.NamedTemporaryFile(prefix=prefix,
                                             suffix=".fastq",
                                             delete=False,
                                             dir=self.tmpDir,
                                             mode="w") as handle:
                self.myTmpFiles.append(handle.name)
                for name, seq, qual in reads:
                    handle.write("@%s\n%s\n+\n%s\n" % (name, seq, qual))
            return handle.name

        #Temporary Files
        pe1Name = writeReads("spades_pe1", self.leftReads)
        pe2Name = writeReads("spades_pe2", self.rightReads)
        pbName = writeReads("spades_pb", self.pbReads)

        resultOut = tempfile.mkdtemp(prefix="spades", dir=self.tmpDir)

        r, o, e = exe("spades.py -1 {pe1} -2 {pe2} --pacbio {pacbio} -o {output} "\
                      .format(pe1=pe1Name, pe2=pe2Name, pacbio=pbName, output=resultOut), \
                      timeout=self.timeout)

        logging.debug("RET - %d\nOUT - %s\nERR- %s" % (r, o, e))
        #the whole output dir gets removed during cleanup
        self.myTmpFiles.append(resultOut)
        if r == 214:
            super(SpadesAssembler, self).cleanupTmp()
            return "Failure - Assembly Timeout " + self.data.name

        #dipspades.py writes dipspades/consensus_contigs.fasta, while plain
        #spades.py (what is actually run above) writes contigs.fasta at the
        #top of the output dir -- accept either layout
        outFsta = os.path.join(resultOut, "dipspades",
                               "consensus_contigs.fasta")
        if not os.path.exists(outFsta):
            outFsta = os.path.join(resultOut, "contigs.fasta")
        fasta = FastaFile(outFsta)

        #contigs come without qualities, so fill in '?' for every base
        results = {}
        for key in fasta:
            results[key] = FastqEntry(key, fasta[key], '?' * len(fasta[key]))

        #save to file
        with tempfile.NamedTemporaryFile(prefix = "asm" + self.data.name, mode="w", \
                                    suffix=".fastq", delete=False, dir=self.tmpDir) as fout:
            for key in results:
                fout.write("@group" + self.data.name + "_" + key + "\n" + \
                            results[key].seq + '\n+\n' + \
                            results[key].qual + '\n')
        self.results = fout.name

        #clean up
        super(SpadesAssembler, self).cleanupTmp()

        return self.results
Пример #29
0
 def assemble(inputFq, workDir):
     """
     Build the OLCAssembly.py command line for inputFq / workDir and run it.

     Returns whatever exe() returns for that command.
     """
     cmd = "OLCAssembly.py %s --nproc 4 --fqOut --workDir %s" % (inputFq, workDir)
     return exe(cmd)
Пример #30
0
    def run(self):
        """
        Rewrite the scaffold FASTA (and the optional qual file) with indexed
        sequence names and write a table of the gaps found in each scaffold.

        Every sequence is renamed to "<name>|refNNNNNNN" (spaces in the name
        become underscores).  Runs of N/n between self.opts.minGap and
        self.opts.maxGap long that are flanked by non-N characters are
        recorded as gaps, and gaps fewer than 25 bases apart are merged.
        The gap table is only written when self.opts.gapOutput is set.
        The original inputs are kept with an ".original" suffix.  When
        self.opts.index is set, sawriter is run on the rewritten FASTA and
        the process exits with status 1 if it returns non-zero.
        """
        #Fasta Ref Output
        scaffTempName = self.scaffInput + ".tempFasta"
        scaffOutput = open(scaffTempName, 'w')

        #Qual Ref Output
        if self.qualInput is not None:
            qualTempName = self.qualInput + ".tempQual"
            qualOutput = open(qualTempName, 'w')

        #Gaps Output
        if self.opts.gapOutput is not None:
            gapTableOut = open(self.opts.gapOutput, 'w')
        else:
            gapTableOut = False

        logging.info(
            "Creating reference sequence index names and identifying gaps")

        refTemplate = "ref%07d"
        refId = 1

        #Read References
        reference = FastaFile(self.scaffInput)
        if self.qualInput is not None:
            qualReference = QualFile(self.qualInput)

        for key in reference:
            sequence = reference[key]

            scaffIndex = refTemplate % refId
            refId += 1

            scaffName = key.replace(' ', '_') + "|" + scaffIndex
            scaffOutput.write(">" + scaffName + "\n" + wrap(sequence) + "\n")

            if self.qualInput is not None:
                qualOutput.write(">" + scaffName + "\n" +
                                 qwrap(qualReference[key]) + "\n")

            #The pattern includes one flanking non-N base on each side,
            #so trim those off the match span
            gapCoords = []
            for gap in re.finditer("[^Nn]([Nn]{%d,%s})[^Nn]" % \
                    (self.opts.minGap, self.opts.maxGap), sequence):
                gapCoords.append([gap.start() + 1, gap.end() - 1])

            if len(gapCoords) == 0:  #no Gaps
                #The gap table is optional -- gapTableOut is False when no
                #gapOutput was requested, so it must be checked before writing
                if gapTableOut:
                    gapTableOut.write("\t".join(
                        [scaffName, 'na', 'na', scaffIndex + "_0_0", '3']) + '\n')
                logging.debug("Scaffold %s is empty" % scaffName)
                continue

            #Consolidate gaps that are too close -- indicating LQ regions.
            i = 0
            while i < len(gapCoords) - 1:
                if gapCoords[i + 1][0] - gapCoords[i][1] < 25:
                    gapCoords[i + 1][0] = gapCoords[i][0]
                    del (gapCoords[i])
                else:
                    i += 1

            if gapTableOut:
                #First gap is flagged BEGIN, last gap END (a lone gap gets
                #both), everything in between gets 0
                lastIdx = len(gapCoords) - 1
                for idx, (gapStart, gapEnd) in enumerate(gapCoords):
                    flag = Gap.BEGIN if idx == 0 else 0
                    if idx == lastIdx:
                        flag += Gap.END
                    gapTableOut.write("%s\t%i\t%i\t%s_%i_%i\t%d\n" \
                        % (scaffName, gapStart, gapEnd, scaffIndex, idx, idx+1, flag))

        #Close shop
        scaffOutput.close()
        os.rename(self.scaffInput, self.scaffInput + ".original")
        os.rename(scaffTempName, self.scaffInput)

        if self.qualInput is not None:
            qualOutput.close()
            os.rename(self.qualInput, self.qualInput + ".original")
            os.rename(qualTempName, self.qualInput)

        if gapTableOut:
            gapTableOut.close()

        if self.opts.index:
            logging.info("Creating .sa indexes for references")
            r, o, e = exe("sawriter %s.sa %s" %
                          (self.scaffInput, self.scaffInput))
            if r != 0:
                logging.error("sawriter returned %d" % r)
                logging.error("Ensure it's in your path")
                exit(1)
            logging.debug(str(o) + ' ' + str(e))

        logging.info("Finished!")
Пример #31
0
def grabReads(inputBam, entry, outFn):
    """
    Dump the reads (with header) that samtools view reports for
    entry.region in inputBam into outFn.

    Returns whatever exe() returns for that command.
    """
    cmd = "samtools view -h %s %s > %s" % (inputBam, entry.region, outFn)
    return exe(cmd)
Пример #32
0
    def run(self):
        """
        Rewrite the scaffold FASTA (and the optional qual file) with indexed
        sequence names and write a table of the gaps found in each scaffold.

        Every sequence is renamed to "<name>|refNNNNNNN" (spaces in the name
        become underscores).  Runs of N/n between self.opts.minGap and
        self.opts.maxGap long that are flanked by non-N characters are
        recorded as gaps, and gaps fewer than 25 bases apart are merged.
        The gap table is only written when self.opts.gapOutput is set.
        The original inputs are kept with an ".original" suffix.  When
        self.opts.index is set, sawriter is run on the rewritten FASTA and
        the process exits with status 1 if it returns non-zero.
        """
        #Fasta Ref Output
        scaffTempName = self.scaffInput+".tempFasta"
        scaffOutput = open(scaffTempName, 'w')

        #Qual Ref Output
        if self.qualInput is not None:
            qualTempName= self.qualInput+".tempQual"
            qualOutput = open(qualTempName, 'w')

        #Gaps Output
        if self.opts.gapOutput is not None:
            gapTableOut = open(self.opts.gapOutput,'w')
        else:
            gapTableOut = False

        logging.info("Creating reference sequence index names and identifying gaps")

        refTemplate = "ref%07d"
        refId = 1

        #Read References
        reference = FastaFile(self.scaffInput)
        if self.qualInput is not None:
            qualReference = QualFile(self.qualInput)

        for key in reference:
            sequence = reference[key]

            scaffIndex = refTemplate % refId
            refId += 1

            scaffName = key.replace(' ','_') + "|" + scaffIndex
            scaffOutput.write(">"+scaffName+"\n"+wrap(sequence)+"\n")

            if self.qualInput is not None:
                qualOutput.write(">"+scaffName+"\n"+qwrap(qualReference[key])+"\n")

            #The pattern includes one flanking non-N base on each side,
            #so trim those off the match span
            gapCoords = []
            for gap in re.finditer("[^Nn]([Nn]{%d,%s})[^Nn]" % \
                    (self.opts.minGap, self.opts.maxGap), sequence):
                gapCoords.append([gap.start() + 1, gap.end() - 1])

            if len(gapCoords) == 0:#no Gaps
                #The gap table is optional -- gapTableOut is False when no
                #gapOutput was requested, so it must be checked before writing
                if gapTableOut:
                    gapTableOut.write("\t".join([scaffName, 'na', 'na', scaffIndex+"_0_0", '3'])+'\n')
                logging.debug("Scaffold %s is empty" % scaffName)
                continue

            #Consolidate gaps that are too close -- indicating LQ regions.
            i = 0
            while i < len(gapCoords)-1:
                if gapCoords[i+1][0] - gapCoords[i][1] < 25:
                    gapCoords[i+1][0] = gapCoords[i][0]
                    del(gapCoords[i])
                else:
                    i += 1

            if gapTableOut:
                #First gap is flagged BEGIN, last gap END (a lone gap gets
                #both), everything in between gets 0
                lastIdx = len(gapCoords)-1
                for idx, (gapStart, gapEnd) in enumerate(gapCoords):
                    flag = Gap.BEGIN if idx == 0 else 0
                    if idx == lastIdx:
                        flag += Gap.END
                    gapTableOut.write("%s\t%i\t%i\t%s_%i_%i\t%d\n" \
                        % (scaffName, gapStart, gapEnd, scaffIndex, idx, idx+1, flag))

        #Close shop
        scaffOutput.close()
        os.rename(self.scaffInput, self.scaffInput+".original")
        os.rename(scaffTempName, self.scaffInput)

        if self.qualInput is not None:
            qualOutput.close()
            os.rename(self.qualInput, self.qualInput+".original")
            os.rename(qualTempName, self.qualInput)

        if gapTableOut:
            gapTableOut.close()

        if self.opts.index:
            logging.info("Creating .sa indexes for references")
            r, o, e = exe("sawriter %s.sa %s" % (self.scaffInput, self.scaffInput))
            if r != 0:
                logging.error("sawriter returned %d" % r)
                logging.error("Ensure it's in your path")
                exit(1)
            logging.debug(str(o) + ' ' + str(e))

        logging.info("Finished!")
Пример #33
0
def pileup( bam ):
    """
    Run samtools mpileup on bam against the module-level reference,
    redirecting the output to "<bam>.plup".

    Returns whatever exe() returns for that command.
    """
    cmd = "samtools mpileup -f {0} {1} > {1}.plup".format(reference, bam)
    return exe(cmd)
Пример #34
0
    def consensusCalling(self, spot, bam, reference, args):
        """
        Make a consensus of all the reads in the region and identify all of
        the SVs in the region.

        Reads named in spot.varReads are fetched from bam, trimmed to the
        buffered region and assembled with phrap.  The contigs are polished
        according to args.polish ("pbbanana" or "pbdagcon"); when polishing
        yields nothing, or args.polish is some other value, the first raw
        phrap contig is used instead.  The consensus is then mapped to the
        reference region with blasr, and every INS/DEL in that alignment
        whose type matches spot.svtype becomes a genotyped copy of spot.

        Returns [spot] (tagged "noSpan") when no read spans the region, an
        empty list when phrap fails or builds no contig, and otherwise the
        list of new spots (which may be empty).
        """
        MAXNUMREADS = 100  #I don't think we'll need more than this many reads
        SPANBUFFER = 100  #number of bases I want a read to span

        chrom, start, end = spot.chrom, spot.start, spot.end
        buffer = args.buffer

        supportReads = []
        spanReads = []
        #Fetch reads and trim
        totCnt = 0
        for read in bam.fetch(chrom, max(0, start - buffer - SPANBUFFER),
                              end + buffer + SPANBUFFER):
            if read.qname not in spot.varReads:
                continue
            seq, qual = self.readTrim(read, start - buffer, end + buffer)
            if read.pos < start - SPANBUFFER and read.aend > end + SPANBUFFER:
                spanReads.append((len(seq), seq, qual))
            else:
                supportReads.append((seq, qual))
            totCnt += 1

        if len(spanReads) == 0:
            logging.debug("noone spans - consensus aborted. %s" % (str(spot)))
            spot.tags["noSpan"] = True
            return [spot]

        #longest spanning reads first, then the partial ones, capped at MAXNUMREADS
        spanReads.sort(reverse=True)
        origSupportReads = ([(x[1], x[2]) for x in spanReads] +
                            supportReads)[:MAXNUMREADS]
        logging.debug("Alt reads: %d total, %d extra support" %
                      (totCnt, len(origSupportReads)))

        #building consensus sequence
        #text mode so str can be written (same as the assembler temp files)
        foutreads = NamedTemporaryFile(suffix=".fasta", mode="w")
        #qualities go next to the fasta as <fasta>.qual; the with-block makes
        #sure they are flushed and the handle is closed before phrap runs
        with open(foutreads.name + '.qual', 'w') as qoutreads:
            for readId, (seq, qual) in enumerate(origSupportReads):
                foutreads.write(">%d\n%s\n" % (readId, seq))
                qoutreads.write(">%d\n%s\n" %
                                (readId, " ".join(str(ord(j) - 33) for j in qual)))
        foutreads.flush()
        #NOTE(review): the .qual file and the files phrap writes next to the
        #fasta are not deleted here -- confirm whether cleanup is wanted

        logging.debug("Making the contig....")
        r, o, e = exe("phrap %s -minmatch 6 -minscore 20" % (foutreads.name),
                      timeout=3)

        if r != 0:  #failed
            logging.warning('phrap failed ' + self.name)
            logging.warning(o)
            logging.warning(e)
            foutreads.close()
            return []  #here is where I'd like to add just the no-consensus spot

        results = mergeFastaQual(foutreads.name + ".contigs",
                                 foutreads.name + ".contigs.qual")
        if len(results) == 0:
            logging.warning('no asm made ' + self.name)
            foutreads.close()
            return []  #here is where I'd like to add just the no-consensus spot
        logging.info('%d contigs made %s' % (len(results), self.name))

        #then run it through consensus
        logging.debug("Polishing contigs")

        alignOut = NamedTemporaryFile(suffix=".m5")
        blasr(foutreads.name,
              foutreads.name + ".contigs",
              format="-m 5",
              nproc=1,
              outname=alignOut.name)

        #Defaults keep con/conName bound for an unrecognised polish method
        con = ""
        conName = args.polish
        if args.polish == "pbbanana":
            aligns = M5File(alignOut.name)
            con = ">con\n%s\n" % consensus(aligns).sequence
            conName = "pbbanana"
        elif args.polish == "pbdagcon":
            logging.debug("pbdagcon is running")
            #using minerrreads - 1 because one f them is already being used as seed!
            r, con, e = exe("pbdagcon -c %d -t 0 %s" %
                            (max(0, args.minErrReads - 1), alignOut.name),
                            timeout=1)
            logging.debug("back from pbdagcon")
            logging.debug((r, e))
            if con is not None:
                #drop the first line; partition() yields "" instead of
                #raising when the output has no newline at all
                con = con.partition("\n")[2]
            else:
                con = ""
            conName = "pbdagcon"
        else:
            logging.warning("unknown polish method %s - using unpolished contig"
                            % (str(args.polish)))
        alignOut.close()
        foutreads.close()

        #we don't have a consensus - fall back on the first raw contig,
        #wrapped as a fasta record like the pbbanana branch produces
        if len(con) == 0:
            logging.debug("Trying another seed read for consensus")
            con = ">con\n%s\n" % list(results.values())[0].seq
        conLines = con.split('\n')
        logging.debug("%s %d bp seq" %
                      (conName, len(conLines[1]) if len(conLines) > 1 else 0))

        #try improving consensus
        conOut = NamedTemporaryFile(suffix=".fasta", mode="w")
        conOut.write(con)
        conOut.flush()

        refOut = NamedTemporaryFile(suffix=".fasta", mode="w")
        refOut.write(">%s:%d-%d\n%s\n" % (chrom, start, end, \
                    reference.fetch(chrom, max(0, start-buffer), end+buffer)))
        refOut.flush()

        #map consensus to refregion
        varSam = NamedTemporaryFile(suffix=".sam")
        blasr(conOut.name, refOut.name, format="--sam", outname=varSam.name)
        #consensus=False) -- would this help?
        #or what if I fed it through leftalign?

        sam = pysam.Samfile(varSam.name)

        nReads = 0
        mySpots = []
        for read in sam:
            nReads += 1
            spot.tags["consensusCreated"] = True
            for svstart, svsize, svtype, altseq in expandCigar(
                    read, args.minIndelSize, CONFIRMCOLLAPSE, True):
                #only events of the same type as the spot are reported
                if spot.svtype != svtype:
                    continue

                if svtype == "INS":
                    newspot = copy.deepcopy(spot)
                    #alignment coordinates are relative to the fetched
                    #reference window, which begins at start - buffer
                    newspot.start = svstart + start - buffer
                    newspot.end = svstart + start - buffer
                    newspot.tags["seq"] = altseq
                    newspot.size = svsize
                    gt, gq = genotype(newspot)
                    newspot.tags["GT"] = gt
                    newspot.tags["GQ"] = gq
                    mySpots.append(newspot)

                elif svtype == "DEL":
                    newspot = copy.deepcopy(spot)
                    newspot.start = svstart + start - buffer
                    newspot.end = svstart + svsize + start - buffer
                    newspot.size = -svsize
                    gt, gq = genotype(newspot)
                    newspot.tags["GT"] = gt
                    newspot.tags["GQ"] = gq
                    newspot.tags["seq"] = reference.fetch(
                        chrom, newspot.start, newspot.end)
                    mySpots.append(newspot)
        #If no var, nothing is returned.

        #release the alignment and every temp file made for the mapping step
        sam.close()
        varSam.close()
        conOut.close()
        refOut.close()

        logging.debug("%d consensus reads created %d spots" %
                      (nReads, len(mySpots)))

        return mySpots
Пример #35
0
 def __assemble(self, reads):
     """
     Assemble reads with phrap.

     Writes the (name, seq, qual) records to a temporary fasta/qual pair,
     runs phrap on them (and once more on phrap's ".problems" output when
     that file is not empty), saves every resulting contig to a fastq in
     self.tmpDir and clears the intermediate files via cleanupTmp().

     Returns the path of the saved fastq (also kept in self.results), or a
     "Failure - Assembly Timeout ..." string when exe() returns 214.
     """
     #every file phrap leaves next to its input
     phrapSuffixes = (".contigs",  ".contigs.qual",
                      ".problems", ".problems.qual",
                      ".log",      ".singlets")

     self.myTmpFiles = []
     #Temporary Files
     seqOut = tempfile.NamedTemporaryFile(suffix=".fasta", mode="w", delete=False, dir=self.tmpDir)
     fastaName = seqOut.name
     self.myTmpFiles.append(fastaName)
     qualName = fastaName + '.qual'
     qualOut = open(qualName, 'w')
     self.myTmpFiles.append(qualName)

     for readName, bases, quals in reads:
         seqOut.write(">{0}\n{1}\n".format(readName, bases))
         qualOut.write(">{0}\n{1}\n".format(readName, quals))

     seqOut.close()
     qualOut.close()

     #first pass over all of the reads
     ret, out, err = exe("phrap %s -minmatch 6 -minscore 20" % (fastaName),\
                         timeout=self.timeout)
     self.myTmpFiles.extend([fastaName + ext for ext in phrapSuffixes])
     if ret == 214:
         super(PhrapAssembler, self).cleanupTmp()
         return "Failure - Assembly Timeout " + self.data.name

     results = mergeFastaQual(fastaName + ".contigs", fastaName + ".contigs.qual")

     #Try to push the problems through, too
     problemName = fastaName + ".problems"
     if os.stat(problemName).st_size != 0:
         ret, out, err = exe("phrap %s -minmatch 6 -minscore 20" % (problemName), \
                             timeout=self.timeout)

         self.myTmpFiles.extend([problemName + ext for ext in phrapSuffixes])
         if ret == 214:
             super(PhrapAssembler, self).cleanupTmp()
             return "Failure - Assembly Timeout " + self.data.name

         problemContigs = mergeFastaQual(problemName + ".contigs", problemName + ".contigs.qual")
         results.update(problemContigs)

     #save to file
     asmOut = tempfile.NamedTemporaryFile(prefix = "asm" + self.data.name, mode="w",\
                                 suffix=".fastq", delete=False, dir=self.tmpDir)
     for key in results:
         entry = results[key]
         header = "@group" + self.data.name + "_" + key
         asmOut.write(header + "\n" + entry.seq + '\n+\n' + entry.qual + '\n')
     asmOut.close()
     self.results = asmOut.name

     #clean up
     super(PhrapAssembler, self).cleanupTmp()

     return self.results