def oldtails():
    """
    Legacy tail-remapping pass.

    Writes every primary alignment to the output map, extracts the unaligned
    5' (prolog) and 3' (epilog) portions of each read that pass the MINTAIL
    length test into a temporary fastq, maps those tails with blasr and
    appends the tail hits, restored to whole-read coordinates, to the same
    output file.

    Takes no arguments: it reads the module-level names primary, outname,
    basedir, reads, target, nproc and MINTAIL, and registers its temporary
    files in ALLTEMPFILES.  Returns None.
    """
    aligns = M5File(primary)
    #where I'm putting the good hits
    mapOut = open(outname, "w")
    try:
        #where I'm putting the tails; delete=False because blasr reads the
        #file by name after this handle has been closed
        tfq = NamedTemporaryFile(prefix="tails_", suffix=".fastq",
                                 delete=False, dir=basedir)
        ALLTEMPFILES.append(tfq.name)
        #read name -> target names already reported for that read
        whichEnd = defaultdict(list)

        #extract the tails
        ntails = 0
        try:
            for a in aligns:
                if a.qstart >= MINTAIL:
                    #prolog: the bases in front of the aligned part
                    tseq1 = reads[a.qname].subSeq(None, a.qstart)
                    tseq1.name = "%s_::_5_::_%d,%d" % (tseq1.name, a.qstart,
                                                       a.qseqlength)
                    tfq.write(str(tseq1))
                    ntails += 1

                #epilog: the qseqlength - qend bases behind the aligned part.
                #The previous test subtracted the other way round, which
                #cannot be positive while qend <= qseqlength, so no epilog
                #was ever extracted.
                if a.qseqlength - a.qend > MINTAIL:
                    tseq2 = reads[a.qname].subSeq(a.qend, None)
                    tseq2.name = "%s_::_3_::_%d,%d" % (tseq2.name, a.qend,
                                                       a.qseqlength)
                    tfq.write(str(tseq2))
                    ntails += 1

                mapOut.write(str(a) + "\n")
                #don't want redundant hits on a single flank
                whichEnd[a.qname].append(a.tname)
        finally:
            tfq.close()
        logging.info("%d unmapped tails" % (ntails))

        #map tails
        tailAlign = NamedTemporaryFile(prefix="tails_", suffix=".m5",
                                       delete=False, dir=basedir)
        tailAlign = tailAlign.name
        ALLTEMPFILES.append(tailAlign)
        blasr(tfq.name, target, nproc=nproc, bestn=1, outname=tailAlign)
        aligns2 = M5File(tailAlign)
        logging.info("%d tails mapped" % len(aligns2))

        for a in aligns2:
            #get the carryon info; split from the right so a read name that
            #itself contains "_::_" is still parsed correctly
            name, direct, se = a.qname.rsplit("_::_", 2)
            pos, length = map(int, se.split(','))
            #correct its information
            a.qname = name
            a.qseqlength = length
            #prevent redundant flank map
            if a.tname in whichEnd[a.qname]:
                logging.info("%s failed ref map" % a.tname)
                continue
            whichEnd[a.qname].append(a.tname)
            #epilogs need to be updated: their coordinates are relative to
            #the tail, which starts at pos within the full read
            if direct == '3':
                a.qstart += pos
                a.qend += pos
            mapOut.write(str(a) + "\n")
    finally:
        mapOut.close()
    return
def m5ToOvlGraph(readNames, fileName):
    """
    Build the read-overlap graph for the alignments stored in fileName.

    For every unordered pair of distinct reads only the alignment with the
    lowest score is kept.  Each kept alignment is classified with
    AlignmentConnector.extendsTarget, the result is stored on the alignment
    as .support, and an edge qname--tname carrying the alignment as its
    "data" attribute is added unless the classification is
    SUPPORTFLAGS.none.

    readNames is accepted but not used by this function.
    Returns the networkx Graph.
    """
    connector = AlignmentConnector()
    graph = nx.Graph()

    #one alignment per read pair, keyed on the sorted names
    bestByPair = {}
    for hit in M5File(fileName):
        if hit.qname == hit.tname:
            #self hits never become edges
            continue
        pairKey = ":".join(sorted((hit.qname, hit.tname)))
        incumbent = bestByPair.get(pairKey)
        if incumbent is None or hit.score < incumbent.score:
            bestByPair[pairKey] = hit

    #make edges for all overlaps
    for hit in bestByPair.values():
        flag = connector.extendsTarget(hit)
        hit.support = flag
        if flag == SUPPORTFLAGS.none:
            continue
        graph.add_edge(hit.qname, hit.tname, data=hit)

    return graph
def run(argv):
    """
    Command-line driver for tail mapping.

    Parses argv, loads the alignments (M5File when the path ends in "m5",
    M4File otherwise) and the reads (fasta or fastq), then runs
    extractTails, mapTails and uniteTails in that order, logging the counts
    each step reports.  Exits with status 1 when the reads file is neither
    fasta nor fastq and with status 0 when no tails were found.
    """
    print(argv)
    args = parseArgs(argv)

    loader = M5File if args.m4.endswith("m5") else M4File
    aligns = loader(args.m4)

    if args.reads.endswith("fasta"):
        reads = FastaFile(args.reads)
    elif args.reads.endswith("fastq"):
        #only the sequences are needed from a fastq
        fastq = FastqFile(args.reads)
        reads = dict((name, fastq[name].seq) for name in fastq)
        del fastq
    else:
        logging.error("Expected Fasta or Fastq for READS (%s)" % args.reads)
        exit(1)

    def reservePath(suffix):
        #create an empty, persistent temp file and hand back just its path
        handle = tempfile.NamedTemporaryFile(suffix=suffix, delete=False,
                                             dir=args.temp)
        handle.close()
        return handle.name

    logging.info("Extracting tails")
    tailfastq = reservePath(".fasta")
    logging.debug("Tail read tmp file %s " % (tailfastq))
    nReads, nTails, nDouble = extractTails(aligns, reads, outFq=tailfastq,
                                           minLength=args.minTail)
    logging.info("Parsed %d reads" % (nReads))
    logging.info("Found %d tails" % (nTails))
    logging.info("%d reads had double tails" % (nDouble))
    if nTails == 0:
        logging.info("No tails -- Exiting")
        exit(0)

    logging.info("Mapping Tails")
    tailmap = reservePath(".m4")
    logging.debug("Read map tmp file %s " % (tailmap))
    mapTails(tailfastq, args.ref, nproc=args.nproc, out=tailmap,
             useSa=args.noSa)

    logging.info("Consolidating alignments")
    logging.debug("Final file %s " % (args.output))
    nMapped = uniteTails(aligns, tailmap, args.output, args.inplace)
    logging.info("%d tails mapped" % (nMapped))
def singleOverlapAssembly(alldata, args):
    """
    Join the two flank extension sequences through their mutual overlap.

    stats["extendSeq1"] and stats["extendSeq2"] are written to a two-record
    fasta ("seq1" / "seq2") and aligned against each other with blasr.  The
    lowest-scoring seq1/seq2 hit accepted by
    AlignmentConnector.extendsTarget is used to stitch the two sequences
    around a consensus of the overlapping part.

    Parameters:
        alldata -- object whose .stats mapping provides "extendSeq1",
                   "extendSeq2" and "sameStrand"
        args    -- object providing tempDir and nproc

    Returns the stitched sequence, or None when no usable overlap exists.
    """
    global ALLTEMPFILES
    data = alldata.stats
    e1Seq = data["extendSeq1"]
    e2Seq = data["extendSeq2"]

    reads = NamedTemporaryFile(prefix="sol_", suffix=".fasta", delete=False,
                               dir=args.tempDir)
    ALLTEMPFILES.append(reads.name)
    reads.write(">%s\n%s\n>%s\n%s\n" % ("seq1", e1Seq, "seq2", e2Seq))
    reads.close()

    #only the path is needed: blasr writes the file and M5File reads it back
    #by name, so release our own handle straight away
    alignFn = NamedTemporaryFile(prefix="sol_", suffix=".m5", delete=False,
                                 dir=args.tempDir)
    alignFn.close()
    alignFn = alignFn.name
    ALLTEMPFILES.append(alignFn)
    blasr(reads.name, reads.name, nproc=args.nproc, outname=alignFn)
    aligns = M5File(alignFn)

    #find best hit between the two
    connector = AlignmentConnector()
    bestA = None
    for i in aligns:
        if i.qname == i.tname:
            continue
        if not connector.extendsTarget(i):
            continue
        #The first candidate has to be accepted explicitly: comparing a
        #score against None is never true on Python 2, which used to make
        #this function report "no overlap" for every input.
        if bestA is None or i.score < bestA.score:
            bestA = i

    if bestA is None:
        logging.info("no overlap between extenders")
        return None

    #any of these steps could fail --
    #Ensure the hit is valid
    #(if + + and sameStrand we are okay, if - + and not sameStrand we are okay)
    #NOTE(review): the condition below rejects the hit when sameStrand
    #equals (tstrand == '0'), which reads as the opposite of the comment
    #above -- confirm the meaning of "sameStrand" and of tstrand '0'.
    if data["sameStrand"] == (bestA.tstrand == '0'):
        logging.info("bad overlap between extenders")
        return None

    #bestA is a single alignment here, so it is used directly (it used to
    #be indexed with [0] as if it were still a list)
    con = consensus([bestA])

    #strand correction...
    #NOTE(review): the slice boundaries below are kept exactly as written;
    #they have not been exercised before because of the selection bug above.
    if bestA.qname == "seq1":
        if bestA.tstrand == '1':
            e2Seq = e2Seq[:bestA.tstart].translate(revComp)[::-1]
            seq = e1Seq[:bestA.qstart] \
                + con.sequence.translate(revComp)[::-1] + e2Seq
        else:
            seq = e1Seq[:bestA.qstart] + con.sequence + e2Seq[bestA.tend:]
    else:
        if bestA.tstrand == '1':
            e2Seq = e2Seq[:bestA.qstart].translate(revComp)[::-1]
            seq = e1Seq[:bestA.tstart] + con.sequence + e2Seq
        else:
            seq = e1Seq[:bestA.qstart] + con.sequence + e2Seq[bestA.tstart:]

    return seq
def consensusCalling(self, spot, bam, reference, args):
    """
    Make a consensus of all the reads in the region and identify all of the
    SVs in the region.

    The variant-supporting reads around the spot are trimmed, assembled with
    phrap, polished (args.polish is "pbbanana" or "pbdagcon"), and the
    polished sequence is mapped back to the reference window; every CIGAR
    event reported by expandCigar whose type matches spot.svtype becomes one
    refined copy of the spot.

    Parameters:
        spot      -- candidate call; chrom, start, end, svtype, varReads and
                     tags are read, tags is updated
        bam       -- object with fetch(chrom, start, end) yielding reads
                     that expose qname, pos and aend
        reference -- object with fetch(chrom, start, end)
        args      -- options; buffer, polish, minErrReads and minIndelSize
                     are read

    Returns:
        [spot] (tagged "noSpan") when no read spans the region, [] when
        phrap fails or builds no contig, otherwise the list of refined
        spots (possibly empty).
    """
    MAXNUMREADS = 100  #I don't think we'll need more than this many reads
    SPANBUFFER = 100  #number of bases I want a read to span

    chrom, start, end = spot.chrom, spot.start, spot.end
    pad = args.buffer

    supportReads = []
    spanReads = []
    #Fetch reads and trim
    totCnt = 0
    for read in bam.fetch(chrom, max(0, start - pad - SPANBUFFER),
                          end + pad + SPANBUFFER):
        if read.qname not in spot.varReads:
            continue
        seq, qual = self.readTrim(read, start - pad, end + pad)
        if read.pos < start - SPANBUFFER and read.aend > end + SPANBUFFER:
            spanReads.append((len(seq), seq, qual))
        else:
            supportReads.append((seq, qual))
        totCnt += 1

    if len(spanReads) == 0:
        logging.debug("noone spans - consensus aborted. %s" % (str(spot)))
        spot.tags["noSpan"] = True
        return [spot]

    #longest spanning reads first, then top up with partial reads until
    #MAXNUMREADS is reached
    spanReads.sort(reverse=True)
    spanning = [(x[1], x[2]) for x in spanReads[:MAXNUMREADS]]
    origSupportReads = spanning + supportReads[:MAXNUMREADS - len(spanning)]
    logging.debug("Alt reads: %d total, %d extra support"
                  % (totCnt, len(origSupportReads)))

    #building consensus sequence
    #phrap is only given the fasta name; the qualities are written next to
    #it as <fasta>.qual
    foutreads = NamedTemporaryFile(suffix=".fasta")
    qoutreads = open(foutreads.name + '.qual', 'w')
    try:
        for idx, (seq, qual) in enumerate(origSupportReads):
            foutreads.write(">%d\n%s\n" % (idx, seq))
            qoutreads.write(">%d\n%s\n"
                            % (idx, " ".join(str(ord(j) - 33) for j in qual)))
    finally:
        #closed (not just flushed) so the handle is not leaked
        qoutreads.close()
    foutreads.flush()
    #NOTE(review): the .qual file and whatever phrap writes beside the fasta
    #are plain files and are not removed by this method.

    logging.debug("Making the contig....")
    r, o, e = exe("phrap %s -minmatch 6 -minscore 20" % (foutreads.name),
                  timeout=3)
    if r != 0:  #failed
        logging.warning('phrap failed ' + self.name)
        logging.warning(o)
        logging.warning(e)
        return []  #here is where I'd like to add just the no-consensus spot

    results = mergeFastaQual(foutreads.name + ".contigs",
                             foutreads.name + ".contigs.qual")
    if len(results) == 0:
        logging.warning('no asm made ' + self.name)
        return []  #here is where I'd like to add just the no-consensus spot
    logging.info('%d contigs made %s' % (len(results), self.name))

    #then run it through consensus
    logging.debug("Polishing contigs")
    alignOut = NamedTemporaryFile(suffix=".m5")
    blasr(foutreads.name, foutreads.name + ".contigs", format="-m 5",
          nproc=1, outname=alignOut.name)
    if args.polish == "pbbanana":
        aligns = M5File(alignOut.name)
        con = ">con\n%s\n" % consensus(aligns).sequence
        conName = "pbbanana"
    elif args.polish == "pbdagcon":
        logging.debug("pbdagcon is running")
        #using minerrreads - 1 because one of them is already being used as seed!
        r, con, e = exe("pbdagcon -c %d -t 0 %s"
                        % (max(0, args.minErrReads - 1), alignOut.name),
                        timeout=1)
        logging.debug("back from pbdagcon")
        logging.debug((r, e))
        #drop the first output line; partition() yields "" for missing or
        #newline-free output where index() used to raise ValueError
        con = con.partition("\n")[2] if con else ""
        conName = "pbdagcon"
    alignOut.close()
    foutreads.close()

    #we don't have a consensus - fall back to the first assembled contig
    if len(con) == 0:
        logging.debug("Trying another seed read for consensus")
        #NOTE(review): the lines below expect a newline in con; confirm what
        #the contig's .seq holds.
        con = list(results.values())[0].seq
    logging.debug("%s %d bp seq" % (conName, len(con.split('\n')[1])))

    #try improving consensus
    conOut = NamedTemporaryFile(suffix=".fasta")
    conOut.write(con)
    conOut.flush()

    refOut = NamedTemporaryFile(suffix=".fasta")
    refOut.write(">%s:%d-%d\n%s\n"
                 % (chrom, start, end,
                    reference.fetch(chrom, max(0, start - pad), end + pad)))
    refOut.flush()

    #map consensus to refregion
    varSam = NamedTemporaryFile(suffix=".sam")
    blasr(conOut.name, refOut.name, format="--sam", outname=varSam.name)

    sam = pysam.Samfile(varSam.name)
    nReads = 0
    mySpots = []
    for read in sam:
        nReads += 1
        spot.tags["consensusCreated"] = True
        for svstart, svsize, svtype, altseq in expandCigar(
                read, args.minIndelSize, CONFIRMCOLLAPSE, True):
            #only events of the spot's own type are reported; copy the spot
            #only once we know the copy will be kept
            if svtype != spot.svtype or svtype not in ("INS", "DEL"):
                continue
            newspot = copy.deepcopy(spot)
            #svstart is relative to the reference window written above
            newspot.start = svstart + start - pad
            if svtype == "INS":
                newspot.end = svstart + start - pad
                newspot.tags["seq"] = altseq
                newspot.size = svsize
                gt, gq = genotype(newspot)
                newspot.tags["GT"] = gt
                newspot.tags["GQ"] = gq
            else:
                newspot.end = svstart + svsize + start - pad
                newspot.size = -svsize
                gt, gq = genotype(newspot)
                newspot.tags["GT"] = gt
                newspot.tags["GQ"] = gq
                newspot.tags["seq"] = reference.fetch(chrom, newspot.start,
                                                      newspot.end)
            mySpots.append(newspot)

    #If no var, nothing is returned.
    sam.close()
    varSam.close()
    conOut.close()
    refOut.close()
    logging.debug("%d consensus reads created %d spots"
                  % (nReads, len(mySpots)))
    return mySpots
#!/usr/bin/python
import sys
from pbsuite.utils.FileHandlers import M4File, M5File

if __name__ == '__main__':
    #Usage: <script> alignments.m4|alignments.m5 [output.bed]
    #Converts every alignment to a BED line via its toBed() method and
    #writes them to the optional second argument, or stdout when absent.
    if len(sys.argv) < 2:
        sys.stderr.write(("Error! Expected One Argument, " \
                          "an m4 or m5 alignment file\n"))
        sys.exit(1)

    fn = sys.argv[1]
    if fn.endswith('.m4'):
        alignments = M4File(fn)
    elif fn.endswith('.m5'):
        alignments = M5File(fn)
    else:
        #diagnostics go to stderr so they can never end up inside BED
        #output that is being written to stdout
        sys.stderr.write("Unrecognized File Type (expecting .m4 or .m5)\n")
        sys.exit(1)

    if len(sys.argv) == 3:
        out = open(sys.argv[2], 'w')
    else:
        out = sys.stdout
    try:
        out.write("\n".join(entry.toBed() for entry in alignments) + "\n")
    finally:
        #close only what this script opened
        if out is not sys.stdout:
            out.close()
if __name__ == '__main__':
    #Polish one target with the reads: map args.reads onto the target with
    #blasr, build a consensus of the alignments and write it as a single
    #fasta record to <outname>.fasta (alignments go to <outname>.m5).
    args = parseArgs()
    alignFile = args.outname + ".m5"
    consensusFile = args.outname + ".fasta"

    targetFile = None
    if args.target is not None:  #Name
        #extract the read I'm looking for
        fasta = FastaFile(args.reads)
        tempOut = open("temp.fasta", 'w')
        try:
            tempOut.write(">%s\n%s\n" % (args.target, fasta[args.target]))
        finally:
            #must be closed before blasr reads it; the old code only named
            #tempOut.write without calling anything, so the record could
            #still be sitting in the buffer
            tempOut.close()
        targetFile = tempOut.name
    elif args.Target is not None:  #File
        targetFile = args.Target

    if targetFile is not None:
        #NOTE(review): the keyword is spelled outName here; confirm it
        #matches the blasr wrapper imported by this script.
        blasr(args.reads, targetFile, nproc=args.nproc, outName=alignFile)
        aligns = M5File(alignFile)
        fout = open(consensusFile, 'w')
        try:
            results = consensus(aligns)
            #header and sequence; the by-name branch used to pass three
            #values to a two-field format string, which raises TypeError
            fout.write(">pbjpolish_%d_vote_%d_len\n%s\n"
                       % (results.contribBases, results.fillBases,
                          results.sequence))
        finally:
            fout.close()
def _lowestScorePerKey(aligns, keyFunc):
    """
    Return {keyFunc(a): a}, keeping for every key the alignment with the
    lowest score; alignments whose score is not below zero are never kept.
    """
    kept = {}
    for a in aligns:
        key = keyFunc(a)
        bar = kept[key].score if key in kept else 0
        if a.score < bar:
            kept[key] = a
    return kept


def preunitereads(inputFastq, args):
    """
    Merge pairs of overlapping reads and append the merged reads to
    inputFastq.

    The reads are aligned all-against-all with blasr.  Alignments covering
    at least 500 bases of both reads that AlignmentConnector.extendsTarget
    classifies as left or right are reduced to the lowest-scoring one per
    query, then per target, then per read pair.  For each survivor a joined
    sequence is appended to inputFastq as "<qname>_<tname>" with "!" for
    every quality value.

    Parameters:
        inputFastq -- path of the fastq to read and to append to
        args       -- object providing tempDir and nproc
    """
    alignFile = NamedTemporaryFile(prefix="uni_", suffix=".m5", delete=False,
                                   dir=args.tempDir)
    alignFile.close()
    alignFile = alignFile.name
    ALLTEMPFILES.append(alignFile)

    readFile = NamedTemporaryFile(prefix="uni_", suffix=".fasta",
                                  delete=False, dir=args.tempDir)
    ALLTEMPFILES.append(readFile.name)

    #parsed once and reused for the sequence look-ups further down
    reads = FastqFile(inputFastq)
    for read in reads:
        readFile.write(">%s\n%s\n" % (reads[read].name, reads[read].seq))
    readFile.close()
    readFile = readFile.name

    blasr(readFile, readFile, bestn=5, nCandidates=20, nproc=args.nproc,
          outname=alignFile)
    aligns = M5File(alignFile)

    con = AlignmentConnector()
    extenders = []
    for a in aligns:
        if a.tname == a.qname:
            continue
        #need 500 aligned bases on both reads.  The spans used to be
        #computed as start - end, which is below 500 whenever start <= end
        #and therefore threw every alignment away.
        if a.qend - a.qstart < 500 or a.tend - a.tstart < 500:
            continue
        sup = con.extendsTarget(a, minCovers=500)
        a.support = sup
        if sup in [SUPPORTFLAGS.left, SUPPORTFLAGS.right]:
            extenders.append(a)

    #best of queries, then best of targets, then best of both
    best = _lowestScorePerKey(extenders, lambda a: a.qname)
    best2 = _lowestScorePerKey(best.values(), lambda a: a.tname)
    #a tuple key cannot collide the way concatenated names can
    best3 = _lowestScorePerKey(best2.values(),
                               lambda a: tuple(sorted([a.qname, a.tname])))

    count = 0
    fout = open(inputFastq, 'a')
    try:
        for i in best3.values():
            qseq = None
            #NOTE(review): for qstrand '1' the target piece is only passed
            #through translate(revComp) and is not reversed -- confirm this
            #is the intended strand handling.
            if i.support == SUPPORTFLAGS.left:
                if i.qstrand == '0':
                    qseq = reads[i.qname].seq + reads[i.tname].seq[i.tend:]
                elif i.qstrand == '1':
                    qseq = reads[i.qname].seq \
                        + reads[i.tname].seq[i.tend:].translate(revComp)
            if i.support == SUPPORTFLAGS.right:
                if i.qstrand == '0':
                    qseq = reads[i.tname].seq[:i.tstart] + reads[i.qname].seq
                elif i.qstrand == '1':
                    qseq = reads[i.tname].seq[:i.tstart].translate(revComp) \
                        + reads[i.qname].seq
            if qseq is not None:
                count += 1
                fout.write("@%s_%s\n%s\n+\n%s\n"
                           % (i.qname, i.tname, qseq, "!" * len(qseq)))
    finally:
        fout.close()
    logging.info("Preunited %d reads" % (count))
def _consensusOnSeed(inputReads, seedFile, prefix, label, args):
    """
    Map inputReads onto seedFile with blasr and build a consensus.

    Returns (sequence, contribSeqs, contribBases, fillBases).  When nothing
    maps, the first sequence of seedFile is returned as its own consensus.
    Returns None when reads map but the consensus has no contributing or
    fill bases.
    """
    alignFile = NamedTemporaryFile(prefix=prefix, suffix=".m5", delete=False,
                                   dir=args.tempDir)
    alignFile.close()
    alignFile = alignFile.name
    ALLTEMPFILES.append(alignFile)
    blasr(inputReads, seedFile, bestn=1, nproc=args.nproc, outname=alignFile)
    aligns = M5File(alignFile)

    if len(aligns) > 0:
        con = consensus(aligns)
        if con.contribBases > 0 and con.fillBases > 0:  #must be
            return (con.sequence, con.contribSeqs, con.contribBases,
                    con.fillBases)
        return None

    logging.info("no mapping... picking %s seq" % label)
    sequence = list(FastaFile(seedFile).values())[0]
    return sequence, 1, len(sequence), len(sequence)


def _recordFlank(stats, seqKey, built):
    """Store a flank sequence under seqKey and add its counts to stats."""
    sequence, contribSeqs, contribBases, fillBases = built
    stats[seqKey] = sequence
    stats["contribSeqs"] += contribSeqs
    stats["contribBases"] += contribBases
    stats["fillBases"] += fillBases


def buildFillSeq(data, inputReads, args):
    """
    Using all of the information in the namedtuple returned from getSubSeqs,
    go through the process of building the filling sequence and load it into
    data.stats.

    A spanning consensus is tried first; when it is built, "fillSeq" and the
    contribution counters are set and the function returns.  Otherwise each
    supported flank is extended ("extendSeq1" / "extendSeq2") and, when both
    succeed, singleOverlapAssembly is asked to join them into "fillSeq".

    Parameters:
        data       -- object with .stats (mapping) plus spanSeed, flank1Seed
                      and flank2Seed
        inputReads -- reads file handed to blasr
        args       -- object providing tempDir and nproc

    Returns None; results are written into data.stats.
    """
    stats = data.stats

    #try to build span
    if SUPPORTFLAGS.span in stats["support"][0]:
        logging.debug("build span")
        built = _consensusOnSeed(inputReads, data.spanSeed, "scon_", "span",
                                 args)
        #if successful we're done
        if built is not None:
            (stats["fillSeq"], stats["contribSeqs"],
             stats["contribBases"], stats["fillBases"]) = built
            return

    #no span -- we need to do flanks
    flank1Success = False
    flank2Success = False
    logging.debug(json.dumps(stats, indent=4))
    fl1Flag = SUPPORTFLAGS.left if stats["seed1"].endswith("e5") \
        else SUPPORTFLAGS.right
    if stats["seed2"] is not None:
        fl2Flag = SUPPORTFLAGS.left if stats["seed2"].endswith("e5") \
            else SUPPORTFLAGS.right
    else:
        fl2Flag = None
    logging.debug((fl1Flag, fl2Flag))

    #Both flanks add to the same counters.  The "no mapping" fallback used
    #to overwrite them, which discarded flank 1's contribution whenever
    #flank 2 fell back to its seed; it now accumulates like the normal path.
    if fl1Flag in stats["support"][1]:
        logging.debug("build flank1 %d" % fl1Flag)
        built = _consensusOnSeed(inputReads, data.flank1Seed, "f1con_", "f1",
                                 args)
        if built is not None:
            _recordFlank(stats, "extendSeq1", built)
            flank1Success = True

    if fl2Flag in stats["support"][2]:
        logging.debug("build flank2 %d" % fl2Flag)
        #the fallback message for this flank used to say "f1"
        built = _consensusOnSeed(inputReads, data.flank2Seed, "f2con_", "f2",
                                 args)
        if built is not None:
            _recordFlank(stats, "extendSeq2", built)
            flank2Success = True

    if flank1Success and flank2Success:
        logging.debug("mid unite")
        seq = singleOverlapAssembly(data, args)
        if seq is not None:
            stats["fillSeq"] = seq
    return
import sys
from pbsuite.utils.FileHandlers import FastqFile, M5File
from pbsuite.jelly.Support import AlignmentConnector, SUPPORTFLAGS

#Need to do work here

if __name__ == '__main__':
    #Usage: <script> alignments.m5 reads.fastq
    #Reports every read whose two alignments both extend their target.
    connector = AlignmentConnector()
    aligns = connector.parseAlignments(M5File(sys.argv[1]))
    reads = FastqFile(sys.argv[2])

    spanCount = 0
    fout = open("reads.fastq", 'w')
    try:
        for readGroup in aligns:
            if readGroup[0].qname.startswith("ref"):
                continue
            if len(readGroup) != 2:
                continue
            r1, r2 = readGroup
            a = connector.extendsTarget(r1)
            b = connector.extendsTarget(r2)
            if a != SUPPORTFLAGS.none and b != SUPPORTFLAGS.none:
                spanCount += 1
                print("%s spans" % (r1.qname,))
                #the part of the read between its two aligned pieces
                rStart = min(r1.qend, r2.qend)
                rEnd = max(r1.qstart, r2.qstart)
                #NOTE(review): t is extracted but nothing is written to
                #fout yet -- the script looks unfinished at this point.
                t = reads[r1.qname].subSeq(rStart, rEnd)
    finally:
        #the handle used to be left open
        fout.close()
def consensusCalling(self, spot, bam, reference, args):
    """
    Make a consensus of all the reads in the region and identify all of the
    SVs in the region.

    Up to MAXATTEMPTS spanning reads are tried as seed, starting with the
    one whose variant size is closest to spot.tags["szMedian"].  For each
    seed the remaining reads are mapped onto it, a consensus is built
    (args.consensus is "pbbanana" or "pbdagcon") and mapped back to the
    reference window; CIGAR events reported by expandCigar whose type
    matches spot.svtype become refined copies of the spot.

    Parameters:
        spot      -- candidate call; chrom, start, end, svtype, varReads,
                     varReadsSize and tags are read, tags is updated
        bam       -- object with fetch(chrom, start, end) yielding reads
                     that expose qname, pos and aend
        reference -- object with fetch(chrom, start, end)
        args      -- options; buffer, consensus, minIndelSize and
                     reportContig are read

    Returns:
        [spot] (tagged "noSpan") when no read spans the region, [] when no
        attempt produced a matching variant, otherwise the spots of the
        consensus read whose recorded size difference from szMedian is
        smallest.
    """
    MAXNUMREADS = 100  #I don't think we'll need more than this many reads
    MAXATTEMPTS = 5  #I don't feel like trying 100 times
    SPANBUFFER = 100  #number of bases I want a read to span

    chrom, start, end = spot.chrom, spot.start, spot.end
    pad = args.buffer

    supportReads = []
    spanReads = []
    #Fetch reads and trim
    for read in bam.fetch(chrom, max(0, start - pad - SPANBUFFER),
                          end + pad + SPANBUFFER):
        if read.qname not in spot.varReads:
            continue
        seq, qual = self.readTrim(read, start - pad, end + pad)
        if read.pos < start - SPANBUFFER and read.aend > end + SPANBUFFER:
            sz = spot.varReadsSize[spot.varReads.index(read.qname)]
            spanReads.append((abs(sz - spot.tags["szMedian"]), seq, qual))
        else:
            supportReads.append((seq, qual))

    if len(spanReads) == 0:
        logging.debug("noone spans - consensus aborted. %s" % (str(spot)))
        spot.tags["noSpan"] = True
        return [spot]

    #closest to the median size first, then top up with partial reads until
    #MAXNUMREADS is reached
    spanReads.sort()
    spanning = [(x[1], x[2]) for x in spanReads[:MAXNUMREADS]]
    origSupportReads = spanning + supportReads[:MAXNUMREADS - len(spanning)]

    mySpots = []
    #Attempt each spanRead in turn as the seed
    for refReadId in range(min(len(spanReads), MAXATTEMPTS)):
        refread = spanReads[refReadId]
        #everything but the seed itself is used for cleaning
        seedSupport = origSupportReads[:refReadId] \
            + origSupportReads[refReadId + 1:]

        #building consensus sequence
        foutreads = NamedTemporaryFile(suffix=".fastq")
        for idx, (seq, qual) in enumerate(seedSupport):
            foutreads.write("@%d\n%s\n+\n%s\n" % (idx, seq, qual))
        foutreads.flush()

        foutref = NamedTemporaryFile(suffix=".fasta")
        foutref.write(">%s:%d-%d\n%s" % (spot.chrom, start, end, refread[1]))
        foutref.flush()

        alignOut = NamedTemporaryFile(suffix=".m5")
        logging.debug("making the contig....")
        blasr(foutreads.name, foutref.name, format="-m 5", nproc=1,
              outname=alignOut.name)
        if args.consensus == "pbbanana":
            aligns = M5File(alignOut.name)
            con = ">con\n%s\n" % consensus(aligns).sequence
            conName = "pbbanana"
        elif args.consensus == "pbdagcon":
            logging.debug("pbdagcon is running")
            #I want to be sure I get something out... so just require
            #somebody on there
            r, con, e = exe("pbdagcon -m 100 -c %d -t 0 %s"
                            % (3, alignOut.name), timeout=1)
            logging.debug("back from pbdagcon")
            logging.debug((r, e))
            #drop the first output line; partition() yields "" for missing
            #or newline-free output where index() used to raise ValueError
            con = con.partition("\n")[2] if con else ""
            conName = "pbdagcon"
        alignOut.close()
        foutref.close()
        foutreads.close()

        #we don't have a consensus - retry
        if len(con) == 0:
            logging.debug("Trying another seed read for consensus")
            continue
        logging.debug("%s %d bp seq" % (conName, len(con.split('\n')[1])))

        #try improving consensus
        conOut = NamedTemporaryFile(suffix=".fasta")
        conOut.write(con)
        conOut.flush()

        #the reference window is padded by twice the buffer on each side
        refOut = NamedTemporaryFile(suffix=".fasta")
        refOut.write(">%s:%d-%d\n%s\n"
                     % (chrom, start, end,
                        reference.fetch(chrom, max(0, start - (pad * 2)),
                                        end + (pad * 2))))
        refOut.flush()

        #map consensus to refregion
        varSam = NamedTemporaryFile(suffix=".sam")
        blasr(conOut.name, refOut.name, format="-sam", outname=varSam.name,
              consensus=False)

        sam = pysam.Samfile(varSam.name)
        #smallest |szMedian - size| seen so far in this attempt
        minVarDiff = 10000
        for read in sam:
            localSpots = []
            spot.tags["consensusCreated"] = True
            for svstart, svsize, svtype, altseq in expandCigar(
                    read, args.minIndelSize, CONFIRMCOLLAPSE, True):
                #only events of the spot's own type are reported; copy the
                #spot only once we know the copy will be kept
                if svtype != spot.svtype or svtype not in ("INS", "DEL"):
                    continue
                newspot = copy.deepcopy(spot)
                #svstart is relative to the reference window written above
                newspot.start = svstart + start - (pad * 2)
                if svtype == "INS":
                    newspot.end = svstart + start - (pad * 2)
                    newspot.tags["seq"] = altseq
                    newspot.size = svsize
                    gt, gq = genotype(newspot)
                    newspot.tags["GT"] = gt
                    newspot.tags["GQ"] = gq
                else:
                    newspot.end = svstart + svsize + start - (pad * 2)
                    newspot.size = svsize
                    gt, gq = genotype(newspot)
                    newspot.tags["GT"] = gt
                    newspot.tags["GQ"] = gq
                    newspot.tags["seq"] = reference.fetch(
                        chrom, newspot.start, newspot.end)
                sizeDiff = abs(spot.tags["szMedian"] - newspot.size)
                if sizeDiff < minVarDiff:
                    minVarDiff = sizeDiff
                if args.reportContig:
                    newspot.tags["contigseq"] = read.seq
                    newspot.tags["contigqual"] = read.qual
                localSpots.append(newspot)
            if len(localSpots) > 0:
                mySpots.append((minVarDiff, localSpots))

        #release this attempt's temp files before the next one starts
        sam.close()
        varSam.close()
        conOut.close()
        refOut.close()

    #If no var, nothing is returned.
    if len(mySpots) == 0:
        return []
    #Choose on the size difference alone; sorting the (diff, spots) tuples
    #fell back to comparing the spot objects themselves on ties.
    return min(mySpots, key=lambda entry: entry[0])[1]