def splitFromContigs(self, contigs, max_contig=50000, cut_size=20000):
    # type: (ContigStorage, int, int) -> None
    """Create lines from every unique contig, splitting the long ones.

    Non-canonical contigs are replaced by their reverse complement first.
    A contig no longer than ``max_contig`` becomes a single line named
    ``"L" + id``.  A longer contig yields two lines built from its first and
    last ``cut_size`` characters (suffixes ``"l"`` and ``"r"``); the two are
    tied together, with the omitted middle passed to ``tie`` as the gap
    length and gap sequence.  Each new line records an identical alignment
    from the originating contig segment in its ``initial`` storage.
    """
    for source in contigs.unique():
        if not basic.isCanonocal(source.id):
            source = source.rc
        base_name = "L" + source.id

        if len(source) <= max_contig:
            # Short enough: keep the contig as one line.
            whole_line = self.addNew(source.seq, base_name)
            whole_line.initial.add(
                AlignmentPiece.Identical(source.asSegment(),
                                         whole_line.asSegment()))
            continue

        left_line = self.addNew(source.seq[:cut_size], base_name + "l")
        right_line = self.addNew(source.seq[-cut_size:], base_name + "r")

        left_origin = source.asSegment().prefix(length=cut_size)
        left_line.initial.add(
            AlignmentPiece.Identical(left_origin, left_line.asSegment()))

        right_origin = source.asSegment().suffix(length=cut_size)
        right_line.initial.add(
            AlignmentPiece.Identical(right_origin, right_line.asSegment()))

        # The part between the two ends is remembered on the knot.
        gap_length = len(source) - 2 * cut_size
        gap_sequence = source.seq[cut_size:-cut_size]
        left_line.tie(right_line, gap_length, gap_sequence)
def splitLine(self, seg):
    # type: (Segment) -> Tuple[NewLine, NewLine]
    """Split the line containing ``seg`` into two overlapping lines.

    The left line is the prefix of the original line ending at ``seg.right``
    and the right line is the suffix starting at ``seg.left``, so both new
    lines contain ``seg``.  Initial alignments, correct segments, resolved
    segments, disjointig alignments and read alignments are transferred to
    whichever half they sufficiently intersect.  Listeners are notified, the
    original line is removed and its knots are re-attached to the new lines.

    Returns the pair ``(left_line, right_line)``.
    """
    sys.stdout.trace("Line operation Split", seg)
    line = seg.contig  # type: NewLine

    left_seg = line.asSegment().prefix(pos=seg.right)
    left_line = self.addNew(left_seg.Seq(), line.id + "l")
    right_seg = line.asSegment().suffix(pos=seg.left)
    right_line = self.addNew(right_seg.Seq(), line.id + "r")

    to_left = AlignmentPiece.Identical(left_seg, left_line.asSegment())
    to_right = AlignmentPiece.Identical(right_seg, right_line.asSegment())

    # (new line, its segment on the old line, embedding into the new line)
    halves = ((left_line, left_seg, to_left),
              (right_line, right_seg, to_right))

    for part, part_seg, embedding in halves:
        hits = line.initial.allInter(part_seg, params.min_alignment_size)
        part.initial.addAll([hit.embed(embedding) for hit in hits])

    for part, part_seg, embedding in halves:
        capped = line.correct_segments.cap(seg=part_seg, min_inter=params.k)
        part.correct_segments.addAll(capped.map(embedding))

    for part, part_seg, embedding in halves:
        capped = line.completely_resolved.cap(seg=part_seg,
                                              min_inter=params.k)
        part.completely_resolved.addAll(
            capped.map(embedding).filterBySize(min=params.k))

    for part, part_seg, embedding in halves:
        hits = line.disjointig_alignments.allInter(part_seg, params.k)
        part.disjointig_alignments.addAll(
            [hit.embed(embedding) for hit in hits])

    # Reads go to every half they overlap by more than k.
    for part, part_seg, embedding in halves:
        for read_al in line.read_alignments:
            if read_al.seg_to.interSize(part_seg) > params.k:
                part.addReadAlignment(read_al.embed(embedding))

    line.cleanReadAlignments()
    self.notifySplitLine(to_left, to_right)
    self.remove(line)

    # Re-attach knots: the right knot to the right half, the knot of the
    # reverse complement to the reverse complement of the left half.
    right_knot = line.knot
    if right_knot is not None:
        right_line.tie(right_knot.line_right, right_knot.gap,
                       right_knot.gap_seq)
    left_knot = line.rc.knot
    if left_knot is not None:
        left_line.rc.tie(left_knot.line_right, left_knot.gap,
                         left_knot.gap_seq)
    return left_line, right_line
def polishSmallSegment(self, seg, als):
    # type: (Segment, List[AlignmentPiece]) -> AlignmentPiece
    """Polish one small segment using the reads aligned to it.

    Returns an alignment from the polished sequence to ``seg``.  When no
    alignment in ``als`` fully contains ``seg`` a warning is logged and an
    identical alignment of the segment's own sequence is returned.

    Otherwise every alignment is reduced to ``seg`` and its read part is
    collected; reads reaching within 20 positions of a segment end get a
    random 200-character flank on that side so that the ends are anchored.
    The flanked segment is polished with these reads.  If that raises
    ``PolishException`` each read that covers the segment is tried as the
    polishing base instead; if all fail the unpolished segment is used.
    """
    covering = [al for al in als if al.seg_to.contains(seg)]
    if not covering:
        sys.stdout.log(common.log_params.LogPriority.warning, "Warning", seg,
                       "has no covering reads")
        return AlignmentPiece.Identical(seg.asContig().asSegment(), seg)

    start = basic.randomSequence(200)
    end = basic.randomSequence(200)

    reads = []
    for al in als:
        reduced = al.reduce(target=seg)
        pieces = []
        if reduced.seg_to.left < seg.left + 20:
            pieces.append(start)
        pieces.append(reduced.seg_from.Seq())
        if reduced.seg_to.right > seg.right - 20:
            pieces.append(end)
        reads.append(
            NamedSequence("".join(pieces), reduced.seg_from.contig.id))

    base = Contig(start + seg.Seq() + end, "base")
    polished = None
    try:
        polished = Contig(self.polish(reads, base), "polished")
    except PolishException:
        sys.stdout.log(
            common.log_params.LogPriority.warning, "Warning", seg,
            "has a sequence very different from reads. Using reads to correct."
        )
        # Fall back to polishing with a covering read as the base.
        for al, read in zip(als, reads):
            if not al.seg_to.contains(seg):
                continue
            try:
                polished = Contig(
                    self.polish(reads, Contig(read.seq, read.id)), "polished")
            except PolishException:
                continue
            break

    if polished is None:
        sys.stdout.log(
            common.log_params.LogPriority.warning, "Warning", seg,
            "could not be corrected even though some reads cover it.")
        polished = seg.asContig()

    als = list(self.aligner.overlapAlign([polished], ContigStorage([base])))
    for al in als:
        # Accept only an alignment that starts near both ends of `polished`.
        if al.seg_from.left < 10 and al.rc.seg_from.left < 10:
            unflanked = base.segment(len(start), len(base) - len(end))
            mapping = AlignmentPiece.Identical(unflanked, seg)
            return al.compose(mapping)
    assert False, "No alignment from polished to base: " + str(als)
def testManual(self):
    """Check the cigar that polyshAlignment produces for two fixed pairs."""
    # (query sequence, target sequence, expected cigar after polishing)
    cases = (
        ("ACGTTAAACGT", "ACGTTTAACGT", "4M1D2M1I4M"),
        ("ACATGATCACT", "ACGTGAAACGT", "6M1I3M1D1M"),
    )
    for query_seq, target_seq, expected_cigar in cases:
        query = Contig(query_seq, "from")
        target = Contig(target_seq, "to")
        raw = AlignmentPiece.Identical(query.asSegment(), target.asSegment())
        polished = self.scorer.polyshAlignment(
            raw, params.alignment_correction_radius)
        assert polished.cigar == expected_cigar, str(
            polished.asMatchingStrings())
def test1(self):
    """Correcting a line must update the alignment stored in the dot plot."""
    storage = NewLineStorage(DisjointigCollection(), self.aligner)
    first = storage.addNew("ACGTAAAAGGGTACGT", "c1")
    second = storage.addNew("ACGTAAGGGGGTACGT", "c2")

    raw = AlignmentPiece.Identical(first.asSegment(), second.asSegment())
    polished = self.scorer.polyshAlignment(
        raw, params.alignment_correction_radius)

    dot_plot = LineDotPlot(storage, self.aligner)
    dot_plot.addAlignment(polished)

    # Replace the first three characters of c2 with "AGG".
    patch = Contig("AGG", "tmp")
    correction = AlignmentPiece.Identical(patch.asSegment(),
                                          second.segment(0, 3))
    second.correctSequence([correction])

    stored = list(dot_plot.alignmentsToFrom[second.id][first.id])
    assert str(stored) == "[(c1[0:16-0]->c2[0:16-0]:0.81)]"
def test3(self):
    """Correcting a line must update its self-alignments in the dot plot."""
    storage = NewLineStorage(DisjointigCollection(), self.aligner)
    line = storage.addNew("ACGTACGTACGT", "c")
    dot_plot = LineDotPlot(storage, self.aligner)

    shift_by_four = AlignmentPiece.Identical(line.segment(0, 8),
                                             line.segment(4, 12))
    shift_by_eight = AlignmentPiece.Identical(line.segment(0, 4),
                                              line.segment(8, 12))
    dot_plot.addAlignment(shift_by_four)
    dot_plot.addAlignment(shift_by_eight)

    # Replace positions 3..6 of the line with "TCC".
    patch = Contig("TCC", "tmp")
    correction = AlignmentPiece.Identical(patch.asSegment(),
                                          line.segment(3, 6))
    line.correctSequence([correction])

    observed = str(list(dot_plot.auto_alignments["c"]))
    assert observed == "[(c[1:12-4]->c[5:12-0]:0.86), (c[0:4]->c[8:12-0]:1.000), (c[5:12-0]->c[1:12-4]:0.86), (c[8:12-0]->c[0:4]:1.000), (c[0:12-0]->c[0:12-0]:1.000)]"
def testManual(self):
    """Check compose and composeTargetDifference on three 12-base contigs."""
    first = Contig("ACGTACGTACGT", "c1")
    second = Contig("ACGTAGGTACGT", "c2")
    third = Contig("ACTTACGTACGT", "c3")

    first_to_second = AlignmentPiece.Identical(first.asSegment(),
                                               second.asSegment())
    second_to_third = AlignmentPiece.Identical(second.asSegment(),
                                               third.asSegment())

    composed = first_to_second.compose(second_to_third)
    assert composed.__repr__() == "(c1[0:12-0]->c3[0:12-0]:0.92)"
    assert composed.cigar == "12M"

    second_to_first = first_to_second.reverse()
    via_difference = second_to_first.composeTargetDifference(second_to_third)
    assert via_difference.__repr__() == "(c1[0:12-0]->c3[0:12-0]:0.92)"
    assert via_difference.cigar == "12M"
def merge(self, other):
    # type: (AlignmentStorage) -> AlignmentStorage
    """Return a new storage combining this storage with ``other``.

    Alignments are grouped by their (target contig, query contig) pair.
    Inside a group each alignment of this storage ("left") is merged with at
    most one alignment of ``other`` ("right"): the first right alignment
    that starts before the left one ends on the query, is accepted by
    ``canMergeTo`` and for which ``MergeOverlappingAlignments`` succeeds.
    Right alignments that were not consumed are added unchanged.
    """
    left_items = [(al, -1) for al in self]
    right_items = [(al, 1) for al in other]
    # Index access instead of a tuple-unpacking lambda parameter, which is a
    # syntax error on Python 3.
    new_items = sorted(
        left_items + right_items,
        key=lambda item: (item[0].seg_to.contig.id,
                          item[0].seg_from.contig.id,
                          item[0].seg_from.left))
    new_storage = AlignmentStorage()
    grouped = itertools.groupby(
        new_items,
        lambda item: (item[0].seg_to.contig, item[0].seg_from.contig))
    for _, group in grouped:
        al_sides = list(group)
        als_left = sorted((al for al, side in al_sides if side == -1),
                          key=lambda al: al.seg_from.right)  # type: List[AlignmentPiece]
        als_right = sorted((al for al, side in al_sides if side == 1),
                           key=lambda al: al.seg_from.left)  # type: List[AlignmentPiece]
        merged = []
        for al in als_left:
            for j, candidate in enumerate(als_right):
                if candidate is None:
                    # Already consumed by an earlier left alignment.
                    continue
                if candidate.seg_from.left >= al.seg_from.right:
                    # Right alignments are sorted by start, so no later one
                    # can overlap this left alignment either.
                    break
                if al.canMergeTo(candidate):
                    tmp = AlignmentPiece.MergeOverlappingAlignments(
                        [al, candidate])
                    if tmp is not None:
                        al = tmp
                        als_right[j] = None
                        break
            merged.append(al)
        new_storage.addAll(merged)
        new_storage.addAll([al for al in als_right if al is not None])
    return new_storage
def polishSegment(self, seg, als):
    # type: (Segment, List[AlignmentPiece]) -> AlignmentPiece
    """Polish ``seg`` window by window and glue the results together.

    The contig is covered by windows of width ``max(900, params.k)``, each
    widened by 50 positions on both sides (clipped to the contig).  Every
    alignment is assigned to the windows it intersects, each window that
    intersects ``seg`` is polished with ``polishSmallSegment``, and the
    resulting alignments are glued and reduced to ``seg``.
    """
    sys.stdout.trace("Polishing segment", seg)
    window = max(900, params.k)
    overhang = 50
    contig_len = len(seg.contig)
    # Floor division keeps the indices integral on Python 2 and Python 3.
    first = seg.left // window
    last = min(seg.right + window - 1, contig_len) // window

    windows = [
        Segment(seg.contig,
                max(0, i * window - overhang),
                min(contig_len, (i + 1) * window + overhang))
        for i in range(first, last + 1)
    ]
    als_by_window = [[] for _ in windows]
    for al in als:
        left_idx = al.seg_to.left // window
        right_idx = al.seg_to.right // window + 1
        # Candidate range is widened by one window on each side because of
        # the overhang; the intersection test below does the exact check.
        lo = max(0, left_idx - first - 1)
        hi = min(last - first + 1, right_idx - first + 2)
        for i in range(lo, hi):
            if al.seg_to.inter(windows[i]):
                als_by_window[i].append(al)

    res_als = [
        self.polishSmallSegment(window_seg, window_als)
        for window_seg, window_als in zip(windows, als_by_window)
        if window_seg.inter(seg)
    ]
    res = AlignmentPiece.GlueOverlappingAlignments(res_als)
    return res.reduce(target=seg)
def LoadLineCollection(dir, lc_file, aligner, contigs, disjointigs, reads, polisher):
    # type: (str, str, Aligner, ContigStorage, DisjointigCollection, ReadCollection, Polisher) -> NewLineStorage
    """Build the initial line storage from an init file.

    The file starts with the number of lines; each record holds a contig id
    followed by the ids of the reads to align to it.  For every record a
    line is created from the contig, the listed reads are aligned to it and
    sufficiently long alignments are attached.  Lines shorter than
    ``params.k + 200`` are extended to the right with ``polisher.polishEnd``.
    The line without 100 positions at each end is marked as correct and
    completely resolved.

    All lines are written to ``initial_lines.fasta`` inside ``dir`` and
    disjointigs are aligned before the storage is returned.
    """
    sys.stdout.info("Initializing lines from init file", lc_file)
    lines = NewLineStorage(disjointigs, aligner)
    # Keep the init file open only while it is being parsed.
    with open(lc_file, "r") as lc_handle:
        reader = TokenReader(lc_handle)
        n = reader.readInt()
        for _ in range(n):
            contig_id = reader.readToken()
            contig = contigs[contig_id]
            assert contig.id == contig_id
            line = lines.addNew(contig.seq, contig.id)
            read_ids = reader.readTokens()
            line_reads = [reads[rid] for rid in read_ids]
            for al in aligner.overlapAlign(line_reads, ContigStorage([line])):
                if len(al.seg_to) >= min(params.k, len(line) - 100):
                    # The alignment may target the line or its reverse
                    # complement, so attach it to its own target.
                    tmp_line = al.seg_to.contig  # type: NewLine
                    tmp_line.addReadAlignment(al)
            if len(line) < params.k + 200:
                new_contig, new_als = polisher.polishEnd(
                    list(line.read_alignments),
                    max_extension=params.k + 100 - len(line))
                line.extendRight(
                    new_contig.suffix(pos=len(line)).Seq(), new_als)
            line.correct_segments.add(line.asSegment().shrink(100))
            line.completely_resolved.add(line.asSegment().shrink(100))
            line.initial.add(
                AlignmentPiece.Identical(
                    line.asSegment().asContig().asSegment(),
                    line.asSegment()))
    sys.stdout.trace("Final list of lines:")
    for line in lines.unique():
        sys.stdout.trace(line, line.completely_resolved)
    # Close the FASTA explicitly so its content is flushed to disk.
    with open(os.path.join(dir, "initial_lines.fasta"), "w") as fasta_handle:
        lines.writeToFasta(fasta_handle)
    lines.alignDisjointigs()
    sys.stdout.info("Constructing line dot plot")
    return lines
def __iter__(self):
    # type: () -> Generator[AlignmentPiece]
    """Yield stored alignments, then their reversals, then the identity.

    The last item is ``AlignmentPiece.Identical`` built from the whole
    line segment.
    """
    for reversed_pass in (False, True):
        for piece in self.content:
            if reversed_pass:
                yield piece.reverse()
            else:
                yield piece
    whole_line = self.line.asSegment()
    yield AlignmentPiece.Identical(whole_line)
def alignAndFilter(self, reads, ref_storage, mode):
    # type: (Iterable[Contig], ContigStorage, str) -> Generator[AlignmentPiece]
    """Align ``reads`` to ``ref_storage`` and yield the filtered alignments.

    Records returned by ``self.align`` are collected per query: a batch is
    closed when a record with a different query name arrives, passed through
    the filter registered for ``mode`` and its survivors are yielded.
    Unmapped records are skipped.  In ``"dotplot"`` mode each alignment is
    split with ``splitRef``; in ``"local"`` and ``"ava-pb"`` modes with
    ``splitRead``; otherwise it is kept whole.
    """
    select = self.filters[mode]
    read_storage = ContigStorage(reads, False)
    batch = []
    for rec in self.align(read_storage, list(ref_storage.unique()), mode):
        if rec.is_unmapped:
            continue
        if batch and rec.query_name != batch[0].seg_from.contig.id:
            # A new query started: flush what was gathered for the last one.
            for kept in list(select(batch)):
                yield kept
            batch = []
        if batch:
            seq_from = batch[0].seg_from.contig
        else:
            seq_from = read_storage[rec.query_name]
        seq_to = ref_storage[rec.tname]
        piece = AlignmentPiece.FromSamRecord(seq_from, seq_to, rec)
        if piece is None:
            continue
        if mode == "dotplot":
            batch.extend(piece.splitRef())
        elif mode in ("local", "ava-pb"):
            batch.extend(piece.splitRead())
        else:
            batch.append(piece)
    if batch:
        for kept in list(select(batch)):
            yield kept
def printAlignments(sam_handler, reference_handler, reads_handler):
    """Print every alignment of a SAM file together with its matching strings.

    Loads the reference and the reads from FASTA handlers and converts the
    SAM records whose query and target are both known into alignments.
    The alignments are sorted by their left position on the target and, for
    each one, the following is printed: the alignment, its ``splitRead``
    pieces, a marker line and the two matching strings.  When the alignment
    splits into more than one piece the marker line shows ``[`` / ``]`` at
    the first and last query position of each piece, ``+`` inside a piece
    and ``-`` outside; otherwise it is empty.  The final line holds the
    total number of ``-`` characters in the first and in the second
    matching strings.

    Printing uses single-argument ``print(...)`` calls so that the output
    is the same on Python 2 and the code also parses on Python 3.
    """
    print("Loading reference")
    cc = ContigStorage(add_rc=False).loadFromFasta(reference_handler, False)
    print("Loading query")
    reads = ContigStorage().loadFromFasta(reads_handler, False)
    print("Loading result")
    res = []
    for rec in sam_parser.Samfile(sam_handler):
        if rec.query_name in reads.items and cc[rec.tname] is not None:
            al = AlignmentPiece.FromSamRecord(reads[rec.query_name],
                                              cc[rec.tname], rec)
            if al is None:
                print("%s %s" % (rec.query_name, rec.tname))
                continue
            if al.seg_to.contig not in cc:
                al = al.rc
            res.append(al)
    print("Printing result %d" % len(res))
    res = sorted(res, key=lambda al: al.seg_to.left)
    up = 0
    down = 0
    for al in res:
        # Computed once and reused for printing and for the marker line.
        pieces = list(al.splitRead())
        print(al)
        print(pieces)
        s1, s2 = al.asMatchingStrings()
        up += s1.count("-")
        down += s2.count("-")
        s = []
        if len(pieces) > 1:
            # Query positions at which a piece opens (even index) or
            # closes (odd index).
            nums = []
            for piece in pieces:
                nums.append(piece.seg_from.left)
                nums.append(piece.seg_from.right - 1)
            cur_num = 0
            cur = al.seg_from.left
            for c in s1:
                # The bounds check prevents an IndexError once every
                # boundary has been emitted.
                at_boundary = (cur_num < len(nums)
                               and cur == nums[cur_num]
                               and c != "-")
                if at_boundary:
                    s.append("[" if cur_num % 2 == 0 else "]")
                    cur_num += 1
                else:
                    s.append("-" if cur_num % 2 == 0 else "+")
                if c != "-":
                    cur += 1
        print("".join(s))
        print(s1)
        print(s2)
    print("%d %d" % (up, down))
def testManual(self):
    """Exercise AlignmentStorage: listing, rc, coverage and target queries."""
    query = Contig("ACGTACGTACGT", "from")
    target = Contig("ACGTACGTACGT", "to")
    head_to_head = AlignmentPiece.Identical(query.segment(0, 4),
                                            target.segment(0, 4))
    head_to_middle = AlignmentPiece.Identical(query.segment(0, 4),
                                              target.segment(4, 8))
    middle_to_tail = AlignmentPiece.Identical(query.segment(4, 8),
                                              target.segment(8, 12))
    storage = AlignmentStorage()
    storage.addAll([head_to_head, head_to_middle, middle_to_tail])

    three_pieces = "[(from[0:4]->to[0:4]:1.000), (from[0:4]->to[4:12-4]:1.000), (from[4:12-4]->to[8:12-0]:1.000)]"
    assert str(list(storage)) == three_pieces
    assert str(
        list(storage.rc)
    ) == "[(-from[4:12-4]->-to[0:4]:1.000), (-from[8:12-0]->-to[4:12-4]:1.000), (-from[8:12-0]->-to[8:12-0]:1.000)]"

    # The three pieces tile the target exactly once.
    assert str(list(storage.calculateCoverage())) == "[(to[0:12-0], 1)]"
    assert str(list(storage.filterByCoverage(0, 1))) == "[]"
    assert str(list(storage.filterByCoverage(1, 2))) == "[to[0:12-0]]"
    assert str(list(storage.filterByCoverage(2))) == "[]"

    # Adding an alignment that is already stored leaves the listing as is.
    storage.addAndMergeRight(middle_to_tail)
    assert str(list(storage)) == three_pieces

    wide_left = AlignmentPiece.Identical(query.segment(2, 8),
                                         target.segment(2, 8))
    wide_right = AlignmentPiece.Identical(query.segment(4, 10),
                                          target.segment(4, 10))
    storage.addAll([wide_left, wide_right])
    assert str(
        list(storage.calculateCoverage())
    ) == "[(to[0:2], 1), (to[2:4], 2), (to[4:12-4], 3), (to[8:12-2], 2), (to[10:12-0], 1)]"
    assert str(list(storage.filterByCoverage(2, 3))) == "[to[2:4], to[8:12-2]]"
    assert str(list(storage.filterByCoverage(2))) == "[to[2:12-2]]"

    short_probe = target.segment(2, 3)
    assert str(
        list(storage.getAlignmentsTo(short_probe))
    ) == "[(from[0:4]->to[0:4]:1.000), (from[2:12-4]->to[2:12-4]:1.000)]"
    long_probe = target.segment(2, 6)
    assert str(list(storage.getAlignmentsTo(long_probe))
               ) == "[(from[2:12-4]->to[2:12-4]:1.000)]"
def addAndMergeLeft(self, al):
    # type: (AlignmentPiece) -> None
    """Add ``al``, merging it with a stored alignment that continues it.

    A non-canonical storage delegates to ``addAndMergeRight`` of its reverse
    complement.  Otherwise the first stored alignment that intersects ``al``
    on both query and target, does not start before ``al`` on the query and
    can be merged by ``MergeOverlappingAlignments`` is replaced by the
    merged alignment.  If no such alignment exists ``al`` is simply added.
    """
    if not self.isCanonical():
        self.rc.addAndMergeRight(al.rc)
        return

    for index, stored in enumerate(self.items):  # type: int, AlignmentPiece
        overlaps = (al.seg_from.inter(stored.seg_from)
                    and al.seg_to.inter(stored.seg_to))
        if not overlaps or stored.seg_from.left < al.seg_from.left:
            continue
        combined = AlignmentPiece.MergeOverlappingAlignments([al, stored])
        if combined is not None:
            self.items[index] = combined
            return
    self.add(al)
def testManual(self):
    """Check position mapping and alignment lifting of a Correction."""
    old_contig = Contig("ACGTAAAAGGGTACGT", "c1")
    new_contig = Contig("ACGTAAGGGGGTACGT", "c2")
    raw = AlignmentPiece.Identical(old_contig.segment(5, 12),
                                   new_contig.segment(5, 12))
    polished = self.scorer.polyshAlignment(
        raw, params.alignment_correction_radius)
    corr = Correction(old_contig, new_contig, [polished])

    expected_up = [0, 1, 2, 3, 4, 5, 8, 9, 9, 9, 10, 11, 12, 13, 14, 15]
    assert corr.mapPositionsUp(range(len(new_contig))) == expected_up
    expected_down = [0, 1, 2, 3, 4, 5, 6, 6, 6, 9, 10, 11, 12, 13, 14, 15]
    assert corr.mapPositionsDown(range(len(old_contig))) == expected_down

    # Self-alignments of c2 segments that are lifted through the correction.
    probes = [
        AlignmentPiece.Identical(new_contig.segment(left, right))
        for left, right in ((0, 4), (6, 8), (6, 16), (7, 16))
    ]
    lifted = corr.composeQueryDifferences(probes)
    assert str(
        lifted
    ) == "[(c2[0:4]->c1[0:4]:1.000), (c2[6:7]->c1[8:9]:1.000), (c2[6:16-0]->c1[8:16-0]:0.80), (c2[9:16-0]->c1[9:16-0]:1.000)]"
def LoadLineCollection(dir, lc_file, aligner, contigs, disjointigs, reads, polisher):
    # type: (str, str, Aligner, ContigStorage, DisjointigCollection, ReadCollection, Polisher) -> NewLineStorage
    """Build the initial line storage from an init file.

    The file starts with the number of lines; each record holds a contig id
    followed by the ids of the reads to align to it.  When a record lists no
    reads a warning is logged and the whole read collection is aligned
    instead.  Alignments whose target part is at least
    ``min(1500, len(line) - 100)`` long are kept, and for every read only
    the best one (highest identity in whole percent, then longest) is
    attached.  Lines shorter than ``params.k + 200`` are extended to the
    right with ``polisher.polishEnd`` and their correct segment is expanded
    to ``params.k``.  The resulting segment is marked as correct and
    completely resolved.

    All lines are written to ``initial_lines.fasta`` inside ``dir`` and
    disjointigs are aligned before the storage is returned.
    """
    sys.stdout.info("Initializing lines from init file", lc_file)
    lines = NewLineStorage(disjointigs, aligner)
    # Keep the init file open only while it is being parsed.
    with open(lc_file, "r") as lc_handle:
        reader = TokenReader(lc_handle)
        n = reader.readInt()
        for _ in range(n):
            contig_id = reader.readToken()
            contig = contigs[contig_id]
            assert contig.id == contig_id
            line = lines.addNew(contig.seq, contig.id)
            read_ids = reader.readTokens()
            line_reads = [reads[rid] for rid in read_ids]
            if len(line_reads) == 0:
                sys.stdout.warn(
                    "No read alignments in initialization for line", line.id,
                    "Realigning all reads")
                line_reads = reads
            als = [
                al for al in aligner.overlapAlign(line_reads,
                                                  ContigStorage([line]))
                if len(al.seg_to) >= min(1500, len(line) - 100)
            ]
            # Sort so that the best alignment of each read comes first in
            # its group.
            als = sorted(als,
                         key=lambda al: (al.seg_from.contig.id,
                                         -int(al.percentIdentity() * 100),
                                         -len(al)))
            for _, read_als in itertools.groupby(
                    als, key=lambda al: al.seg_from.contig.id):
                best = next(iter(read_als))
                # The alignment may target the line or its reverse
                # complement, so attach it to its own target.
                tmp_line = best.seg_to.contig  # type: NewLine
                tmp_line.addReadAlignment(best)
            correct_seg = line.asSegment().shrink(100)
            if len(line) < params.k + 200:
                new_contig, new_als = polisher.polishEnd(
                    list(line.read_alignments),
                    max_extension=params.k + 100 - len(line))
                line.extendRight(
                    new_contig.suffix(pos=len(line)).Seq(), new_als)
                if len(correct_seg) < params.k:
                    correct_seg = correct_seg.expandRight(
                        params.k - len(correct_seg))
            line.correct_segments.add(correct_seg)
            line.completely_resolved.add(correct_seg)
            line.initial.add(
                AlignmentPiece.Identical(
                    line.asSegment().asContig().asSegment(),
                    line.asSegment()))
    sys.stdout.trace("Final list of lines:")
    for line in lines.unique():
        sys.stdout.trace(line, line.completely_resolved)
    # Close the FASTA explicitly so its content is flushed to disk.
    with open(os.path.join(dir, "initial_lines.fasta"), "w") as fasta_handle:
        lines.writeToFasta(fasta_handle)
    lines.alignDisjointigs()
    sys.stdout.info("Constructing line dot plot")
    return lines
def test5(self):
    """Inserting a sequence between two blocks of a line updates the dot plot."""
    dataset = TestDataset("abcABC")
    lower_name = dataset.addContig("abc")
    upper_name = dataset.addContig("ABC")
    lines, dp, reads = dataset.genAll(self.aligner)
    line = lines[lower_name]

    block_a = dataset.alphabet["a"].seq
    block_b = dataset.alphabet["b"].seq
    inserted = "ACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGA"
    patched = Contig(block_a + inserted + block_b, "tmp")

    # Anchor the patch on block "a" and block "b" of the line.
    left_anchor = AlignmentPiece.Identical(
        patched.prefix(len=len(block_a)), line.prefix(len=len(block_a)))
    right_anchor = AlignmentPiece.Identical(
        patched.asSegment().suffix(length=len(block_b)),
        line.segment(len(block_a), len(block_a) + len(block_b)))
    correction = AlignmentPiece.MergeFittingAlignments(
        [left_anchor, right_anchor])
    line.correctSequence([correction])

    observed = str(list(dp.allInter(line.asSegment())))
    assert observed == "[(C0_abc[0:1755-0]->C0_abc[0:1755-0]:1.000), (C1_ABC[0:1652-0]->C0_abc[0:1755-0]:0.94)]"
def testManual(self):
    """Check GlueOverlappingAlignments and query-side reduce on fixed data."""
    query = Contig("ACGTACGTA", "from")
    target = Contig("ACTACGTACGTACAT", "to")
    head = AlignmentPiece(query.asSegment(), target.segment(0, 8), "2M1I6M")
    tail = AlignmentPiece(query.segment(0, 8), target.segment(7, 15), "8M")

    glued = AlignmentPiece.GlueOverlappingAlignments([head, tail])
    failure_note = str(glued) + " " + glued.cigar
    assert glued.cigar == "2M1I5M8M", failure_note
    assert glued.seg_from.Seq() == "ACGTACGTACGTACGT", failure_note

    # Reducing to a query prefix keeps or drops the insertion by position.
    for right_end, expected_cigar in ((2, "2M"), (3, "2M"), (4, "2M1I1M")):
        reduced = head.reduce(query=query.segment(0, right_end))
        assert reduced.cigar == expected_cigar
def loadLine(self, handler, disjointigs, reads, contigs):
    # type: (TokenReader, DisjointigCollection, ReadCollection, ContigCollection) -> None
    """Restore this line from a token stream.

    Reads the id and the sequence, then the stored initial alignments.  Of
    each stored alignment only the target segment is used (the surrounding
    tokens are read and discarded) and an identical alignment from that
    segment's own sequence is recorded.  Afterwards correct segments,
    completely resolved segments, disjointig alignments and read alignments
    are loaded, and every read alignment is registered on its read.
    """
    self.id = handler.readToken()
    self.seq = handler.readToken()
    self.rc.id = basic.Reverse(self.id)

    initial_count = handler.readInt()
    for _ in range(initial_count):
        # Three tokens precede the target segment and one follows it.
        for _skipped in range(3):
            handler.readToken()
        target = Segment.load(handler, self)
        handler.readToken()
        self.initial.add(
            AlignmentPiece.Identical(target.asContig().asSegment(), target))

    self.correct_segments.load(handler, self)
    self.completely_resolved.load(handler, self)
    self.disjointig_alignments.load(handler, disjointigs, self)
    self.read_alignments.load(handler, reads, self)

    for read_al in self.read_alignments:
        aligned_read = read_al.seg_from.contig  # type: AlignedRead
        aligned_read.addAlignment(read_al)
    self.max_extension = False
def genAll(self, aligner):
    # type: (Aligner) -> Tuple[NewLineStorage, LineDotPlot, ReadCollection]
    """Build lines, their dot plot and the read collection of this dataset.

    Disjointigs and contigs of the dataset are copied into fresh
    collections, every new line gets an identical initial alignment from
    its own sequence, the dot plot is constructed, disjointigs are aligned
    to lines and reads are locally aligned to disjointigs.
    """
    from disjointig_resolve.line_storage import NewLineStorage

    disjointigs = DisjointigCollection()
    for source in self.disjointigs:
        disjointigs.addNew(source.seq, source.id)

    lines = NewLineStorage(disjointigs, aligner)
    lines.name_printer = lambda line: line.id + "_" + self.translateBack(
        line, aligner)

    for contig in self.contigs:
        created = lines.addNew(contig.seq, contig.id)
        own_sequence = created.asSegment().asContig().asSegment()
        created.initial.add(
            AlignmentPiece.Identical(own_sequence, created.asSegment()))

    dp = LineDotPlot(lines, aligner)
    dp.construct(aligner)
    lines.alignDisjointigs()

    reads = ReadCollection()
    for read in self.reads:
        reads.addNewRead(read)
    read_alignments = aligner.localAlign(reads, disjointigs)
    disjointigs.addAlignments(read_alignments)
    return lines, dp, reads
def polishEnd(self, als, min_cov=4, min_cov_frac=0.7, max_extension=None):
    # type: (List[AlignmentPiece], int, float, Optional[int]) -> Tuple[Contig, List[AlignmentPiece]]
    """Extend the right end of a contig using reads that overhang it.

    ``als`` are read alignments to one contig; ``als[0]`` determines which.
    Only alignments ending within 100 positions of the contig end take
    part.  In every round the reads still overhanging the current end by
    more than 100 positions are cut to their overhang and prefixed with an
    artificial start (a fixed marker, a random flank and the current contig
    tail).  One read at a time serves as the polishing base; the polished
    base is accepted up to the furthest position that at least ``min_cov``
    reads support while the number of contradicting reads stays within
    ``(1 - min_cov_frac)`` of the support.  If that adds more than 100
    positions the contig is extended and read alignments are prolonged
    onto the new part.

    The loop stops when fewer than ``min_cov`` reads remain, when no read
    allows an extension, or when the contig has grown by ``max_extension``
    (unlimited when ``None``).

    Returns the extended contig and the alignments re-targeted to it.
    """
    if max_extension is None:
        max_extension = 10000000000
    scorer = Scorer()
    contig = als[0].seg_to.contig
    max_len = max_extension + len(contig)
    sys.stdout.trace("Polishing end of", als[0].seg_to.contig)
    new_contig = contig.asSegment().asContig()
    relevant_als = [
        al.changeTargetContig(new_contig) for al in als
        if al.rc.seg_to.left < 100
    ]
    finished_als = []
    while True:
        # Keep only reads that touch the last 100 positions and still
        # overhang the contig end by more than 100 positions.
        tmp = []
        for al in relevant_als:
            if al.seg_to.inter(new_contig.asSegment().suffix(
                    length=100)) and al.rc.seg_from.left > 100:
                tmp.append(al)
            else:
                finished_als.append(al)
        relevant_als = tmp
        if len(relevant_als) < min_cov:
            break
        # Artificial start shared by the base and all reduced reads.
        start = "ACGTTCGA" + basic.randomSequence(
            params.flanking_size) + new_contig.asSegment().suffix(
                length=min(params.flanking_size, len(new_contig))).Seq()
        reduced_read_list = [
            AlignedRead.new(
                start + al.seg_from.contig.asSegment().suffix(
                    pos=al.seg_from.right).Seq(),
                str(i) + "_" + al.seg_from.contig.id)
            for i, al in enumerate(relevant_als)
        ]
        reduced_reads = ReadCollection(reduced_read_list)
        found = False
        for base_al in relevant_als:
            if base_al.rc.seg_from.left < params.flanking_size:
                continue
            # The base is the artificial start followed by the part of this
            # read after its alignment, at most
            # max(params.window_size, params.k) long.
            base_segment = base_al.seg_from.contig.segment(
                base_al.seg_from.right,
                min(
                    len(base_al.seg_from.contig), base_al.seg_from.right +
                    max(params.window_size, params.k)))
            base = Contig(start + base_segment.Seq(), "base")
            for read in reduced_read_list:
                read.clean()
            polished_base = Contig(self.polish(reduced_reads, base),
                                   "polished_base")
            for al in self.aligner.localAlign(
                    reduced_reads, ContigStorage().addAll([polished_base])):
                reduced_reads.reads[al.seg_from.contig.id].addAlignment(al)
            # For each read pick the alignment that starts at position 0 of
            # the polished base and reaches furthest to the right.
            candidate_alignments = []
            for read in reduced_read_list:
                candidate_alignments.append(None)
                for al in read.alignmentsTo(polished_base.asSegment()):
                    if al.seg_to.left == 0 and (
                            (candidate_alignments[-1] is None or
                             candidate_alignments[-1].seg_to.right <
                             al.seg_to.right)):
                        candidate_alignments[-1] = al
            trimmedAlignments = []
            for i, al in enumerate(candidate_alignments):
                assert al is not None, reduced_read_list[i]
                trimmedAlignments.append(al.trimByQuality(0.4, 100))
            # Sweep the alignment ends from left to right and find how far
            # the polished base is sufficiently supported.
            contra_index = 0
            contra = []
            support = len(trimmedAlignments)
            cutoff_pos = len(start)
            for al in sorted(trimmedAlignments,
                             key=lambda al: al.seg_to.right):
                while contra_index < len(contra) and contra[
                        contra_index].seg_to.right < al.seg_to.right - 50:
                    contra_index += 1
                if support >= min_cov and len(contra) - contra_index <= (
                        1 - min_cov_frac) * support:
                    cutoff_pos = al.seg_to.right
                    support -= 1
                    if al.contradictingRTCRight():
                        contra.append(al)
                else:
                    sys.stdout.trace("Stopped at:", support, contra_index,
                                     (1 - min_cov_frac) * support)
                    break
            sys.stdout.trace("Positions:",
                             [al.seg_to.right for al in trimmedAlignments])
            sys.stdout.trace("Contra:", contra)
            if cutoff_pos > len(start) + 100:
                sys.stdout.trace("Chose to use read", base_al.__repr__(),
                                 "Extended for", cutoff_pos - len(start),
                                 "Alignments:")
                # A list, so the reads are logged on Python 3 as well
                # (map() is lazy there).
                sys.stdout.trace([str(read) for read in reduced_read_list])
                found = True
                new_contig_candidate = Contig(
                    new_contig.seq + polished_base[len(start):cutoff_pos],
                    "candidate")
                # Maps the accepted part of the polished base onto the new
                # tail of the candidate contig.
                embedding = AlignmentPiece.Identical(
                    polished_base.segment(len(start), cutoff_pos),
                    new_contig_candidate.asSegment().suffix(
                        pos=len(new_contig)))
                # Maps the overhang of each original read onto the
                # corresponding part of its reduced read.
                read_mappings = []
                for al1, al2 in zip(candidate_alignments, relevant_als):
                    seg_from = al2.seg_from.contig.asSegment().suffix(
                        length=len(al1.seg_from.contig) - len(start))
                    seg_to = al1.seg_from.contig.asSegment().suffix(
                        length=len(al1.seg_from.contig) - len(start))
                    read_mappings.append(
                        AlignmentPiece.Identical(seg_from, seg_to))
                embedded_alignments = []
                for al1, al2 in zip(candidate_alignments, read_mappings):
                    if al1.seg_to.right <= len(start) + 10:
                        embedded_alignments.append(None)
                    else:
                        tmp = al2.compose(al1)
                        if tmp.seg_to.left > embedding.seg_from.right - 10:
                            embedded_alignments.append(None)
                        else:
                            embedded_alignments.append(
                                tmp.compose(embedding))
                corrected_relevant_alignments = [
                    al.targetAsSegment(
                        new_contig_candidate.asSegment().prefix(
                            len(new_contig))) for al in relevant_als
                ]
                # Prolong every old alignment with its part on the new tail.
                relevant_als = []
                for al1, al2 in zip(corrected_relevant_alignments,
                                    embedded_alignments):
                    if al2 is None:
                        al = al1
                    else:
                        al = al1.mergeDistant(al2)
                        if al is None:
                            al = al1
                        elif al1.seg_from.dist(
                                al2.seg_from) >= 10 or al1.seg_to.dist(
                                    al2.seg_to) >= 10:
                            al = scorer.polyshAlignment(
                                al, params.alignment_correction_radius)
                    relevant_als.append(al)
                finished_als = [
                    al.targetAsSegment(
                        new_contig_candidate.asSegment().prefix(
                            len(new_contig))) for al in finished_als
                ]
                new_contig = new_contig_candidate
                break
            else:
                sys.stdout.trace("Could not prolong with read", base_al,
                                 "Alignments:")
                sys.stdout.trace([str(read) for read in reduced_read_list])
        if len(new_contig) >= max_len:
            break
        if not found:
            break
    return new_contig, relevant_als + finished_als
def fillFromContigs(self, contigs):
    # type: (Iterable[Contig]) -> None
    """Create one line named ``"L" + id`` for each contig of ``UniqueList``.

    Each new line records an identical alignment from its source contig in
    its ``initial`` storage.
    """
    for source in UniqueList(contigs):
        created = self.addNew(source.seq, "L" + source.id)
        origin = AlignmentPiece.Identical(source.asSegment(),
                                          created.asSegment())
        created.initial.add(origin)
def mergeLines(self, alignment, k):
    # type: (AlignmentPiece, int) -> NewLine
    """Merge two lines joined by ``alignment`` into one new line.

    ``alignment`` goes from the left line (``seg_from.contig``) to the
    right line (``seg_to.contig``).  Steps:

    1. If the alignment is shorter than ``k + 100`` the left line is
       extended with sequence of the right line so that it reaches that
       length.
    2. The part of the left line after the alignment and the part of the
       right line before it are cut off; a temporary listener storage keeps
       the alignment up to date during the cuts.
    3. The left line is corrected so that its overlap is identical to the
       right line.
    4. A new line is created and receives the merged initial alignments,
       correct and resolved segments, disjointig alignments and read
       alignments of both lines.
    5. Listeners are notified, both old lines are removed and their outer
       knots are re-attached to the new line.

    Returns the new line.
    """
    sys.stdout.trace("Line operation Merge", alignment.seg_from.contig,
                     alignment.seg_to.contig, alignment)
    line1 = alignment.seg_from.contig  # type: NewLine
    line2 = alignment.seg_to.contig  # type: NewLine
    assert line1 != line2
    if len(alignment) < k + 100:
        sys.stdout.trace("Prolonging line to ensure alignment of at least k")
        seg = line2.segment(
            alignment.seg_to.right,
            alignment.seg_to.right + k + 100 - len(alignment))
        line1.extendRight(seg.Seq())
        alignment = alignment.mergeDistant(
            AlignmentPiece.Identical(
                line1.asSegment().suffix(length=len(seg)), seg))

    # Cutting hanging tips of both lines.  The storage listens to both
    # lines so that the alignment follows the cuts.
    storage = TwoLineAlignmentStorage(line1, line2)
    line2.addListener(storage)
    line1.addListener(storage.reverse)
    storage.add(alignment)
    if alignment.seg_from.right < len(line1):
        line1.cutRight(alignment.seg_from.right)
        current = list(storage.content)[0]
        sys.stdout.trace("Cut right")
        sys.stdout.trace(current)
        sys.stdout.trace("\n".join(current.asMatchingStrings()))
        sys.stdout.trace(current.cigar)
    if alignment.seg_to.left > 0:
        line2.rc.cutRight(len(line2) - alignment.seg_to.left)
        current = list(storage.content)[0]
        sys.stdout.trace("Cut left")
        sys.stdout.trace(current)
        sys.stdout.trace("\n".join(current.asMatchingStrings()))
        sys.stdout.trace(current.cigar)
    alignment = list(storage.content)[0]  # type: AlignmentPiece
    line2.removeListener(storage)
    line1.removeListener(storage.reverse)

    # Making sure line sequences match on the overlap
    if alignment.seg_from.left > 0:
        new_seq = Contig(
            line1.asSegment().prefix(pos=alignment.seg_from.left).Seq() +
            line2.seq, "new_seq")
    else:
        new_seq = Contig(line2.seq, "new_seq")
    al2 = AlignmentPiece.Identical(
        line2.asSegment(),
        new_seq.asSegment().suffix(length=len(line2)))
    sys.stdout.trace("Al2:", al2)
    alignment = alignment.compose(al2).reverse()
    sys.stdout.trace("Composed alignment", alignment)
    sys.stdout.trace("\n".join(alignment.asMatchingStrings()))
    sys.stdout.trace(alignment.cigar)
    assert alignment.seg_to.right == len(line1)
    assert alignment.seg_from.left == al2.seg_to.left
    line1.correctSequence([alignment])

    # Now lines have exact match
    name = "(" + ",".join(
        basic.parseLineName(line1.id) + basic.parseLineName(line2.id)) + ")"
    line = self.addNew(new_seq.seq, name)
    assert line.seq.startswith(line1.seq)
    assert line.seq.endswith(line2.seq)
    al1 = AlignmentPiece.Identical(
        line1.asSegment(),
        line.asSegment().prefix(length=len(line1)))
    al2 = AlignmentPiece.Identical(
        line2.asSegment(),
        line.asSegment().suffix(length=len(line2)))

    # Transfer everything known about the two lines onto the new line.
    line.initial.addAll(
        line1.initial.targetAsSegment(al1.seg_to).merge(
            line2.initial.targetAsSegment(al2.seg_to)))
    line.correct_segments.addAll(
        line1.correct_segments.contigAsSegment(al1.seg_to).merge(
            line2.correct_segments.contigAsSegment(al2.seg_to)))
    line.completely_resolved.addAll(
        line1.completely_resolved.contigAsSegment(al1.seg_to).merge(
            line2.completely_resolved.contigAsSegment(al2.seg_to), k))
    line.disjointig_alignments.addAll(
        line1.disjointig_alignments.targetAsSegment(al1.seg_to).merge(
            line2.disjointig_alignments.targetAsSegment(al2.seg_to)))
    for al in line1.read_alignments.targetAsSegment(al1.seg_to).merge(
            line2.read_alignments.targetAsSegment(al2.seg_to)):
        line.addReadAlignment(al)
    line1.cleanReadAlignments()
    line2.cleanReadAlignments()
    self.notifyMergedLines(al1, al2)

    # Remember the outer knots before the old lines are removed.
    knot_right = line2.knot
    knot_left = line1.rc.knot
    self.remove(line1)
    self.remove(line2)
    if knot_right is not None:
        if knot_right.line_right == line1:
            # The right line was tied back to the left one: the merged line
            # is tied to itself.
            line.tie(line, knot_right.gap, knot_right.gap_seq)
        else:
            line.tie(knot_right.line_right, knot_right.gap,
                     knot_right.gap_seq)
    if knot_left is not None and knot_left.line_right != line2.rc:
        line.rc.tie(knot_left.line_right, knot_left.gap, knot_left.gap_seq)
    return line
def load(self, handler, collection_from, collection_to):
    # type: (TokenReader, Any, Any) -> None
    """Read a count from ``handler`` and then add that many alignments.

    Every alignment is parsed by ``AlignmentPiece.load`` with the given
    query and target collections.
    """
    remaining = handler.readInt()
    while remaining > 0:
        piece = AlignmentPiece.load(handler, collection_from, collection_to)
        self.add(piece)
        remaining -= 1