def attemptJump(self, rec):
    # type: (Record) -> bool
    """Try to replace the resolved segment of ``rec`` with one ending further right.

    Target segments of alignments that start more than a threshold away from
    both ends of their source (checked on the alignment and on its ``rc``)
    are collected as "bad" segments. They come from the alignments stored in
    ``rec`` up to ``bound`` and from ``self.dot_plot`` alignments
    intersecting the window ``[resolved.right - k, bound]``. The bad
    segments are merged, then passed through ``reverse(...).reduce(window)``,
    and every resulting segment is handed to ``segmentsWithGoodCopies``.
    The first returned segment that is at least ``params.k`` long and ends
    to the right of ``rec.resolved`` is installed with ``rec.setResolved``.

    :param rec: record whose resolved segment should be advanced.
    :return: True if ``rec.setResolved`` was called, False otherwise.
    """
    bound = self.findAndFilterResolvedBound(rec, params.l)
    end_threshold = min(params.bad_end_length, params.k / 2)

    def jump_window():
        # A fresh segment is built on every call, exactly as the inline
        # expressions did.
        return rec.line.segment(rec.resolved.right - params.k, bound)

    def starts_far_from_both_ends(alignment):
        if alignment.seg_from.left > end_threshold:
            return alignment.rc.seg_from.left > end_threshold
        return False

    bad_segments = SegmentStorage()
    for al in rec:
        if al.seg_to.left > bound:
            # Stop at the first alignment whose target starts beyond bound.
            break
        if starts_far_from_both_ends(al):
            bad_segments.add(al.seg_to)
    for al in self.dot_plot.allInter(jump_window()):
        if starts_far_from_both_ends(al):
            bad_segments.add(al.seg_to)
    bad_segments.mergeSegments(params.k - 200)
    sys.stdout.trace("Bad segments:", bad_segments)

    good_segments = bad_segments.reverse(rec.line, params.k - 100).reduce(jump_window())
    for candidate in good_segments:
        # Clamp the left end so the segment never starts before position 0.
        clipped = Segment(candidate.contig, max(0, candidate.left), candidate.right)
        for jump in self.segmentsWithGoodCopies(rec.resolved, clipped, params.k):
            if len(jump) >= params.k and jump.right > rec.resolved.right:
                rec.setResolved(jump)
                return True
    return False
def correctSequences(self, interesting_segments):
    # type: (Iterable[Segment]) -> List[Segment]
    """Polish the regions that follow the correct segments containing the given segments.

    For every input segment the enclosing entry of ``line.correct_segments``
    is looked up with ``find``. The region to correct starts ``params.k / 2``
    before its right end. It ends ``params.k / 2`` past the left end of the
    next correct segment, or at the end of the line if there is none.
    Regions are grouped by normalized line id. Per group they are merged,
    the line (and its ``rc``) is optionally extended via ``attemptExtend``
    when the merged regions start or end within 200 positions of a line end
    and ``max_extension`` is not set, and the result is passed to
    ``polyshSegments``.

    The storages are registered as listeners of the line while it is being
    modified; the order of listener registration and removal is kept
    exactly as is.

    :param interesting_segments: segments pointing at places to correct.
    :return: the contents of the per-line ``forward`` and ``backward``
        storages, concatenated over all groups.
    """
    requested = list(interesting_segments)
    half_k = params.k / 2

    def normalized_id(segment):
        return basic.Normalize(segment.contig.id)

    to_correct = []
    for seg in requested:
        line = seg.contig  # type: NewLine
        correct = line.correct_segments.find(seg)
        following = line.correct_segments.find(line.suffix(correct.right), 1)
        right = len(line)
        if following is not None:
            right = min(right, following.left + half_k)
        to_correct.append(line.segment(correct.right - half_k, right))
    # groupby below requires the input to be ordered by the grouping key.
    to_correct.sort(key=lambda segment: (normalized_id(segment), segment.left))

    corrected = []
    for line_id, group in itertools.groupby(to_correct, key=normalized_id):
        members = list(group)
        line = None  # type: NewLine
        forward = SegmentStorage()
        backward = SegmentStorage()
        for seg in members:
            if seg.contig.id == line_id:
                forward.add(seg)
                line = seg.contig
            else:
                # Segment lives on the other strand of the same line.
                backward.add(seg)
                line = seg.contig.rc

        to_polysh = SegmentStorage()
        to_polysh.addAll(forward).addAll(backward.rc)
        to_polysh.mergeSegments()
        line.addListener(to_polysh)
        line.addListener(forward)
        line.rc.addListener(backward)
        sys.stdout.trace("Polishing:", to_polysh)

        if (not line.max_extension) and to_polysh[-1].RC().left < 200:
            old_right = to_polysh[-1].right
            if self.attemptExtend(line):
                to_polysh.add(line.asSegment().suffix(pos=old_right))
                forward.add(line.asSegment().suffix(pos=old_right))
        if (not line.rc.max_extension) and to_polysh[0].left < 200:
            old_rc_right = to_polysh[0].RC().right
            if self.attemptExtend(line.rc):
                to_polysh.rc.add(line.rc.asSegment().suffix(pos=old_rc_right))
                backward.add(line.rc.asSegment().suffix(pos=old_rc_right))

        to_polysh.mergeSegments()
        forward.mergeSegments()
        backward.mergeSegments()
        # to_polysh stops listening before polishing; forward/backward keep
        # listening until polishing is done.
        line.removeListener(to_polysh)
        # The return value of polyshSegments is not used in this method.
        polished = self.polyshSegments(line, to_polysh)
        line.removeListener(forward)
        line.rc.removeListener(backward)
        corrected.extend(forward)
        corrected.extend(backward)
        line.updateCorrectSegments(line.asSegment())
    return corrected
def segmentsWithGoodCopies(self, resolved, seg, inter_size):
    # type: (Segment, Segment, int) -> List[Segment]
    """Return the parts of ``seg`` left after removing the collected problem segments.

    Alignments from ``self.dot_plot.allInter(seg)`` are kept when
    ``al.seg_from.left > 20``, ``al.rc.seg_to.left > 20`` or
    ``al.isIdentical()``. Two kinds of segments are then collected:

    * for alignments whose target is at least ``inter_size`` long and whose
      source reaches past the start of ``line.initial[0].seg_to``: the
      segments produced by ``line.correct_segments.reverse(...).reduce(cap)``,
      mapped onto ``seg.contig`` through the alignment's matching sequence;
    * for alignments with ``al.rc.seg_from.left < 50`` that end no more than
      100 positions before ``resolved.right``: the target suffix after the
      alignment, expanded by ``inter_size + 100``.

    The collected segments are merged, and
    ``reverse(...).reduce(seg)`` of the result is returned as a list.

    :param resolved: currently resolved segment (only ``right`` is read).
    :param seg: segment in which to search.
    :param inter_size: minimal intersection size used throughout.
    """
    candidates = []
    for al in self.dot_plot.allInter(seg):
        if al.seg_from.left > 20 or al.rc.seg_to.left > 20 or al.isIdentical():
            candidates.append(al)

    collected = SegmentStorage()
    for al in candidates:
        line = al.seg_from.contig  # type: NewLine
        if len(al.seg_to) >= inter_size:
            # line.initial is only touched once the length test has passed.
            initial_start = line.initial[0].seg_to.left
            if al.seg_from.right > initial_start:
                cap = al.seg_from.cap(line.suffix(pos=initial_start))
                incorrect = line.correct_segments.reverse(line, inter_size - 1).reduce(cap)
                matching = al.matchingSequence()
                sys.stdout.trace("Incorrect: ", line, cap, incorrect)
                for source_part in incorrect:
                    mapped = matching.mapSegDown(seg.contig, source_part, mapIn=False)
                    sys.stdout.trace(
                        "Relevant unpolished k-mer segment alignment:", source_part, mapped)
                    collected.add(mapped)
        if al.rc.seg_from.left < 50 and al.seg_to.right >= resolved.right - 100:
            tail = al.seg_to.contig.suffix(pos=al.seg_to.right)
            collected.add(tail.expand(inter_size + 100))
            sys.stdout.trace("Incoming line:", resolved, seg, al)

    collected.mergeSegments(inter_size - 1)
    min_len = inter_size - 1 - max(100, inter_size / 10)
    return list(collected.reverse(seg.contig, min_len).reduce(seg))
def testManual(self):
    """Check SegmentStorage string output, mergeSegments and find on hand-built segments."""
    unmerged_fw = "ReadStorage+:[test[0:1], test[1:2], test[2:4-1], test[3:4-0]]"
    unmerged_rc = "ReadStorage-:[-test[0:1], -test[1:2], -test[2:4-1], -test[3:4-0]]"

    contig = Contig("ACGT", "test")
    storage = SegmentStorage()
    for left in range(4):
        storage.add(contig.segment(left, left + 1))
    assert str(storage) == unmerged_fw, str(storage)
    assert str(storage.rc) == unmerged_rc, str(storage.rc)

    # Merging with an argument of 1 is expected to leave the storage unchanged.
    storage.mergeSegments(1)
    assert str(storage) == unmerged_fw, str(storage)

    # Merging with the default argument is expected to join everything.
    storage.mergeSegments()
    assert str(storage) == "ReadStorage+:[test[0:4-0]]", str(storage)
    assert str(storage.rc) == "ReadStorage-:[-test[0:4-0]]", str(storage.rc)

    contig = Contig("ACGTACGTACGTACGT", "test")
    storage = SegmentStorage()
    storage.add(contig.segment(0, 5))
    storage.add(contig.segment(10, 15))
    # (query bounds, bounds of the stored segment find() should return)
    find_cases = [
        ((5, 10), (0, 5)),
        ((6, 10), (10, 15)),
        ((5, 9), (0, 5)),
        ((0, 16), (0, 5)),
    ]
    for query, expected in find_cases:
        assert storage.find(contig.segment(*query)) == contig.segment(*expected), \
            str(storage.find(contig.segment(*query)))
def markUniqueInLine(self, line):
    # type: (NewLine) -> None
    """Select unique segments of ``line`` and re-recruit its read alignments.

    Steps, in order:

    1. Alignment start positions (``seg_to.left``) and end positions
       (``seg_to.right``) of reads that start more than 1000 into the read
       and more than 50 into the line (checked on the alignment and on its
       ``rc`` respectively) are passed to ``self.link``. Its result is used
       as a list of ``(segment, value)`` pairs. A sentinel pair for the last
       position is appended to the incoming list and one for the first
       position is prepended to the outgoing list.
    2. Pairs with ``value >= params.min_k_mer_cov`` become events:
       ``(seg.left, -1)`` for incoming, ``(seg.right, 1)`` for outgoing.
    3. For consecutive events, the span between an outgoing event followed
       by an incoming one is expanded and stored if long enough; any other
       span longer than 50000 is stored shrunk by 3000.
    4. The line's read alignments are cleared. If no segment was selected
       the method returns; otherwise alignments intersecting a selected
       segment are re-added unless they are contradicting or have
       percent identity below 0.85.
    5. Correct segments are updated and the selected segments, capped by
       them, are added to ``line.completely_resolved``.

    :param line: line to process; modified in place.
    """
    sys.stdout.info("Finding unique in", line)
    min_cov = params.min_k_mer_cov
    alignments = list(line.read_alignments)  # type: List[AlignmentPiece]

    alignments = sorted(alignments, key=lambda al: al.seg_to.left)
    sys.stdout.trace("Sorting finished")
    inc = self.link(
        line,
        [al.seg_to.left for al in alignments
         if al.seg_from.left > 1000 and al.seg_to.left > 50],
        20)
    inc.append((line.segment(len(line) - 1, len(line)), min_cov))

    alignments = sorted(alignments, key=lambda al: al.seg_to.right)
    out = self.link(
        line,
        [al.seg_to.right for al in alignments
         if al.rc.seg_from.left > 1000 and al.rc.seg_to.left > 50],
        20)
    sys.stdout.trace("Linking finished")
    out.insert(0, (line.segment(0, 1), min_cov))
    sys.stdout.trace("inc:", inc)
    sys.stdout.trace("out:", out)

    events = [(seg.left, -1) for seg, val in inc if val >= min_cov]
    events.extend((seg.right, 1) for seg, val in out if val >= min_cov)
    events.sort()
    sys.stdout.trace("Events collected and sorted", len(events))
    line_len = len(line)
    # NOTE(review): `pos > -200` holds for every non-negative position, so
    # the second clause never filters anything; possibly `pos > 200` was
    # intended. Kept as is -- confirm before changing.
    events = [(pos, direction) for pos, direction in events
              if (direction == -1 or pos < line_len - 200)
              and (direction == 1 or pos > -200)]
    sys.stdout.trace(events)

    segs = SegmentStorage()
    for (left_pos, left_dir), (right_pos, right_dir) in zip(events[:-1], events[1:]):
        seg = line.segment(left_pos, right_pos)
        if left_dir == 1 and right_dir == -1:
            if len(seg) > params.max_allowed_unaligned:
                seg = seg.expand(params.k / 2).expandToSize(params.k + 50)
                if len(seg) >= params.k:
                    segs.add(seg)
        elif len(seg) > 50000:
            segs.add(seg.shrink(3000))
    sys.stdout.trace("Unique segments selected")

    line.cleanReadAlignments()
    line.read_alignments.clean()
    sys.stdout.trace("Unique segments:", segs)
    if len(segs) == 0:
        sys.stdout.trace("WARNING: line with no resolved segments. Removing", line)
        return

    total = 0
    inter = 0
    contradicting = 0
    bad_quality = 0
    for al in alignments:
        total += 1
        if not segs.inter(al.seg_to, params.k):
            continue
        inter += 1
        if al.contradictingRTC(tail_size=params.bad_end_length):
            contradicting += 1
            sys.stdout.trace("Contradicting read alignment", al,
                             str(al.seg_from.contig.alignments))
        elif al.percentIdentity() < 0.85:
            bad_quality += 1
            sys.stdout.trace("Read with bad alignment quality:", al)
        else:
            line.addReadAlignment(al)

    # `inter` is zero when no alignment intersects a selected segment (e.g.
    # a line without read alignments); report zero shares instead of
    # letting the log statement raise ZeroDivisionError.
    if inter:
        contradicting_share = float(contradicting) / inter
        bad_quality_share = float(bad_quality) / inter
    else:
        contradicting_share = 0.0
        bad_quality_share = 0.0
    sys.stdout.trace("Read recruitment results. All:", total,
                     "In resolved regions:", inter,
                     "Contradicting:", contradicting_share,
                     "Bad quality", bad_quality_share)

    line.updateCorrectSegments(line.asSegment())
    segs = segs.cap(line.correct_segments, params.k)
    line.completely_resolved.addAll(segs)
    sys.stdout.trace("The end")