예제 #1
0
 def attemptJump(self, rec):
     # type: (Record) -> bool
     """Try to move ``rec.resolved`` onto a segment further along the line.

     Target segments of alignments whose source-side offsets both exceed a
     tail limit are gathered from two places: the alignments in ``rec`` (up
     to the bound returned by ``findAndFilterResolvedBound``) and the dot
     plot alignments intersecting the window
     ``[rec.resolved.right - params.k, bound]``. After merging, the storage
     is passed through ``reverse`` and reduced to that same window
     (presumably yielding the parts of the window not covered by the
     gathered segments -- confirm against SegmentStorage). Every resulting
     piece is handed to ``segmentsWithGoodCopies``; the first returned
     segment that is at least ``params.k`` long and ends to the right of
     ``rec.resolved.right`` becomes the new resolved segment.

     :param rec: record whose resolved segment should be advanced.
     :return: True if ``rec.setResolved`` was called, False otherwise.
     """
     bound = self.findAndFilterResolvedBound(rec, params.l)
     # NOTE(review): ``params.k / 2`` looks like it relies on integer
     # division (Python 2 semantics) -- confirm before porting.
     tail_limit = min(params.bad_end_length, params.k / 2)

     def exceedsTailLimit(al):
         # True when both source-side left offsets are above the limit.
         return (al.seg_from.left > tail_limit
                 and al.rc.seg_from.left > tail_limit)

     rejected = SegmentStorage()
     for al in rec:
         # Stop at the first alignment that starts beyond the bound.
         if al.seg_to.left > bound:
             break
         if exceedsTailLimit(al):
             rejected.add(al.seg_to)

     scan_window = rec.line.segment(rec.resolved.right - params.k, bound)
     for al in self.dot_plot.allInter(scan_window):
         if exceedsTailLimit(al):
             rejected.add(al.seg_to)

     rejected.mergeSegments(params.k - 200)
     sys.stdout.trace("Bad segments:", rejected)

     reduce_window = rec.line.segment(rec.resolved.right - params.k, bound)
     accepted = rejected.reverse(rec.line, params.k - 100).reduce(
         reduce_window)
     for piece in accepted:
         # Clamp the left end so it never goes below zero.
         clipped = Segment(piece.contig, max(0, piece.left), piece.right)
         candidates = self.segmentsWithGoodCopies(rec.resolved, clipped,
                                                  params.k)
         for candidate in candidates:
             if len(candidate) < params.k:
                 continue
             if candidate.right <= rec.resolved.right:
                 continue
             rec.setResolved(candidate)
             return True
     return False
예제 #2
0
 def correctSequences(self, interesting_segments):
     # type: (Iterable[Segment]) -> List[Segment]
     """Polish the regions that follow the given segments, line by line.

     For every input segment the containing correct segment is looked up
     with ``line.correct_segments.find``; the region to fix runs from half
     a ``params.k`` before that correct segment's right end up to half a
     ``params.k`` past the start of the following correct segment, or to
     the end of the line when ``find(..., 1)`` returns None.

     Regions are then grouped by normalized contig id. Within a group,
     regions whose contig id equals the normalized id go to ``forward``;
     the others go to ``backward`` and the working line is taken as
     ``seg.contig.rc``. The storages are registered as listeners on the
     line (presumably so their coordinates follow edits made by
     ``attemptExtend`` / ``polyshSegments`` -- confirm), the line is
     optionally extended on either end, polished, and finally its correct
     segments are updated over the whole line.

     :param interesting_segments: segments whose neighbourhood should be
         corrected.
     :return: the forward and backward storage contents of every processed
         line, appended in processing order.
     """
     # NOTE(review): ``params.k / 2`` looks like it relies on integer
     # division (Python 2 semantics) -- confirm before porting.
     half_k = params.k / 2

     def normalizedId(seg):
         return basic.Normalize(seg.contig.id)

     targets = []
     for seg in list(interesting_segments):
         line = seg.contig  # type: NewLine
         current = line.correct_segments.find(seg)
         following = line.correct_segments.find(
             line.suffix(current.right), 1)
         if following is None:
             right = len(line)
         else:
             right = min(len(line), following.left + half_k)
         targets.append(line.segment(current.right - half_k, right))
     # groupby below needs the input ordered by the same key.
     targets.sort(key=lambda seg: (normalizedId(seg), seg.left))

     corrected = []
     for line_id, group in itertools.groupby(targets, key=normalizedId):
         line = None  # type: NewLine
         forward = SegmentStorage()
         backward = SegmentStorage()
         for seg in group:
             if seg.contig.id == line_id:
                 forward.add(seg)
                 line = seg.contig
             else:
                 backward.add(seg)
                 line = seg.contig.rc

         polish_regions = SegmentStorage()
         polish_regions.addAll(forward).addAll(backward.rc)
         polish_regions.mergeSegments()
         line.addListener(polish_regions)
         line.addListener(forward)
         line.rc.addListener(backward)
         sys.stdout.trace("Polishing:", polish_regions)

         # Last region ends within 200 of the line end: try to extend the
         # line to the right and include the tail from the old region end.
         if (not line.max_extension) and polish_regions[-1].RC().left < 200:
             tail_start = polish_regions[-1].right
             if self.attemptExtend(line):
                 polish_regions.add(line.asSegment().suffix(pos=tail_start))
                 forward.add(line.asSegment().suffix(pos=tail_start))
         # Same for the other end, working on the rc line.
         if (not line.rc.max_extension) and polish_regions[0].left < 200:
             tail_start = polish_regions[0].RC().right
             if self.attemptExtend(line.rc):
                 polish_regions.rc.add(
                     line.rc.asSegment().suffix(pos=tail_start))
                 backward.add(line.rc.asSegment().suffix(pos=tail_start))

         polish_regions.mergeSegments()
         forward.mergeSegments()
         backward.mergeSegments()
         # polish_regions is detached before polishing; forward/backward
         # stay attached until polishing is done.
         line.removeListener(polish_regions)
         self.polyshSegments(line, polish_regions)
         line.removeListener(forward)
         line.rc.removeListener(backward)
         corrected.extend(forward)
         corrected.extend(backward)
         line.updateCorrectSegments(line.asSegment())
     return corrected
예제 #3
0
 def segmentsWithGoodCopies(self, resolved, seg, inter_size):
     # type: (Segment, Segment, int) -> List[Segment]
     """Return the parts of ``seg`` left after removing flagged regions.

     Dot plot alignments intersecting ``seg`` are considered when
     ``al.seg_from.left > 20``, ``al.rc.seg_to.left > 20`` or the alignment
     is identical. For each one whose target is at least ``inter_size``
     long and whose source ends past ``line.initial[0].seg_to.left``:

     * the source-side regions obtained from
       ``line.correct_segments.reverse(...)`` reduced to the capped source
       segment are mapped onto ``seg.contig`` and collected;
     * if ``al.rc.seg_from.left < 50`` and the target ends no more than 100
       before ``resolved.right``, the expanded suffix of the target contig
       starting at ``al.seg_to.right`` is collected too.

     The collected segments are merged, passed through ``reverse`` on
     ``seg.contig`` and reduced to ``seg``.

     :param resolved: currently resolved segment; only its right end is
         compared, the segment is also printed in trace output.
     :param seg: segment to examine.
     :param inter_size: minimal target length and merge/reverse parameter.
     :return: list of segments produced by the final ``reduce(seg)``.
     """
     # Materialise the filtered alignments before doing any other work.
     candidates = []
     for al in self.dot_plot.allInter(seg):
         if (al.seg_from.left > 20 or al.rc.seg_to.left > 20
                 or al.isIdentical()):
             candidates.append(al)

     flagged = SegmentStorage()
     for al in candidates:
         line = al.seg_from.contig  # type: NewLine
         if len(al.seg_to) < inter_size:
             continue
         initial_left = line.initial[0].seg_to.left
         if al.seg_from.right <= initial_left:
             continue

         cap = al.seg_from.cap(line.suffix(pos=initial_left))
         incorrect = line.correct_segments.reverse(
             line, inter_size - 1).reduce(cap)
         matching = al.matchingSequence()
         sys.stdout.trace("Incorrect: ", line, cap, incorrect)
         for source_seg in incorrect:
             mapped = matching.mapSegDown(seg.contig, source_seg,
                                          mapIn=False)
             sys.stdout.trace(
                 "Relevant unpolished k-mer segment alignment:",
                 source_seg, mapped)
             flagged.add(mapped)

         reaches_resolved_end = al.seg_to.right >= resolved.right - 100
         if al.rc.seg_from.left < 50 and reaches_resolved_end:
             tail = al.seg_to.contig.suffix(pos=al.seg_to.right)
             flagged.add(tail.expand(inter_size + 100))
             sys.stdout.trace("Incoming line:", resolved, seg, al)

     flagged.mergeSegments(inter_size - 1)
     # NOTE(review): ``inter_size / 10`` looks like it relies on integer
     # division (Python 2 semantics) -- confirm before porting.
     margin = inter_size - 1 - max(100, inter_size / 10)
     remaining = flagged.reverse(seg.contig, margin).reduce(seg)
     return list(remaining)
예제 #4
0
 def testManual(self):
     """Check SegmentStorage printing, merging and ``find`` on tiny contigs.

     First part: four adjacent unit segments are stored; the string form
     of the storage and of its ``rc`` is checked, ``mergeSegments(1)`` is
     expected to leave the storage unchanged and ``mergeSegments()`` is
     expected to collapse it into a single segment.

     Second part: with segments [0, 5) and [10, 15) stored, ``find`` is
     checked for four query segments.
     """
     contig = Contig("ACGT", "test")
     storage = SegmentStorage()
     for start in range(4):
         storage.add(contig.segment(start, start + 1))

     separate = "ReadStorage+:[test[0:1], test[1:2], test[2:4-1], test[3:4-0]]"
     separate_rc = "ReadStorage-:[-test[0:1], -test[1:2], -test[2:4-1], -test[3:4-0]]"
     assert str(storage) == separate, str(storage)
     assert str(storage.rc) == separate_rc, str(storage.rc)

     storage.mergeSegments(1)
     assert str(storage) == separate, str(storage)

     storage.mergeSegments()
     assert str(storage) == "ReadStorage+:[test[0:4-0]]", str(storage)
     assert str(storage.rc) == "ReadStorage-:[-test[0:4-0]]", str(
         storage.rc)

     contig = Contig("ACGTACGTACGTACGT", "test")
     storage = SegmentStorage()
     storage.add(contig.segment(0, 5))
     storage.add(contig.segment(10, 15))
     # (query bounds, bounds of the segment find() is expected to return)
     find_cases = [
         ((5, 10), (0, 5)),
         ((6, 10), (10, 15)),
         ((5, 9), (0, 5)),
         ((0, 16), (0, 5)),
     ]
     for query, expected in find_cases:
         found = storage.find(contig.segment(*query))
         assert found == contig.segment(*expected), str(found)
예제 #5
0
 def markUniqueInLine(self, line):
     # type: (NewLine) -> None
     """Select segments of ``line`` and re-recruit its read alignments.

     Steps, as performed below:

     1. Read alignments are copied into a list. ``self.link`` is called on
        the left target positions of alignments with
        ``al.seg_from.left > 1000 and al.seg_to.left > 50`` (result
        ``inc``) and on the right target positions of alignments with
        ``al.rc.seg_from.left > 1000 and al.rc.seg_to.left > 50`` (result
        ``out``). Both results are lists of ``(segment, value)`` pairs; a
        sentinel pair is added at the line end / line start.
     2. Pairs whose value reaches ``params.min_k_mer_cov`` become events:
        ``-1`` at the left end of an ``inc`` segment, ``+1`` at the right
        end of an ``out`` segment. Events are sorted and filtered.
     3. Consecutive events define candidate segments. A ``+1`` followed by
        a ``-1`` longer than ``params.max_allowed_unaligned`` is expanded
        and kept if at least ``params.k`` long; any other pair spanning
        more than 50000 is kept after shrinking by 3000.
     4. The line's read alignments are cleared. If no segment was
        selected the method returns here. Otherwise every saved alignment
        that intersects a selected segment is added back unless it is
        contradicting or has identity below 0.85.
     5. Correct segments are updated over the whole line and the selected
        segments, capped by them, are added to
        ``line.completely_resolved``.

     Fix over the previous version: the summary trace divided by the
     number of intersecting alignments, which raised ZeroDivisionError
     when none intersected; the fractions are now reported as 0.0 then.

     :param line: line to process; modified in place.
     """
     sys.stdout.info("Finding unique in", line)
     alignments = list(line.read_alignments)  # type: List[AlignmentPiece]
     alignments.sort(key=lambda al: al.seg_to.left)
     sys.stdout.trace("Sorting finished")
     incoming_positions = [
         al.seg_to.left for al in alignments
         if al.seg_from.left > 1000 and al.seg_to.left > 50
     ]
     inc = self.link(line, incoming_positions, 20)
     inc.append((line.segment(len(line) - 1, len(line)),
                 params.min_k_mer_cov))
     # Re-sort by right end; this is also the order used for recruitment.
     alignments.sort(key=lambda al: al.seg_to.right)
     outgoing_positions = [
         al.seg_to.right for al in alignments
         if al.rc.seg_from.left > 1000 and al.rc.seg_to.left > 50
     ]
     out = self.link(line, outgoing_positions, 20)
     sys.stdout.trace("Linking finished")
     out.insert(0, (line.segment(0, 1), params.min_k_mer_cov))
     sys.stdout.trace( "inc:", inc)
     sys.stdout.trace( "out:", out)

     events = [(seg.left, -1) for seg, val in inc
               if val >= params.min_k_mer_cov]
     events.extend((seg.right, 1) for seg, val in out
                   if val >= params.min_k_mer_cov)
     events.sort()
     sys.stdout.trace("Events collected and sorted", len(events))
     # NOTE(review): ``pos > - 200`` holds for every non-negative position,
     # so the second clause never filters anything; ``pos > 200`` may have
     # been intended. Kept as in the original -- confirm.
     events = [
         (pos, direction) for pos, direction in events
         if (direction == -1 or pos < len(line) - 200)
         and (direction == 1 or pos > - 200)
     ]
     sys.stdout.trace( events)

     segs = SegmentStorage()
     for (left, left_dir), (right, right_dir) in zip(events[:-1],
                                                     events[1:]):
         seg = line.segment(left, right)
         if left_dir == 1 and right_dir == -1:
             if len(seg) > params.max_allowed_unaligned:
                 # NOTE(review): ``params.k / 2`` looks like it relies on
                 # integer division (Python 2 semantics) -- confirm.
                 seg = seg.expand(params.k / 2).expandToSize(params.k + 50)
                 if len(seg) >= params.k:
                     segs.add(seg)
         elif len(seg) > 50000:
             segs.add(seg.shrink(3000))
     sys.stdout.trace("Unique segments selected")

     # ``alignments`` is a separate list made above, so the alignments are
     # still available for recruitment after the line is cleaned.
     line.cleanReadAlignments()
     line.read_alignments.clean()
     sys.stdout.trace( "Unique segments:", segs)
     if len(segs) == 0:
         sys.stdout.trace( "WARNING: line with no resolved segments. Removing", line)
         return

     total = 0
     inter = 0
     contradicting = 0
     bad_quality = 0
     for al in alignments:
         total += 1
         if not segs.inter(al.seg_to, params.k):
             continue
         inter += 1
         if al.contradictingRTC(tail_size=params.bad_end_length):
             contradicting += 1
             sys.stdout.trace( "Contradicting read alignment", al, str(al.seg_from.contig.alignments))
         elif al.percentIdentity() < 0.85:
             bad_quality += 1
             sys.stdout.trace( "Read with bad alignment quality:", al)
         else:
             line.addReadAlignment(al)

     # Guard the statistics: no intersecting alignment means inter == 0.
     if inter > 0:
         contradicting_share = float(contradicting) / inter
         bad_quality_share = float(bad_quality) / inter
     else:
         contradicting_share = 0.0
         bad_quality_share = 0.0
     sys.stdout.trace("Read recruitment results. All:", total, "In resolved regions:", inter,
                      "Contradicting:", contradicting_share, "Bad quality", bad_quality_share)
     line.updateCorrectSegments(line.asSegment())
     segs = segs.cap(line.correct_segments, params.k)
     line.completely_resolved.addAll(segs)
     sys.stdout.trace("The end")