def polishAndAnalyse(self, reads, polishing_base, reliable_start=None):
    # type: (ReadCollection, Contig, Optional[int]) -> Consensus
    """Polish ``polishing_base`` with ``reads`` and count supporting alignments per position.

    The polished sequence is wrapped in a new contig, a clean copy of the
    reads is aligned to it, and every alignment is classified as:

    * contra -- ``al.contradicting(...)`` is true for the polished contig;
    * late   -- the alignment starts after ``reliable_start``;
    * ok     -- everything else; only these contribute to the coverage array.

    :param reads: reads used both for polishing and for the coverage analysis.
    :param polishing_base: contig that is polished.
    :param reliable_start: alignments whose target start is greater than this
        are counted as "late"; defaults to ``len(polishing_base)``.
    :return: ``Consensus`` built from the polished sequence and a list of
        ``len(sequence) + 1`` integers holding, for each position, the number
        of "ok" alignments covering it.
    """
    if reliable_start is None:
        reliable_start = len(polishing_base)
    polished = Contig(self.polish(reads, polishing_base), "contig")
    # Difference array: +1 where an "ok" alignment starts, -1 where it ends.
    # The extra slot is there because the end index is written as well
    # (presumably it may equal len(polished)).
    coverage = [0] * (len(polished) + 1)
    realigned = ReadCollection().extendClean(reads)
    self.aligner.alignReadCollection(realigned, [polished])
    contra = 0
    ok = 0
    late = 0
    for read in realigned:
        for al in read.alignmentsTo(polished.asSegment()):  # type: AlignmentPiece
            if al.contradicting(polished.asSegment()):
                contra += 1
                continue
            if al.seg_to.left > reliable_start:
                late += 1
                continue
            coverage[al.seg_to.left] += 1
            coverage[al.seg_to.right] -= 1
            ok += 1
    # Turn the difference array into actual per-position coverage (prefix sums).
    running = 0
    for pos, delta in enumerate(coverage):
        running += delta
        coverage[pos] = running
    sys.stdout.trace("Polyshed and analysed using", len(realigned), "reads. Ok:", ok, "late:", late, "contra:", contra)
    return Consensus(polished.seq, coverage)
def testManual(self):
    """Check the CIGAR produced by ``polyshAlignment`` on two hand-made pairs.

    For each pair an ``AlignmentPiece.Identical`` alignment between the whole
    contigs is passed to ``self.scorer.polyshAlignment`` with
    ``params.alignment_correction_radius`` and the resulting CIGAR is compared
    with the expected one.
    """
    # (sequence of "from" contig, sequence of "to" contig, expected CIGAR)
    cases = [
        ("ACGTTAAACGT", "ACGTTTAACGT", "4M1D2M1I4M"),
        ("ACATGATCACT", "ACGTGAAACGT", "6M1I3M1D1M"),
    ]
    for seq_from, seq_to, expected_cigar in cases:
        source = Contig(seq_from, "from")
        target = Contig(seq_to, "to")
        initial = AlignmentPiece.Identical(source.asSegment(), target.asSegment())
        corrected = self.scorer.polyshAlignment(initial, params.alignment_correction_radius)
        assert corrected.cigar == expected_cigar, str(corrected.asMatchingStrings())
def testManual(self):
    """Check ``compose`` and ``composeTargetDifference`` on three 12-base contigs.

    Both ways of chaining the c1->c2 and c2->c3 alignments are expected to
    give an alignment whose repr is ``(c1[0:12-0]->c3[0:12-0]:0.92)`` and
    whose CIGAR is ``12M``.
    """
    first = Contig("ACGTACGTACGT", "c1")
    middle = Contig("ACGTAGGTACGT", "c2")
    last = Contig("ACTTACGTACGT", "c3")
    first_to_middle = AlignmentPiece.Identical(first.asSegment(), middle.asSegment())
    middle_to_last = AlignmentPiece.Identical(middle.asSegment(), last.asSegment())
    expected_repr = "(c1[0:12-0]->c3[0:12-0]:0.92)"

    # Direct composition c1 -> c2 -> c3.
    composed = first_to_middle.compose(middle_to_last)
    assert repr(composed) == expected_repr
    assert composed.cigar == "12M"

    # Same result obtained from the reversed first alignment (c2 -> c1)
    # combined with c2 -> c3 through composeTargetDifference.
    via_difference = first_to_middle.reverse().composeTargetDifference(middle_to_last)
    assert repr(via_difference) == expected_repr
    assert via_difference.cigar == "12M"
def testManual(self):
    """Check ``GlueOverlappingAlignments`` and ``reduce`` on hand-made alignments.

    Two alignments between the same pair of contigs are glued and the glued
    CIGAR and source sequence are compared with fixed values; then the first
    alignment is reduced to three query prefixes and the CIGARs are checked.
    """
    source = Contig("ACGTACGTA", "from")
    target = Contig("ACTACGTACGTACAT", "to")
    head = AlignmentPiece(source.asSegment(), target.segment(0, 8), "2M1I6M")
    tail = AlignmentPiece(source.segment(0, 8), target.segment(7, 15), "8M")

    glued = AlignmentPiece.GlueOverlappingAlignments([head, tail])
    assert glued.cigar == "2M1I5M8M", str(glued) + " " + glued.cigar
    assert glued.seg_from.Seq() == "ACGTACGTACGTACGT", str(glued) + " " + glued.cigar

    # (end of the query prefix, expected CIGAR of the reduced alignment)
    reduce_cases = [
        (2, "2M"),
        (3, "2M"),
        (4, "2M1I1M"),
    ]
    for query_end, expected_cigar in reduce_cases:
        assert head.reduce(query=source.segment(0, query_end)).cigar == expected_cigar
def splitRepeat(aligner, seq, mult, all_reads_list, min_contig_length):
    """Try to split a repeat by running ``splitSegKmeans`` on windows of ``seq``.

    ``seq`` is wrapped in a contig named "base". Consecutive non-overlapping
    windows of ``min_contig_length`` bases are tried from left to right and
    the first non-``None`` result of ``splitSegKmeans`` is returned. If no
    window succeeds, the suffix of length ``min(min_contig_length, len(seq))``
    is tried and its result is returned as is (possibly ``None``).

    :param aligner: passed through to ``splitSegKmeans``.
    :param seq: sequence to split.
    :param mult: passed through to ``splitSegKmeans``.
    :param all_reads_list: passed through to ``splitSegKmeans``.
    :param min_contig_length: window length; must be non-zero.
    :return: whatever ``splitSegKmeans`` returns for the first successful
        window, or for the final suffix.
    """
    base = Contig(seq, "base")
    # Floor division: the number of complete windows must be an int. With "/"
    # this only worked under Python 2 integer division and made range() raise
    # TypeError under true division.
    window_count = len(seq) // min_contig_length
    for i in range(window_count):
        window_start = i * min_contig_length
        window = base.segment(window_start, window_start + min_contig_length)
        res = splitSegKmeans(aligner, window, mult, all_reads_list)
        if res is not None:
            return res
    # Fall back to the last min_contig_length bases (or the whole sequence if
    # it is shorter than one window).
    tail = base.asSegment().suffix(length=min(min_contig_length, len(seq)))
    return splitSegKmeans(aligner, tail, mult, all_reads_list)
def test5(self):
    """Check the stored alignments after ``correctSequence`` on the "abc" line.

    A temporary contig is built from the sequence of letter "a", a fixed
    insert and the sequence of letter "b". Its two ends are aligned to the
    corresponding parts of the "abc" line, the alignments are merged and used
    to correct the line. The alignments reported by ``dp.allInter`` for the
    whole line are then compared with a fixed string.
    """
    dataset = TestDataset("abcABC")
    abc_name = dataset.addContig("abc")
    dataset.addContig("ABC")
    lines, dp, reads = dataset.genAll(self.aligner)
    line = lines[abc_name]
    seq_a = dataset.alphabet["a"].seq
    seq_b = dataset.alphabet["b"].seq
    insert = "ACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGA"
    corrected_source = Contig(seq_a + insert + seq_b, "tmp")

    # Left anchor: the "a" part of both sequences.
    left_al = AlignmentPiece.Identical(
        corrected_source.prefix(len=len(seq_a)),
        line.prefix(len=len(seq_a)))
    # Right anchor: the "b" part, which directly follows "a" on the line.
    right_al = AlignmentPiece.Identical(
        corrected_source.asSegment().suffix(length=len(seq_b)),
        line.segment(len(seq_a), len(seq_a) + len(seq_b)))
    correction = AlignmentPiece.MergeFittingAlignments([left_al, right_al])
    line.correctSequence([correction])

    expected = "[(C0_abc[0:1755-0]->C0_abc[0:1755-0]:1.000), (C1_ABC[0:1652-0]->C0_abc[0:1755-0]:0.94)]"
    assert str(list(dp.allInter(line.asSegment()))) == expected
def polishEnd(self, als, min_cov=4, min_cov_frac=0.7, max_extension=None):
    # type: (List[AlignmentPiece], int, float, Optional[int]) -> Tuple[Contig, List[AlignmentPiece]]
    """Iteratively extend the right end of a contig using reads aligned to it.

    The target contig of ``als[0]`` is converted to a new contig and the
    selected alignments are retargeted to it. In every round a polishing base
    is built from one of the reads, polished with all still-relevant reads,
    and the part of the polished base that enough reads agree on is appended
    to the contig. Rounds continue until too few reads remain, no read gives
    an extension of more than 100 bases, or the length limit is reached.

    :param als: alignments of reads to the contig; the contig is taken from
        ``als[0].seg_to.contig`` (presumably all alignments share it --
        TODO confirm against callers).
    :param min_cov: minimal number of relevant alignments needed to run a
        round, and minimal ``support`` needed to move the cutoff forward.
    :param min_cov_frac: the cutoff moves forward only while the number of
        recent contradicting alignments is at most
        ``(1 - min_cov_frac) * support``.
    :param max_extension: maximal number of bases to add; ``None`` means
        10000000000.
    :return: tuple of the extended contig and the list of alignments
        (still-relevant ones followed by finished ones) targeting it.
    :raises AssertionError: if some read has no alignment to the polished
        base that starts at target position 0.
    """
    if max_extension is None:
        max_extension = 10000000000
    scorer = Scorer()
    contig = als[0].seg_to.contig
    max_len = max_extension + len(contig)
    sys.stdout.trace("Polishing end of", als[0].seg_to.contig)
    # All further work is done on this new contig object, not on the original.
    new_contig = contig.asSegment().asContig()
    # Keep alignments whose reverse-complement target starts within the first
    # 100 positions (presumably: alignments ending within 100 bases of the
    # contig's right end -- confirm against the `rc` semantics).
    relevant_als = [
        al.changeTargetContig(new_contig) for al in als
        if al.rc.seg_to.left < 100
    ]
    finished_als = []
    while True:
        # Alignments that no longer touch the last 100 positions of the
        # contig, or whose rc.seg_from.left is at most 100, are moved to
        # finished_als and take no further part in extension.
        tmp = []
        for al in relevant_als:
            if al.seg_to.inter(new_contig.asSegment().suffix(
                    length=100)) and al.rc.seg_from.left > 100:
                tmp.append(al)
            else:
                finished_als.append(al)
        relevant_als = tmp
        if len(relevant_als) < min_cov:
            break
        # Common prefix of every reduced read and of the polishing base:
        # a fixed 8-mer, the result of basic.randomSequence(params.flanking_size)
        # and up to params.flanking_size last bases of the current contig.
        start = "ACGTTCGA" + basic.randomSequence(
            params.flanking_size) + new_contig.asSegment().suffix(
                length=min(params.flanking_size, len(new_contig))).Seq()
        # One reduced read per relevant alignment, in the same order:
        # `start` followed by the read sequence after the aligned part.
        # The order matters because the lists are zipped together below.
        reduced_read_list = [
            AlignedRead.new(
                start + al.seg_from.contig.asSegment().suffix(
                    pos=al.seg_from.right).Seq(),
                str(i) + "_" + al.seg_from.contig.id)
            for i, al in enumerate(relevant_als)
        ]
        reduced_reads = ReadCollection(reduced_read_list)
        found = False
        for base_al in relevant_als:
            # Skip reads whose rc.seg_from.left is below params.flanking_size.
            if base_al.rc.seg_from.left < params.flanking_size:
                continue
            # The polishing base is `start` followed by a piece of this read
            # that begins right after its aligned part and is at most
            # max(params.window_size, params.k) bases long.
            base_segment = base_al.seg_from.contig.segment(
                base_al.seg_from.right,
                min(
                    len(base_al.seg_from.contig), base_al.seg_from.right +
                    max(params.window_size, params.k)))
            base = Contig(start + base_segment.Seq(), "base")
            # Reset the reduced reads before polishing with them; alignments
            # to the newly polished base are added below.
            for read in reduced_read_list:
                read.clean()
            polished_base = Contig(self.polish(reduced_reads, base),
                                   "polished_base")
            for al in self.aligner.localAlign(
                    reduced_reads, ContigStorage().addAll([polished_base])):
                reduced_reads.reads[al.seg_from.contig.id].addAlignment(al)
            # For every reduced read pick the alignment that starts at
            # position 0 of the polished base and reaches furthest right.
            candidate_alignments = []
            for read in reduced_read_list:
                candidate_alignments.append(None)
                for al in read.alignmentsTo(polished_base.asSegment()):
                    if al.seg_to.left == 0 and (
                        (candidate_alignments[-1] is None or
                         candidate_alignments[-1].seg_to.right <
                         al.seg_to.right)):
                        candidate_alignments[-1] = al
            trimmedAlignments = []
            for i, al in enumerate(candidate_alignments):
                assert al is not None, reduced_read_list[i]
                trimmedAlignments.append(al.trimByQuality(0.4, 100))
            # Sweep over alignments ordered by their right end on the
            # polished base. `support` starts as the number of alignments and
            # is decremented for each alignment passed; `contra` collects
            # passed alignments for which contradictingRTCRight() is true and
            # `contra_index` skips those ending more than 50 positions before
            # the current one. The cutoff advances while support is at least
            # min_cov and the remaining contradicting alignments are few
            # enough.
            contra_index = 0
            contra = []
            support = len(trimmedAlignments)
            cutoff_pos = len(start)
            for al in sorted(trimmedAlignments, key=lambda al: al.seg_to.right):
                while contra_index < len(contra) and contra[
                        contra_index].seg_to.right < al.seg_to.right - 50:
                    contra_index += 1
                if support >= min_cov and len(contra) - contra_index <= (
                        1 - min_cov_frac) * support:
                    cutoff_pos = al.seg_to.right
                    support -= 1
                    if al.contradictingRTCRight():
                        contra.append(al)
                else:
                    sys.stdout.trace("Stopped at:", support, contra_index,
                                     (1 - min_cov_frac) * support)
                    break
            sys.stdout.trace("Positions:",
                             [al.seg_to.right for al in trimmedAlignments])
            sys.stdout.trace("Contra:", contra)
            # Accept the extension only if it adds more than 100 bases.
            if cutoff_pos > len(start) + 100:
                sys.stdout.trace("Chose to use read", base_al.__repr__(),
                                 "Extended for", cutoff_pos - len(start),
                                 "Alignments:")
                sys.stdout.trace(map(str, reduced_read_list))
                found = True
                new_contig_candidate = Contig(
                    new_contig.seq + polished_base[len(start):cutoff_pos],
                    "candidate")
                # Identical alignment from the accepted part of the polished
                # base to the newly added suffix of the candidate contig.
                embedding = AlignmentPiece.Identical(
                    polished_base.segment(len(start), cutoff_pos),
                    new_contig_candidate.asSegment().suffix(
                        pos=len(new_contig)))
                # Identical alignments from each original read's suffix to
                # the matching reduced read's suffix (the reduced read
                # without `start`).
                read_mappings = []
                for al1, al2 in zip(candidate_alignments, relevant_als):
                    seg_from = al2.seg_from.contig.asSegment().suffix(
                        length=len(al1.seg_from.contig) - len(start))
                    seg_to = al1.seg_from.contig.asSegment().suffix(
                        length=len(al1.seg_from.contig) - len(start))
                    read_mappings.append(
                        AlignmentPiece.Identical(seg_from, seg_to))
                # Chain original read -> reduced read -> polished base ->
                # candidate contig. None marks reads whose alignment ends
                # within 10 positions after `start`, or whose composed
                # alignment starts later than 10 positions before the end of
                # the accepted part.
                embedded_alignments = []
                for al1, al2 in zip(candidate_alignments, read_mappings):
                    if al1.seg_to.right <= len(start) + 10:
                        embedded_alignments.append(None)
                    else:
                        tmp = al2.compose(al1)
                        if tmp.seg_to.left > embedding.seg_from.right - 10:
                            embedded_alignments.append(None)
                        else:
                            embedded_alignments.append(
                                tmp.compose(embedding))
                # Retarget the old alignments to the old-contig prefix of the
                # candidate, then merge each with its extension alignment.
                corrected_relevant_alignments = [
                    al.targetAsSegment(
                        new_contig_candidate.asSegment().prefix(
                            len(new_contig))) for al in relevant_als
                ]
                relevant_als = []
                for al1, al2 in zip(corrected_relevant_alignments,
                                    embedded_alignments):
                    if al2 is None:
                        al = al1
                    else:
                        al = al1.mergeDistant(al2)
                        if al is None:
                            # Merge failed: keep the alignment unextended.
                            al = al1
                        elif al1.seg_from.dist(
                                al2.seg_from) >= 10 or al1.seg_to.dist(
                                    al2.seg_to) >= 10:
                            # The merged pieces were at least 10 apart on
                            # the read or on the contig: re-correct the
                            # merged alignment.
                            al = scorer.polyshAlignment(
                                al, params.alignment_correction_radius)
                    relevant_als.append(al)
                finished_als = [
                    al.targetAsSegment(
                        new_contig_candidate.asSegment().prefix(
                            len(new_contig))) for al in finished_als
                ]
                new_contig = new_contig_candidate
                # relevant_als was just rebuilt, so leave the loop over it
                # and start a new round.
                break
            else:
                sys.stdout.trace("Could not prolong with read", base_al,
                                 "Alignments:")
                sys.stdout.trace(map(str, reduced_read_list))
        if len(new_contig) >= max_len:
            break
        if not found:
            break
    return new_contig, relevant_als + finished_als
def mergeLines(self, alignment, k):
    # type: (AlignmentPiece, int) -> NewLine
    """Merge the two lines connected by ``alignment`` into one new line.

    ``alignment`` goes from ``line1`` (``alignment.seg_from.contig``) to
    ``line2`` (``alignment.seg_to.contig``). The method

    1. extends ``line1`` with sequence of ``line2`` if the alignment is
       shorter than ``k + 100``;
    2. cuts the part of ``line1`` to the right of the alignment and the part
       of ``line2`` to the left of it;
    3. corrects ``line1`` so that its end is identical to ``line2``;
    4. registers a new line with the joined sequence through ``self.addNew``
       and fills its ``initial``, ``correct_segments``,
       ``completely_resolved``, ``disjointig_alignments`` and read alignments
       from both lines;
    5. notifies listeners through ``self.notifyMergedLines``, removes both
       old lines and re-ties the knots they had to the new line.

    :param alignment: alignment from the line on the left to the line on the
        right; the two lines must differ.
    :param k: the overlap is prolonged to at least ``k + 100``; also passed
        as the second argument when merging ``completely_resolved``.
    :return: the newly created merged line.
    :raises AssertionError: if both ends of the alignment lie on the same
        line or if one of the internal consistency checks fails.
    """
    sys.stdout.trace("Line operation Merge", alignment.seg_from.contig,
                     alignment.seg_to.contig, alignment)
    line1 = alignment.seg_from.contig  # type: NewLine
    line2 = alignment.seg_to.contig  # type: NewLine
    assert line1 != line2
    if len(alignment) < k + 100:
        sys.stdout.trace(
            "Prolonging line to ensure alignment of at least k")
        # Copy the missing bases from line2 to the end of line1 and extend
        # the alignment with an identical alignment over the copied bases.
        seg = line2.segment(
            alignment.seg_to.right,
            alignment.seg_to.right + k + 100 - len(alignment))
        line1.extendRight(seg.Seq())
        alignment = alignment.mergeDistant(
            AlignmentPiece.Identical(
                line1.asSegment().suffix(length=len(seg)), seg))
    # Cutting hanging tips of both lines.
    # NOTE(review): al_storage is filled here but not read anywhere later in
    # this method -- confirm it is not needed before removing it.
    al_storage = AlignmentStorage()
    al_storage.add(alignment)
    # `storage` listens to both lines while they are being cut; the
    # (presumably updated) alignment is read back from it afterwards.
    storage = TwoLineAlignmentStorage(line1, line2)
    line2.addListener(storage)
    line1.addListener(storage.reverse)
    storage.add(alignment)
    if alignment.seg_from.right < len(line1):
        line1.cutRight(alignment.seg_from.right)
        sys.stdout.trace("Cut right")
        sys.stdout.trace(list(storage.content)[0])
        sys.stdout.trace("\n".join(
            list(storage.content)[0].asMatchingStrings()))
        sys.stdout.trace(list(storage.content)[0].cigar)
    if alignment.seg_to.left > 0:
        # The left tip of line2 is removed by cutting the right end of its
        # reverse complement.
        line2.rc.cutRight(len(line2) - alignment.seg_to.left)
        sys.stdout.trace("Cut left")
        sys.stdout.trace(list(storage.content)[0])
        sys.stdout.trace("\n".join(
            list(storage.content)[0].asMatchingStrings()))
        sys.stdout.trace(list(storage.content)[0].cigar)
    alignment = list(storage.content)[0]  # type: AlignmentPiece
    line2.removeListener(storage)
    line1.removeListener(storage.reverse)

    # Making sure line sequences match on the overlap: the target sequence is
    # the part of line1 before the alignment followed by all of line2.
    if alignment.seg_from.left > 0:
        new_seq = Contig(
            line1.asSegment().prefix(pos=alignment.seg_from.left).Seq() +
            line2.seq, "new_seq")
    else:
        new_seq = Contig(line2.seq, "new_seq")
    al2 = AlignmentPiece.Identical(
        line2.asSegment(),
        new_seq.asSegment().suffix(length=len(line2)))
    sys.stdout.trace("Al2:", al2)
    # line1 -> line2 composed with line2 -> new_seq, then reversed, gives an
    # alignment from new_seq to line1 that is used to correct line1.
    alignment = alignment.compose(al2).reverse()
    sys.stdout.trace("Composed alignment", alignment)
    sys.stdout.trace("\n".join(alignment.asMatchingStrings()))
    sys.stdout.trace(alignment.cigar)
    assert alignment.seg_to.right == len(line1)
    assert alignment.seg_from.left == al2.seg_to.left
    line1.correctSequence([alignment])

    # Now lines have exact match, so the merged sequence starts with line1
    # and ends with line2 (checked by the assertions below).
    name = "(" + ",".join(
        basic.parseLineName(line1.id) + basic.parseLineName(line2.id)) + ")"
    line = self.addNew(new_seq.seq, name)
    assert line.seq.startswith(line1.seq)
    assert line.seq.endswith(line2.seq)
    # Identical embeddings of both old lines into the new one; their target
    # segments are used to transfer all stored data.
    al1 = AlignmentPiece.Identical(
        line1.asSegment(),
        line.asSegment().prefix(length=len(line1)))
    al2 = AlignmentPiece.Identical(
        line2.asSegment(),
        line.asSegment().suffix(length=len(line2)))

    line.initial.addAll(
        line1.initial.targetAsSegment(al1.seg_to).merge(
            line2.initial.targetAsSegment(al2.seg_to)))
    line.correct_segments.addAll(
        line1.correct_segments.contigAsSegment(al1.seg_to).merge(
            line2.correct_segments.contigAsSegment(al2.seg_to)))
    line.completely_resolved.addAll(
        line1.completely_resolved.contigAsSegment(al1.seg_to).merge(
            line2.completely_resolved.contigAsSegment(al2.seg_to), k))
    line.disjointig_alignments.addAll(
        line1.disjointig_alignments.targetAsSegment(al1.seg_to).merge(
            line2.disjointig_alignments.targetAsSegment(al2.seg_to)))
    for al in line1.read_alignments.targetAsSegment(al1.seg_to).merge(
            line2.read_alignments.targetAsSegment(al2.seg_to)):
        line.addReadAlignment(al)
    line1.cleanReadAlignments()
    line2.cleanReadAlignments()

    self.notifyMergedLines(al1, al2)
    # Remember the knots before the old lines are removed.
    knot_right = line2.knot
    knot_left = line1.rc.knot
    self.remove(line1)
    self.remove(line2)
    if knot_right is not None:
        if knot_right.line_right == line1:
            # line2 was tied to line1, so the merged line is tied to itself.
            line.tie(line, knot_right.gap, knot_right.gap_seq)
        else:
            line.tie(knot_right.line_right, knot_right.gap,
                     knot_right.gap_seq)
    # The left knot is re-tied only when it does not point to line2.rc; that
    # case is the same self-connection handled through knot_right above
    # (presumably -- confirm against the knot model).
    if knot_left is not None and knot_left.line_right != line2.rc:
        line.rc.tie(knot_left.line_right, knot_left.gap, knot_left.gap_seq)
    return line