def collectRecords(self, corrected):
    # type: (List[Segment]) -> List[LineExtender.Record]
    """Create one record per completely resolved segment reached from ``corrected``.

    Each segment of ``corrected`` is expanded to the left by ``params.k`` and
    looked up in ``self.dot_plot``. For every alignment found there, the
    correct segments of the aligned line that intersect the alignment are
    scanned, and every completely resolved segment inside them that has no
    record yet gets one from ``self.createRecord``. The record is built from
    reads realigned to the line with ``self.aligner.localAlign``.

    Returns the created records as a list.
    """
    # NOTE(review): block structure was reconstructed from a whitespace-mangled
    # source; the clamp of next_start is assumed to apply to every branch.
    sys.stdout.trace("Collecting records", corrected)
    read_bounds = {}
    records = {}  # type: Dict[Segment, LineExtender.Record]
    good_reads = set()
    for seg in corrected:
        sys.stdout.trace("Oppa initial:", seg)
        seg = seg.expandLeft(params.k)
        sys.stdout.trace("Alignments relevant for", seg,
                         list(self.dot_plot.allInter(seg)))
        for dot_al in self.dot_plot.allInter(seg):
            seg1 = dot_al.matchingSequence().mapSegUp(dot_al.seg_from.contig, seg)
            line = dot_al.seg_from.contig  # type:NewLine
            for seg_correct in line.correct_segments.allInter(dot_al.seg_from):
                for seg_resolved in line.completely_resolved.allInter(seg_correct):
                    if seg_resolved in records:
                        # One record per resolved segment is enough.
                        continue

                    # Find where the next resolved segment starts (or the
                    # line end when there is none).
                    line_len = len(line)
                    if seg_resolved.right == line_len:
                        following = None
                    else:
                        following = line.completely_resolved.find(
                            line.asSegment().suffix(pos=seg_resolved.right), 1)
                    next_start = line_len if following is None else following.left
                    next_start = min(next_start, line_len - 200)

                    # Window on the line whose reads are worth realigning.
                    focus_left = max(seg_resolved.left,
                                     min(seg_resolved.right - params.k, seg1.left))
                    focus_right = min(seg_correct.right, next_start + params.k)
                    focus = line.segment(focus_left, focus_right)

                    # Realign every read that has an alignment relevant for
                    # the focus window against the line itself.
                    reads = ContigStorage()
                    for relevant_al in list(line.getRelevantAlignmentsFor(focus)):
                        reads.add(relevant_al.seg_from.contig)
                    als = list(self.aligner.localAlign(reads.unique(),
                                                       ContigStorage([line])))

                    final_als = []
                    sys.stdout.trace("Focus:", focus, seg_resolved)
                    sys.stdout.trace(als)
                    for realigned in als:
                        # Express every alignment on the line's own strand.
                        if realigned.seg_to.contig == line.rc:
                            realigned = realigned.rc
                        if realigned.seg_to.interSize(focus) >= params.k - 100:
                            final_als.append(realigned)
                    sys.stdout.trace(final_als)
                    sys.stdout.trace("Finished realignment of reads")

                    records[seg_resolved] = self.createRecord(
                        seg_resolved, next_start, seg_correct, final_als,
                        good_reads, read_bounds)
    records = list(records.values())  # type: List[LineExtender.Record]
    return records
def evaluatePI(dir, contigs_file, initial_file, ref_file): basic.ensure_dir_existance(dir) CreateLog(dir) dd = DirDistributor(os.path.join(dir, "alignments")) aligner = Aligner(dd) contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False) initial = ContigStorage().loadFromFasta(open(initial_file, "r"), False) ref = ContigStorage().loadFromFasta(open(ref_file, "r"), False) segs = [] for al in aligner.overlapAlign(initial.unique(), contigs): if basic.isCanonocal(al.seg_to.contig.id): segs.append(al.seg_to) else: segs.append(al.rc.seg_to) segs = sorted(segs, key=lambda seg: basic.Normalize(seg.contig.id)) interesting = dict() print "Interesting segments:" for contig in contigs: interesting[contig.id] = [contig.asSegment()] for contig, segit in itertools.groupby(segs, lambda seg: seg.contig): csegs = SegmentStorage().addAll(segit) csegs.mergeSegments() csegs = csegs.reverse(contig) interesting[contig.id] = list(csegs) print list(csegs) print "Analysis of contigs" scorer = Scorer() for al in aligner.localAlign(contigs.unique(), ref): print al for seg in interesting[al.seg_from.contig.id]: if al.seg_from.expand(500).contains( seg) or al.seg_from.interSize(seg) > 40000: tmp_al = al.reduce(query=al.seg_from.cap(seg)) scorer.polyshMatching(tmp_al.matchingSequence(), params.score_counting_radius) print tmp_al.seg_from, tmp_al.seg_to, str(events) print "" print "Analysis of initial" for al in aligner.overlapAlign(initial, ref): scorer.polyshMatching(al.matchingSequence(), params.score_counting_radius) print al.seg_from, al.seg_to, str(events)
def ExtendShortContigs(contigs, reads, aligner, polisher, read_dump):
    # type: (ContigStorage, ReadCollection, Aligner, Polisher, str) -> None
    """Try to extend every contig shorter than ``params.k + 500`` at both ends.

    Alignments of reads to the short contigs are gathered either from the
    reads selected by ``RelevantReadsFromDump`` (when ``read_dump`` is not
    None) or by realigning all ``reads``. Each short contig is then extended
    with ``polisher.polishEnd`` on one end and, using the reverse-complement
    alignments, on the other. Contigs that end up longer than
    ``params.k + 500`` are re-added to ``contigs`` under their old id; the
    others are removed from ``contigs``.
    """
    sys.stdout.info("Extending short lines")
    min_length = params.k + 500

    short_contigs = ContigStorage()
    als = {}  # type: Dict[str, List[AlignmentPiece]]
    for contig in contigs.unique():
        if len(contig) >= min_length:
            continue
        short_contigs.add(contig)
        # Alignments are tracked for both strands, index-parallel.
        als[contig.id] = []
        als[contig.rc.id] = []

    if read_dump is None:
        sys.stdout.trace("Realigning all reads to extend short contigs")
        for al in aligner.overlapAlign(reads, short_contigs):
            # Only keep alignments reaching within 20 positions of both
            # ends of the target contig.
            if not (al.seg_to.left <= 20 and al.rc.seg_to.left <= 20):
                continue
            target_id = al.seg_to.contig.id
            read_id = al.seg_from.contig.id
            for idx, known in enumerate(als[target_id]):
                if not (known.seg_from.contig.id == read_id):
                    continue
                # Same read seen before: keep whichever alignment has the
                # higher percent identity, then stop searching.
                if al.percentIdentity() > known.percentIdentity():
                    als[target_id][idx] = al
                    als[al.seg_to.contig.rc.id][idx] = al.rc
                break
            else:
                als[target_id].append(al)
                als[al.seg_to.contig.rc.id].append(al.rc)
    else:
        sys.stdout.trace("Using flye read dump file to extend short contigs")
        relevant_reads = RelevantReadsFromDump(read_dump, short_contigs, reads)
        for contig in short_contigs:
            contig_reads = relevant_reads[contig.id]
            for al in aligner.overlapAlign(contig_reads, ContigStorage([contig])):
                als[al.seg_to.contig.id].append(al)
                als[al.seg_to.contig.rc.id].append(al.rc)

    for contig in short_contigs.unique():
        tmp_contig = contig
        left_gain = 0
        right_gain = 0
        if len(als[contig.id]) > 0:
            budget = params.l - len(contig)
            tmp_contig, new_als = polisher.polishEnd(
                als[contig.id], params.reliable_coverage, max_extension=budget)
            right_gain = len(tmp_contig) - len(contig)
            # Flip the alignments so the same routine extends the other end.
            flipped = [piece.rc for piece in new_als]
            tmp_contig, new_als = polisher.polishEnd(
                flipped, params.reliable_coverage, max_extension=budget)
            left_gain = len(tmp_contig) - len(contig) - right_gain
        # Disabled trimming logic, kept for reference:
        # if l > params.k / 2 and r > params.k / 2:
        #     tmp_contig.seq = tmp_contig.seq[l - params.k / 2:-r + params.k / 2]
        # else:
        #     tmp_contig.seq = tmp_contig.seq[max(0, l - params.k):-max(1, r - params.k)]
        if len(tmp_contig) > min_length:
            sys.stdout.info("Prolonged contig", contig.id, "for", left_gain, "and",
                            right_gain, "nucleotides from left and right")
            contigs.add(Contig(tmp_contig.rc.seq, contig.id))
        else:
            sys.stdout.warn("Could not prolong contig", contig.id,
                            "enough. Removing it.")
            contigs.remove(contig)
# Polish a consensus with reads and align the reads back to the result.
# NOTE(review): `reads_file` and `consensus_file` are not assigned in this
# fragment; they are presumably set just above it (sys.argv[2] / sys.argv[3],
# given that extra parameters start at sys.argv[4]) -- confirm.
dir = sys.argv[1]
extra_params = sys.argv[4:]
CreateLog(dir)
dd = DirDistributor(dir)
aligner = Aligner(dd)
polisher = Polisher(aligner, dd)

# Load the inputs and release the file handles right away. Assumes
# loadFromFasta reads the whole handle before returning, which the indexed
# access `ref[rid]` below implies.
with open(reads_file, "r") as fasta_in:
    reads = ContigStorage().loadFromFasta(fasta_in, num_names=False)
with open(consensus_file, "r") as fasta_in:
    ref = ContigStorage().loadFromFasta(fasta_in, num_names=False)

if "accurate" in extra_params:
    # Polish each canonical reference contig separately from the reads that
    # overlap it. groupby requires the alignments sorted by the same key.
    res = []
    als = sorted(aligner.overlapAlign(reads, ref),
                 key=lambda al: al.seg_to.contig.id)
    for rid, rals in itertools.groupby(als, key=lambda al: al.seg_to.contig.id):
        if basic.isCanonocal(rid):
            contig = ref[rid]
            corrected_seq = polisher.polishSegment(
                contig.asSegment(), list(rals)).seg_from.Seq()
            res.append(Contig(corrected_seq, rid))
else:
    res = polisher.polishMany(reads, list(ref.unique()))

res_file = os.path.join(dir, "res.fasta")
# `with` guarantees the output is flushed and closed (also on errors) before
# the aligner reads it back.
with open(res_file, "w") as rf:
    for c in res:
        SeqIO.write(c, rf, "fasta")
aligner.align_files(res_file, [reads_file], 16, "pacbio", "overlap",
                    os.path.join(dir, "res.sam"))
from common import basic, params from common.basic import CreateLog from alignment.align_tools import Aligner, DirDistributor from common.line_align import Scorer from common.sequences import ContigStorage if __name__ == "__main__": basic.ensure_dir_existance(sys.argv[1]) CreateLog(sys.argv[1]) reads = ContigStorage().loadFromFile(sys.argv[2]) contigs = ContigStorage().loadFromFile(sys.argv[3]) scorer = Scorer() dd = DirDistributor(sys.argv[1]) aligner = Aligner(dd) for read in reads.unique(): print "Processing read", read als = [ scorer.polyshAlignment(al, params.alignment_correction_radius) for al in aligner.localAlign([read], contigs) ] for al1 in als: for al2 in als: if al1.seg_to.contig == al2.seg_to.contig: continue print al1, "vs", al2 scorer.scoreInCorrectSegments(al1, al1.seg_to.contig.asSegment(), al2, al2.seg_to.contig.asSegment())
def main(contigs_file, contig_name, reads_file, dir, k, initial_reads1, initial_reads2):
    """Split the reads of one contig into two groups and polish one copy per group.

    Args:
        contigs_file: FASTA file containing the contig of interest.
        contig_name: id of that contig inside ``contigs_file``.
        reads_file: FASTA file with all reads.
        dir: output directory (log, ``alignments`` sub-directory, results).
        k: not used by this function.
        initial_reads1: container of read ids seeding the first group.
        initial_reads2: container of read ids seeding the second group.

    Runs three refinement iterations, then writes ``copies.fasta``,
    ``reads1.fasta`` and ``reads2.fasta`` into ``dir``. Progress is printed
    to stdout.

    NOTE(review): the original indentation of this function was lost; the
    extent of the iteration loop was reconstructed from data flow -- confirm.
    NOTE(review): the two input FASTA handles are opened inline and never
    closed.
    """
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    # contig = contigs[contig_name].asSegment().prefix(length=2000).asContig()
    contig = contigs[contig_name]
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    # Seed the two groups from the caller-provided read ids.
    reads1 = ContigStorage()
    reads2 = ContigStorage()
    cnt = 0  # only counted, never read
    for read in reads.unique():
        cnt += 1
        # if cnt % 2 == 0:
        if read.id in initial_reads1:
            reads1.add(read)
        elif read.id in initial_reads2:
            reads2.add(read)
    polisher = Polisher(aligner, dd)
    contig1 = contig
    contig2 = contig
    scorer = Scorer()
    for i in range(3):
        diff = 0
        print "Iteration", i
        # Polish one copy of the ORIGINAL contig from each current read group.
        als1 = fixAlDir(aligner.overlapAlign(reads1.unique(), ContigStorage([contig])), contig)
        als2 = fixAlDir(aligner.overlapAlign(reads2.unique(), ContigStorage([contig])), contig)
        contig1 = Contig(polisher.polishSmallSegment(contig.asSegment(), als1).seg_from.Seq(), "1")
        contig2 = Contig(polisher.polishSmallSegment(contig.asSegment(), als2).seg_from.Seq(), "2")
        al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
        # Realign ALL reads to both copies; keep alignments whose target
        # segment is at most 100 shorter than the copy itself.
        als1 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig1])), contig1)
        als1 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als1)
        als2 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig2])), contig2)
        als2 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als2)
        als1 = sorted(als1, key = lambda al: al.seg_from.contig.id)
        als2 = sorted(als2, key = lambda al: al.seg_from.contig.id)
        # The groups are rebuilt from scratch in every iteration.
        reads1 = ContigStorage()
        reads2 = ContigStorage()
        # dp is only referenced by the commented-out code further down.
        dp = scorer.accurateScore(al.matchingSequence(), 10) #1 - al.percentIdentity()
        # Pair up, per read id, the alignment to copy 1 with the one to copy 2.
        als_map = dict()
        for al in als1:
            als_map[al.seg_from.contig.id] = [al]
        for al in als2:
            if al.seg_from.contig.id in als_map:
                als_map[al.seg_from.contig.id].append(al)
        com_res = []
        diffs = []
        for tmp_als in als_map.values():
            # Skip reads that do not have exactly one alignment to each copy.
            if len(tmp_als) != 2:
                continue
            al1 = tmp_als[0]
            al2 = tmp_als[1]
            print al1, al2
            assert al1.seg_from.contig == al2.seg_from.contig
            pi1 = scorer.accurateScore(al1.matchingSequence(), 10) # al1.percentIdentity()
            pi2 = scorer.accurateScore(al2.matchingSequence(), 10) # al2.percentIdentity()
            com_res.append((al1, al2, pi1 - pi2))
            diffs.append(pi1 - pi2)
        diffs = sorted(diffs)
        # Lower and upper quartile of the score differences (`/` is integer
        # division here, Python 2).
        # NOTE(review): raises IndexError when no read aligned to both copies.
        th1 = diffs[len(diffs) / 4]
        th2 = diffs[len(diffs) * 3 / 4]
        print "Thresholds:", th1, th2
        # Reads below the lower threshold go to group 1, reads above the
        # upper threshold to group 2; the rest are dropped for this round.
        for al1, al2, diff in com_res:
            if diff < th1:
                reads1.add(al1.seg_from.contig)
            elif diff > th2:
                reads2.add(al2.seg_from.contig)
        #     if pi1 > pi2 + dp / 4:
        #         reads1.add(al1.seg_from.contig)
        #     elif pi2 > pi1 + dp / 4:
        #         reads2.add(al2.seg_from.contig)
        #     diff += abs(pi1 - pi2)
        # NOTE(review): `diff` here is the value left by the last loop
        # iteration above, not an accumulated sum; len(als1) == 0 would
        # raise ZeroDivisionError.
        print float(diff) / len(als1), len(reads1) / 2, len(reads2) / 2
        al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
        print al
        print "\n".join(al.asMatchingStrings2())
        # Report which of the seed reads are still in their group.
        for read in reads1:
            if read.id in initial_reads1:
                sys.stdout.write(read.id + " ")
        print ""
        for read in reads2:
            if read.id in initial_reads2:
                sys.stdout.write(read.id + " ")
        print ""
    contig1 = prolong(aligner, polisher, contig1, reads1)
    contig2 = prolong(aligner, polisher, contig2, reads2)
    contig1.id = "1"
    contig2.id = "2"
    out = open(os.path.join(dir, "copies.fasta"), "w")
    SeqIO.write(contig1, out, "fasta")
    SeqIO.write(contig2, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads1.fasta"), "w")
    for read in reads1.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads2.fasta"), "w")
    for read in reads2.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    print "Finished"
def main(contigs_file, contig_name, reads_file, dir, k):
    """Print a per-window text picture of read alignments to one contig.

    Args:
        contigs_file: FASTA file containing the contig of interest.
        contig_name: id of that contig inside ``contigs_file``.
        reads_file: FASTA file with the reads.
        dir: output directory (log, ``alignments`` sub-directory, results).
        k: alignments shorter than this are not drawn.

    For every drawn alignment one line is written to stdout with one
    character per 20-position window of the contig. Reads with a drawn
    alignment are written to ``reads.fasta``; those whose alignment is not
    reported by ``contradictingRTC()`` also go to ``reads_over.fasta``.

    NOTE(review): the original indentation of this function was lost; the
    nesting was reconstructed from data flow -- confirm.
    NOTE(review): the two input FASTA handles are opened inline and never
    closed.
    """
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    contig = contigs[contig_name]
    # From here on `contigs` holds only the selected contig.
    contigs = ContigStorage()
    contigs.add(contig)
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    als = list(aligner.localAlign(reads.unique(), contigs))
    # Flip alignments whose target is not `contig` itself.
    tmp = []
    for al in als:
        if al.seg_to.contig != contig:
            al = al.rc
        tmp.append(al)
    als = tmp
    # Order by start position in buckets of 50 (integer division), then by
    # length of the target segment.
    als = sorted(als, key=lambda al: al.seg_to.left / 50 * 1000000 + al.seg_to.right - al.seg_to.left)
    # Number of alignments longer than k per read id.
    counts = dict()
    for al in als:
        counts[al.seg_from.contig.id] = 0
    for al in als:
        if len(al) > k:
            counts[al.seg_from.contig.id] += 1
    w = 20  # window width used for the picture
    f = open(os.path.join(dir, "reads.fasta"), "w")
    over = set()
    inter = set()
    for al in als:
        if len(al) < k:
            continue
        inter.add(basic.Normalize(al.seg_from.contig.id))
        if not al.contradictingRTC():
            over.add(basic.Normalize(al.seg_from.contig.id))
        m = al.matchingSequence(True)
        # tmp[i] collects the matched position pairs whose second coordinate
        # falls into window i.
        tmp = []
        for i in range(len(contig) / w + 1):
            tmp.append([])
        for a, b in m.matches:
            tmp[b / w].append((a, b))
        # Append the first pair of the next window so the span of each
        # window reaches up to it.
        for i in range(len(contig) / w):
            if i + 1 < len(tmp) and len(tmp[i + 1]) > 0:
                tmp[i].append(tmp[i + 1][0])
        for i in range(len(contig) / w):
            seg = contig.segment(i * w, i * w + w)
            if al.seg_to.inter(seg):
                if al.seg_to.left >= seg.left and al.seg_from.left > params.bad_end_length:
                    # Alignment starts in this window and more than
                    # bad_end_length of the read precedes it.
                    sys.stdout.write("B")
                elif al.seg_to.right <= seg.right and al.rc.seg_from.left > params.bad_end_length:
                    # Same check at the alignment end, via the reverse
                    # complement.
                    sys.stdout.write("E")
                else:
                    if len(tmp[i]) == 0:
                        sys.stdout.write("*")
                    else:
                        # a / b: span of the first / second coordinates of
                        # the pairs in this window; their difference picks
                        # the I/i/D/d marker.
                        a = tmp[i][-1][0] - tmp[i][0][0]
                        b = tmp[i][-1][1] - tmp[i][0][1]
                        if a - b > 30:
                            sys.stdout.write("I")
                        elif a - b > 15:
                            sys.stdout.write("i")
                        elif a - b < -30:
                            sys.stdout.write("D")
                        elif a - b < -15:
                            sys.stdout.write("d")
                        else:
                            # Digit capped at 8.
                            sys.stdout.write(
                                str(min(8, max(a, b) + 1 - len(tmp[i]))))
            else:
                sys.stdout.write("*")
        print " ", al.seg_from.contig.id, counts[
            al.seg_from.contig.id], al.contradictingRTC()
    print inter
    for rid in inter:
        SeqIO.write(reads[rid], f, "fasta")
        print rid, reads[rid]
    f.close()
    f = open(os.path.join(dir, "reads_over.fasta"), "w")
    for rid in over:
        SeqIO.write(reads[rid], f, "fasta")
    f.close()