def splitFromContigs(self, contigs, max_contig=50000, cut_size=20000):
    # type: (ContigStorage, int, int) -> None
    for contig in contigs.unique():
        if not basic.isCanonocal(contig.id):
            contig = contig.rc
        if len(contig) > max_contig:
            line1 = self.addNew(contig.seq[:cut_size], "L" + contig.id + "l")
            line2 = self.addNew(contig.seq[-cut_size:], "L" + contig.id + "r")
            line1.initial.add(AlignmentPiece.Identical(contig.asSegment().prefix(length=cut_size), line1.asSegment()))
            line2.initial.add(AlignmentPiece.Identical(contig.asSegment().suffix(length=cut_size), line2.asSegment()))
            line1.tie(line2, len(contig) - 2 * cut_size, contig.seq[cut_size:-cut_size])
        else:
            line = self.addNew(contig.seq, "L" + contig.id)
            line.initial.add(AlignmentPiece.Identical(contig.asSegment(), line.asSegment()))
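# A minimal standalone sketch of the splitting arithmetic above, using plain
# strings instead of ContigStorage/NewLine: a long contig is reduced to its two
# cut_size-long end lines plus the recorded middle gap, and the three pieces
# reassemble into the original sequence. split_ends is a hypothetical helper,
# not part of the codebase.
def split_ends(seq, cut_size):
    left, right = seq[:cut_size], seq[-cut_size:]
    middle = seq[cut_size:-cut_size]  # sequence tied between the two end lines
    assert left + middle + right == seq
    return left, right, len(middle)

left, right, gap = split_ends("A" * 30000 + "C" * 30000, 20000)
print(gap)  # 20000 == len(contig) - 2 * cut_size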
def analyseSegments(self, segs):
    # type: (List[Segment]) -> None
    contigs = ContigStorage()
    contigs.addAll([seg.asContig() for seg in segs if len(seg) > 5000])
    res = []  # type: List[Segment]
    for al in self.aligner.overlapAlign(self.reads, contigs):
        if basic.isCanonocal(al.seg_to.contig.id):
            res.append(al.seg_to)
        else:
            res.append(al.seg_to.RC())
    res = sorted(res, key=lambda seg: (seg.contig.id, seg.left))
    covs = [[0] * params.maxCoverageThreshold for i in range(100)]
    for contig, it in itertools.groupby(res, key=lambda seg: seg.contig):
        segs = list(it)
        shrink = contig.asSegment().shrink(1000)
        bad_seg = False
        for cov, slen in self.covSegments(shrink, segs, 1):
            if cov < 3:
                bad_seg = True
        if bad_seg:
            continue
        for i in range(len(covs)):
            k = 500 + i * 100
            for cov, slen in self.covSegments(shrink, segs, k):
                covs[i][min(cov, len(covs[i]) - 1)] += slen
    # The original compared the histogram list itself to 1000 (always true in
    # Python 2); the intended filter is on the total covered length.
    self.recs = [CoverageAnalyser.CoverageRecord(500 + i * 100, covs[i])
                 for i in range(len(covs)) if sum(covs[i]) > 1000]
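# Sketch of the capped-histogram accumulation used above, assuming covSegments
# yields (coverage, segment_length) pairs: lengths are binned by coverage, and
# everything at or above the last bin is lumped into it. accumulate is a
# hypothetical stand-in for the inner loop.
def accumulate(hist, cov_len_pairs):
    for cov, slen in cov_len_pairs:
        hist[min(cov, len(hist) - 1)] += slen
    return hist

print(accumulate([0] * 5, [(0, 100), (2, 50), (7, 30)]))
# [100, 0, 50, 0, 30] -- coverage 7 lands in the last (overflow) bin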
def unique(self):
    if self.add_rc:
        for item in self.items.values():
            if basic.isCanonocal(item.id):
                yield item
    else:
        for item in UniqueList(self):
            yield item
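# Standalone sketch of canonical-only iteration, assuming the convention that
# a reverse-complement id is the forward id prefixed with "-" (an assumption;
# the real check lives in basic.isCanonocal):
def is_canonical(item_id):
    return not item_id.startswith("-")

items = {"E1": "fwd", "-E1": "rc", "E2": "fwd"}
print(sorted(k for k in items if is_canonical(k)))  # ['E1', 'E2'] -- one per rc pair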
def rcEdge(self, edge):
    # type: (Edge) -> Edge
    if basic.isCanonocal(edge.id) and basic.Reverse(edge.id) not in self.e:
        start = self.v[edge.start]
        end = self.v[edge.end]
        assert len(start.inc) == len(end.out) and len(start.out) == len(end.inc)
        return edge
    return self.e[basic.Reverse(edge.id)]
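# Sketch of the lookup convention assumed above: the rc of edge "X" is stored
# under basic.Reverse("X"), and a self-reverse-complement edge has no separate
# entry, so the lookup falls back to the edge itself. reverse_id and rc_edge
# are hypothetical stand-ins for the repo's basic.Reverse and Graph.rcEdge.
def reverse_id(eid):
    return eid[1:] if eid.startswith("-") else "-" + eid

edges = {"E1": "seq1", "-E1": "rc_seq1", "E2": "palindrome"}

def rc_edge(eid):
    rid = reverse_id(eid)
    return edges[rid] if rid in edges else edges[eid]  # self-rc falls back

print(rc_edge("E1"))  # rc_seq1
print(rc_edge("E2"))  # palindrome -- its own reverse complement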
def addNew(self, seq, name=None):
    # type: (str, Optional[str]) -> NewLine
    if name is None:
        name = "L" + str(self.cnt)
        self.cnt += 1
    else:
        if not basic.isCanonocal(name):
            # The original applied basic.Reverse twice, which is a no-op;
            # a single application canonicalizes the name.
            name = basic.Reverse(name)
    new_line = NewLine(seq, str(name), ExtensionHandler(self.disjointigs, self.aligner))
    self.add(new_line)
    new_line.name_printer = self.name_printer
    new_line.rc.name_printer = self.name_printer
    return new_line
def save(self, handler):
    # type: (TokenWriter) -> None
    keys = [key for key in self.lines.items.keys() if basic.isCanonocal(key)]
    handler.writeTokens(keys)
    for l1, d1 in self.alignmentsToFrom.items():
        if not basic.isCanonocal(l1):
            continue
        for l2, als in d1.items():
            if l1 < basic.Normalize(l2):
                handler.writeToken(l1)
                handler.writeToken(l2)
                handler.newLine()
                als.save(handler)
    handler.writeToken("0")
    handler.writeToken("0")
    handler.newLine()
    for lid in keys:
        storage = self.rc_alignments[lid]
        storage.save(handler)
    for lid in keys:
        storage = self.auto_alignments[lid]
        storage.save(handler)
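# The pair list above is terminated by a "0 0" sentinel rather than a length
# prefix. A self-contained sketch of the matching read loop over a token
# stream (the token list below stands in for what TokenWriter produced; the
# real reader side is assumed, not shown in the source):
def read_pairs(tokens):
    it = iter(tokens)
    pairs = []
    for l1 in it:
        l2 = next(it)
        if l1 == "0" and l2 == "0":
            break
        pairs.append((l1, l2))
    return pairs

print(read_pairs(["L1", "L2", "L1", "L3", "0", "0"]))  # [('L1', 'L2'), ('L1', 'L3')]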
def construct(self, aligner):
    # type: (Aligner) -> None
    for al in aligner.dotplotAlign(self.lines.unique(), self.lines):
        if len(al) > params.k and al.percentIdentity() > 0.8:
            if al.seg_from.contig.id == al.seg_to.contig.id:
                ok = al.seg_from <= al.seg_to
            elif al.seg_from.contig == al.seg_to.contig.rc:
                if basic.isCanonocal(al.seg_from.contig.id):
                    ok = al.seg_from < al.seg_to.RC()
                else:
                    ok = al.seg_from.RC() < al.seg_to
            else:
                ok = basic.canonical(al.seg_from.contig.id) < basic.canonical(al.seg_to.contig.id)
            if ok:
                self.addAlignment(al)
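# Every dotplot alignment is produced in two symmetric copies (once from each
# side), and the ok-predicate above keeps exactly one. A simplified sketch with
# alignments as (from_id, to_id, position) tuples and a plain total order as
# the tie-breaker (an assumption; the real code orders Segment objects and
# handles the rc cases separately):
als = [("A", "B", 10), ("B", "A", 10), ("A", "A", 5)]
kept = [al for al in als if (al[0], al[2]) <= (al[1], al[2])]
print(kept)  # one representative per symmetric pair survives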
def evaluatePI(dir, contigs_file, initial_file, ref_file):
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    initial = ContigStorage().loadFromFasta(open(initial_file, "r"), False)
    ref = ContigStorage().loadFromFasta(open(ref_file, "r"), False)
    segs = []
    for al in aligner.overlapAlign(initial.unique(), contigs):
        if basic.isCanonocal(al.seg_to.contig.id):
            segs.append(al.seg_to)
        else:
            segs.append(al.rc.seg_to)
    segs = sorted(segs, key=lambda seg: basic.Normalize(seg.contig.id))
    interesting = dict()
    print "Interesting segments:"
    for contig in contigs:
        interesting[contig.id] = [contig.asSegment()]
    for contig, segit in itertools.groupby(segs, lambda seg: seg.contig):
        csegs = SegmentStorage().addAll(segit)
        csegs.mergeSegments()
        csegs = csegs.reverse(contig)
        interesting[contig.id] = list(csegs)
        print list(csegs)
    print "Analysis of contigs"
    scorer = Scorer()
    for al in aligner.localAlign(contigs.unique(), ref):
        print al
        for seg in interesting[al.seg_from.contig.id]:
            if al.seg_from.expand(500).contains(seg) or al.seg_from.interSize(seg) > 40000:
                tmp_al = al.reduce(query=al.seg_from.cap(seg))
                # The original printed an undefined name "events"; assuming
                # polyshMatching returns the list of correction events.
                events = scorer.polyshMatching(tmp_al.matchingSequence(), params.score_counting_radius)
                print tmp_al.seg_from, tmp_al.seg_to, str(events)
    print ""
    print "Analysis of initial"
    for al in aligner.overlapAlign(initial, ref):
        events = scorer.polyshMatching(al.matchingSequence(), params.score_counting_radius)
        print al.seg_from, al.seg_to, str(events)
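# The filter above keeps alignments that either contain an interesting segment
# (after 500bp expansion) or overlap it by more than 40kb. A sketch of that
# interval arithmetic with segments as (left, right) pairs (hypothetical
# stand-ins for Segment.contains / Segment.interSize):
def inter_size(a, b):
    return min(a[1], b[1]) - max(a[0], b[0])

def contains(a, b):
    return a[0] <= b[0] and b[1] <= a[1]

al_seg, seg = (1000, 60000), (5000, 50000)
print(contains((al_seg[0] - 500, al_seg[1] + 500), seg))  # True
print(inter_size(al_seg, seg) > 40000)  # True: 45000bp of overlap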
def main(k, dir, contigs_file, reads_file):
    # type: (int, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"), False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
        # if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
        # else:
        #     contigs.add(Contig(c1.seq, str(cnt)))
        #     print cnt, c1.id
        #     cnt += 1
        #     contigs.add(Contig(c2.seq, str(cnt)))
        #     print cnt, c2.id
        #     cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type: AlignedRead
            read.addAlignment(al)
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        if len(read.alignments) > 1:
            SeqIO.write(read, res, "fasta")
    res.close()
def realignLine(self, line):
    # type: (NewLine) -> None
    for storage in self.alignmentsToFrom[line.id].values():
        line_from = storage.line_from  # type: NewLine
        self.alignmentsToFrom[line_from.rc.id][line.rc.id].content.clean()
        self.alignmentsToFrom[line.rc.id][line_from.rc.id].content.clean()
    self.rc_alignments[line.id].content.clean()
    self.rc_alignments[line.rc.id].content.clean()
    self.auto_alignments[line.id].content.clean()
    self.auto_alignments[line.rc.id].content.clean()
    for al in self.aligner.dotplotAlign([line], self.lines):
        if len(al) > params.k and al.percentIdentity() > 0.8:
            if al.seg_from.contig.id == al.seg_to.contig.id:
                ok = al.seg_from <= al.seg_to
            elif al.seg_from.contig == al.seg_to.contig.rc:
                if basic.isCanonocal(al.seg_from.contig.id):
                    ok = al.seg_from < al.seg_to.RC()
                else:
                    ok = al.seg_from.RC() < al.seg_to
            else:
                ok = True
            if ok:
                self.addAlignment(al)
dir = sys.argv[1]
# The original used reads_file and consensus_file without assigning them;
# given argv[1] and argv[4:], they must come from argv[2] and argv[3]
# (the order below is an assumption).
reads_file = sys.argv[2]
consensus_file = sys.argv[3]
extra_params = sys.argv[4:]
CreateLog(dir)
dd = DirDistributor(dir)
aligner = Aligner(dd)
polisher = Polisher(aligner, dd)
reads = ContigStorage().loadFromFasta(open(reads_file, "r"), num_names=False)
ref = ContigStorage().loadFromFasta(open(consensus_file, "r"), num_names=False)
if "accurate" in extra_params:
    res = []
    als = sorted(aligner.overlapAlign(reads, ref), key=lambda al: al.seg_to.contig.id)
    for rid, rals in itertools.groupby(als, key=lambda al: al.seg_to.contig.id):
        if basic.isCanonocal(rid):
            contig = ref[rid]
            corrected_seq = polisher.polishSegment(contig.asSegment(), list(rals)).seg_from.Seq()
            res.append(Contig(corrected_seq, rid))
else:
    res = polisher.polishMany(reads, list(ref.unique()))
res_file = os.path.join(dir, "res.fasta")
rf = open(res_file, "w")
for c in res:
    SeqIO.write(c, rf, "fasta")
rf.close()
aligner.align_files(res_file, [reads_file], 16, "pacbio", "overlap", os.path.join(dir, "res.sam"))
def CreateContigCollection(graph_file, contigs_file, min_cov, aligner, polisher, reads, force_unique, all_unique):
    sys.stdout.info("Creating contig collection")
    if force_unique is None and not all_unique:
        graph = SimpleGraph().ReadDot(graph_file)
        graph.FillSeq(contigs_file)
        covs = []
        for e in graph.e.values():
            covs.append((e.len, e.cov))
        tmp_cov = []
        # Estimate average coverage over the longest edges that together span
        # half of the total edge length. The original summed the wrong tuple
        # field here (coverage instead of length).
        total = sum(l for l, c in covs) / 2
        for l, c in sorted(covs)[::-1]:
            if total < 0:
                break
            tmp_cov.append((l, c))
            total -= l
        avg_cov = float(sum([l * c for l, c in tmp_cov])) / sum(l for l, c in tmp_cov)
        sys.stdout.info("Average coverage determined:", avg_cov)
        nonunique = set()
        for edge in graph.e.values():
            if edge.unique and edge.len < 20000 and len(graph.v[edge.start].out) > 1:
                if edge.cov >= min_cov and (edge.cov < 0.8 * avg_cov or edge.len > 40000):
                    alter = ContigStorage()
                    for e in graph.v[edge.start].out:
                        if e != edge:
                            alter.add(Contig(e.seq, e.id))
                    for al in aligner.localAlign([Contig(edge.seq, edge.id)], alter):  # type: AlignmentPiece
                        if al.percentIdentity() > 0.98 and (al.seg_from.left < 100 and al.seg_to.left < 100 and len(al) > min(500, edge.len)):
                            nonunique.add(edge.id)
                            nonunique.add(basic.Reverse(edge.id))
        contigs = ContigCollection()
        for edge in graph.e.values():
            if basic.isCanonocal(edge.id):
                if edge.unique and (edge.len > params.min_isolated_length or len(graph.v[edge.end].out) > 0 or len(graph.v[edge.start].inc) > 0):
                    if edge.cov >= min_cov and (edge.cov < 1.5 * avg_cov or edge.len > 40000):
                        if edge.id in nonunique:
                            sys.stdout.info("Edge removed based on alignment to alternative:", edge.id, edge.cov, edge.len)
                        else:
                            contigs.add(Contig(edge.seq, edge.id))
                    else:
                        sys.stdout.info("Edge removed based on coverage:", edge.id, edge.cov, edge.len)
                elif (edge.len > 100000 and edge.cov < 1.5 * avg_cov) or (edge.len > 40000 and 1.3 * avg_cov > edge.cov > 0.7 * avg_cov):
                    contigs.add(Contig(edge.seq, edge.id))
                    sys.stdout.info("Edge added based on length and coverage:", edge.id, edge.cov, edge.len)
    elif force_unique is not None:
        sys.stdout.info("Using forced unique edge set")
        sys.stdout.trace(force_unique)
        contigs = ContigCollection().loadFromFile(contigs_file).filter(lambda contig: contig.id in force_unique)
    else:
        sys.stdout.info("Considering all contigs unique")
        contigs = ContigCollection().loadFromFile(contigs_file)
    # contigs.loadFromFasta(open(contigs_file, "r"), num_names=True)
    # contigs = contigs.filter(lambda contig: contig.id not in nonunique and len(contig) > params.k + 20)
    sys.stdout.info("Created", len(contigs), "initial contigs")
    if not all_unique or force_unique is not None:
        sys.stdout.info("Polishing contigs")
        polished_contigs = polisher.polishMany(reads, list(contigs.unique()))
        contigs = ContigCollection().addAll(polished_contigs)
    else:
        sys.stdout.info("Skipping contig polishing step since manual unique contig initialization was used")
    return contigs
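# Standalone sketch of the coverage estimate computed above: take edges from
# longest to shortest until half of the total length is consumed, then average
# their coverage weighted by length. avg_cov_over_longest is a hypothetical
# helper mirroring the inline logic.
def avg_cov_over_longest(covs):  # covs: list of (length, coverage)
    total = sum(l for l, c in covs) / 2.0
    tmp = []
    for l, c in sorted(covs, reverse=True):
        if total < 0:
            break
        tmp.append((l, c))
        total -= l
    return float(sum(l * c for l, c in tmp)) / sum(l for l, c in tmp)

print(avg_cov_over_longest([(100000, 30), (50000, 28), (1000, 300)]))
# 30.0 -- the short high-coverage edge never enters the estimate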
import sys

sys.path.append("py")
from common import basic, SeqIO
from common.SimpleGraph import SimpleGraph

graph = SimpleGraph().ReadGFA(sys.argv[1])
for e_id in graph.e:
    if basic.isCanonocal(e_id):
        SeqIO.write(graph.e[e_id], sys.stdout, "fasta")
def main(flye_dir, output_dir, diploid):
    basic.ensure_dir_existance(output_dir)
    CreateLog(output_dir)
    print "Version:", subprocess.check_output(["git", "rev-parse", "HEAD"])
    print "Modifications:"
    print subprocess.check_output(["git", "diff"])
    graph_file = os.path.join(flye_dir, "20-repeat", "graph_before_rr.gv")
    edge_file = os.path.join(flye_dir, "20-repeat", "graph_before_rr.fasta")
    dump_file = os.path.join(flye_dir, "20-repeat", "read_alignment_dump")
    if diploid:
        calculator = DipolidCalculator(150000)
    else:
        calculator = HaploidCalculator(150000)
    print "Reading graph from", graph_file
    graph = SimpleGraph()
    graph.ReadDot(graph_file)
    print "Reading sequences from", edge_file
    graph.FillSeq(edge_file, True)
    print "Splitting graph", edge_file
    componentRecords, edgecomp = constructComponentRecords(graph, calculator)
    print "Reading alignment dump from", dump_file
    rcnt = 0
    for rid, eids in AlignmentDumpParser(dump_file).parse():
        compids = set()
        eids = map(basic.Normalize, eids)
        for eid in eids:
            for compid in edgecomp[eid]:
                compids.add(compid)
        for compid in compids:
            comp_eids = [eid for eid in eids if eid in componentRecords[compid].component.e]
            if len(comp_eids) == 0:
                print "GOPA", compid, compids, rid, eids
            componentRecords[compid].addRead(rid, eids)
        rcnt += 1
        if rcnt % 100000 == 0:
            print "Processed", rcnt, "reads"
    print "Filling flye repeat resolution results"
    flye_next = FillFlyeNext(componentRecords, os.path.join(flye_dir, "flye.log"))
    for compRec in componentRecords:
        half = compRec.half()
        for norm_eid in compRec.unique:
            for eid in [norm_eid, basic.Reverse(norm_eid)]:
                if eid not in compRec.component.e:
                    assert not basic.isCanonocal(eid)
                    assert basic.Reverse(eid) in compRec.component.e
                    continue
                if compRec.component.e[eid].end in half:
                    if compRec.component.isBorder(compRec.component.e[eid].end):
                        compRec.out += 1
                    if compRec.component.isBorder(compRec.component.e[eid].start):
                        compRec.inc += 1
                    if not compRec.component.isBorder(compRec.component.e[eid].end):
                        if flye_next[eid] is None:
                            compRec.unresolved_connections += 1
                        else:
                            compRec.resolved_connections.append((eid, flye_next[eid]))
                            if flye_next[eid] not in compRec.component.e:
                                compRec.outside_connections += 1
    basic.ensure_dir_existance(output_dir)
    print "Printing components to disk"
    subdataset_dir = os.path.join(output_dir, "subdatasets")
    basic.ensure_dir_existance(subdataset_dir)
    order = range(len(componentRecords))
    order = sorted(order, key=lambda i: componentRecords[i].score())
    componentRecords = [componentRecords[order[i]] for i in range(len(order))]
    basic.ensure_dir_existance(os.path.join(output_dir, "pics"))
    for i, component in enumerate(componentRecords):
        comp_dir = os.path.join(subdataset_dir, str(i))
        component.dump(comp_dir)
        fig_name = os.path.join(comp_dir, "graph.dot")
        component.draw(fig_name, calculator)
        if len(component.component) <= 100:
            fig_file = os.path.join(output_dir, "pics", str(i) + ".dot")
            component.draw(fig_file, calculator)
    table_file = os.path.join(output_dir, "table.txt")
    print "Printing table to file", table_file
    f = open(table_file, "w")
    # Header adjusted to match the fifteen columns written below; the original
    # listed fourteen names and omitted the overcovered count.
    f.write("Id v e unique inc out repeats unresolved resolved outside zero red badborder overcovered score\n")
    for i, compRec in enumerate(componentRecords):
        comp = compRec.component
        f.write(" ".join([str(i), str(len(comp.v)), str(len(comp.e)),
                          str(len(compRec.unique) * 2), str(compRec.inc), str(compRec.out),
                          str(compRec.repeat_edges), str(compRec.unresolved_connections),
                          str(len(compRec.resolved_connections)), str(compRec.outside_connections),
                          str(compRec.zero), str(compRec.red), str(compRec.bad_border),
                          str(compRec.overcovered_edges), str(compRec.score())]) + "\n")
    f.close()
    table_file = os.path.join(output_dir, "list.txt")
    f = open(table_file, "w")
    for a in range(len(componentRecords)):
        f.write(str(a) + "\n")
    f.close()
def main(model_file, k, dir, contigs_file, reads_file):
    # type: (str, int, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.scores = ComplexScores()
    # The original opened an undefined name "model"; the parameter is model_file.
    params.scores.load(open(model_file, "r"))
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"), False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
        # if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
        # else:
        #     contigs.add(Contig(c1.seq, str(cnt)))
        #     print cnt, c1.id
        #     cnt += 1
        #     contigs.add(Contig(c2.seq, str(cnt)))
        #     print cnt, c2.id
        #     cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type: AlignedRead
            read.addAlignment(al)
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        cnt = 0
        al0 = None
        others = []
        for al in read.alignments:
            if not al.contradictingRTC():
                cnt += 1
                al0 = al
            else:
                others.append(al)
        if cnt != 1 or len(others) == 0:
            continue
        print al0
        print others
        seg = al0.seg_from
        for al in others:
            if al.seg_from.interSize(seg) < k:
                seg = None
                break
            else:
                seg = al.seg_from.cap(seg)
        print seg
        if seg is None:
            continue
        al0 = al0.reduce(query=seg)
        others = [al.reduce(query=seg) for al in others]
        scorer = Scorer(params.scores)
        for al in others:
            a, b, c = scorer.scoreCommon(al0, al)
            print "win", a, b, c, len(seg)
        if len(seg) > 1000:
            for i in range(len(seg) / 1000):
                seg1 = seg.prefix(length=i * 1000 + 1000).suffix(length=1000)
                for al in others:
                    a, b, c = scorer.scoreCommon(al0.reduce(query=seg1), al.reduce(query=seg1))
                    print "win1000", a, b, c, len(seg1)
        for al1 in others:
            for al2 in others:
                if al1 == al2:
                    continue
                a, b, c = scorer.scoreCommon(al1, al2)
                print "draw", a, b, c, len(seg)
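# Sketch of the 1kb-window slicing used above: prefix(length=i*1000+1000)
# followed by suffix(length=1000) selects the i-th kilobase of the segment,
# and any incomplete trailing window is dropped. kb_windows is a hypothetical
# helper expressing the same arithmetic on plain coordinates.
def kb_windows(seg_len):
    return [(i * 1000, i * 1000 + 1000) for i in range(seg_len // 1000)]

print(kb_windows(3500))  # [(0, 1000), (1000, 2000), (2000, 3000)]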