def main(args):
    rf = args[2]
    dir = args[3]
    CreateLog(dir)
    disjointigs = ContigCollection().loadFromFasta(open(args[1], "r"), num_names=False)
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    clen = 5000000
    reads = ReadCollection().loadFromFasta(open(rf, "r"))
    tlen0 = sum(map(len, reads))
    for i in range(10):
        # Collect reads that are consistently explained by the current disjointigs.
        good_reads = set()
        for al in aligner.localAlign(reads, disjointigs):
            if not al.contradictingRTC(al.seg_to.contig.asSegment(), 500):
                good_reads.add(al.seg_from.contig.id)
        # Keep only the unexplained reads and reassemble them with Flye,
        # scaling the genome-size estimate by the remaining read fraction.
        rf = os.path.join(dir, "reads" + str(i) + ".fasta")
        reads = reads.filter(lambda read: read.id not in good_reads).cleanCopy()
        tlen = sum(map(len, reads))
        reads.print_fasta(open(rf, "w"))
        l = tlen * clen / tlen0
        assembly_dir = os.path.join(dir, "assembly" + str(i))
        subprocess.check_call(["./bin/flye", "-o", assembly_dir, "-t", "8", "--pacbio-raw", rf,
                               "--genome-size", str(l), "--no-trestle"])
        df = os.path.join(assembly_dir, "10-consensus", "consensus.fasta")
        disjointigs.addAll(ContigCollection().loadFromFasta(open(df, "r"), num_names=False))
        df = os.path.join(dir, "df" + str(i) + ".fasta")
        disjointigs.print_fasta(open(df, "w"))
def main(contig_file, reads_file, sam_file, dir, contig_id):
    # type: (str, str, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    contigs = ContigCollection()
    contigs.loadFromFasta(open(contig_file, "r"))
    print "Contigs loaded"
    contig = contigs[contig_id]
    read_names = set()
    for rec in Samfile(open(sam_file, "r")):
        read_names.add(rec.query_name)
    reads = ReadCollection()
    cnt = 0
    for rec in SeqIO.parse_fasta(open(reads_file, "r")):
        if rec.id in read_names:
            rec.id = "Read" + str(cnt)
            reads.add(AlignedRead(rec))
            cnt += 1
    reads.print_fasta(open(os.path.join(dir, "reads.fasta"), "w"))
    print "Reads loaded", len(reads)
    reads.addAllRC()
    print "RC added", len(reads)
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    aligner.alignReadCollection(reads, contigs)
    print "Reads aligned", len(reads)
    reads = reads.inter(contig.asSegment())
    print "Reads filtered", len(reads)
    sorted_reads = sorted(list(reads.reads.values()),
                          key=lambda read: read.alignmentsTo(contig.asSegment()).next().seg_to.left)
    for read in sorted_reads:
        print read
        for al in read.alignmentsTo(contig.asSegment()):
            print "\n".join(al.asMatchingStrings())
def nextDir(self):
    # type: () -> str
    name = os.path.join(self.dir, str(self.cur_dir))
    if params.save_alignments:
        self.cur_dir += 1
    assert self.cur_dir <= 100000
    basic.ensure_dir_existance(name)
    return name
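# Usage sketch (an assumption, mirroring how the main() functions in this
# codebase construct the distributor; the literal path is illustrative):
#
#   dd = DirDistributor(os.path.join("out", "alignments"))
#   work_dir = dd.nextDir()  # "out/alignments/0"; the counter only advances
#                            # when params.save_alignments is set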
def __init__(self, dir, lines, dot_plot, reads, aligner):
    # type: (str, NewLineStorage, LineDotPlot, ReadCollection, Aligner) -> None
    self.dir = dir
    basic.ensure_dir_existance(dir)
    self.lines = lines
    self.dot_plot = dot_plot
    self.reads = reads
    self.aligner = aligner
    self.cnt = 0
def constructDisjointigs(reads, total_length, dir):
    # type: (ReadCollection, int, str) -> str
    basic.ensure_dir_existance(dir)
    reads_file = os.path.join(dir, "reads.fasta")
    disjointigs_file = os.path.join(dir, "disjointigs.fasta")
    log_file = os.path.join(dir, "log.txt")
    reads.print_fasta(open(reads_file, "w"))
    subprocess.check_call([os.path.join(params.bin_path, "flye-modules"), "assemble",
                           "--reads", reads_file, "--out-asm", disjointigs_file,
                           "--genome-size", str(total_length),
                           "--config", "flye/config/bin_cfg/asm_raw_reads.cfg",
                           "--min-ovlp", "1500", "--threads", str(params.threads),
                           "--log", log_file])
    return disjointigs_file
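# Usage sketch (hedged; the file name and the 5Mb genome-size estimate are
# illustrative, not taken from this codebase):
#
#   reads = ReadCollection().loadFromFasta(open("reads.fasta", "r"))
#   disjointigs_file = constructDisjointigs(reads, 5000000, "work/disjointigs")
#   print "disjointigs written to", disjointigs_file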
def CreateLog(dir):
    old_logs_dir = os.path.join(dir, "old")
    basic.ensure_dir_existance(old_logs_dir)
    log_file = os.path.join(dir, "log.info")
    if os.path.isfile(log_file):
        num = len(os.listdir(old_logs_dir))
        shutil.copy(log_file, os.path.join(old_logs_dir, str(num) + ".log"))
    log = open(log_file, "w")
    sys.stdout = basic.OStreamWrapper(sys.stdout, log)
    sys.stdout.prefix = lambda s: time.strftime("%I:%M:%S") + " "
    sys.stderr = sys.stdout
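# Usage sketch (hedged; the directory name is illustrative): after CreateLog,
# ordinary print statements are timestamped and mirrored into dir/log.info,
# and any previous log.info is rotated into dir/old/<n>.log.
#
#   CreateLog("out")
#   print "goes to the console and to out/log.info"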
def __init__(self, dir, clean=False):
    # type: (str, bool) -> None
    self.dir = dir
    if clean:
        basic.recreate(self.dir)
    else:
        basic.ensure_dir_existance(self.dir)
    self.cnt = 0
    for name in os.listdir(self.dir):
        num = basic.parseNumber(name)
        if num is not None and num >= self.cnt:
            self.cnt = num + 1
def dump(self, dirname):
    basic.ensure_dir_existance(dirname)
    edge_file = os.path.join(dirname, "edges.txt")
    stats_file = os.path.join(dirname, "stats.txt")
    init_file = os.path.join(dirname, "init.txt")
    reads_file = os.path.join(dirname, "reads.txt")
    contigs_file = os.path.join(dirname, "contigs.fasta")
    self.printStats(stats_file)
    self.printEdges(edge_file)
    self.printInit(init_file)
    self.printReads(reads_file)
    self.printContigs(contigs_file)
def main(args):
    dir = args[4]
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    sys.stdout.info("Starting graph-free recruitment")
    print " ".join(args)
    sys.stdout.info("Loading repeat sequences")
    seqs = ContigStorage().loadFromFasta(open(args[1], "r"), False)
    sys.stdout.info("Loading reads")
    reads = ContigStorage().loadFromFasta(open(args[2], "r"), False)
    k = int(args[3])
    recruit(seqs, reads, k, dir)
    sys.stdout.info("Finished graph-free recruitment")
def main(reads_file, ref_file, dir, error_rate):
    sys.stderr.write("Reading reference\n")
    ref = sorted(list(SeqIO.parse_fasta(open(ref_file, "r"))), key=lambda rec: len(rec))[-1]
    ref = Contig(ref.seq, ref.id)
    refs = ContigCollection()
    for i in range(0, len(ref) - 500, 500):
        if random.random() > 0.95:
            tmp = list(ref.segment(i, i + 500).Seq())
            for j in range(error_rate * 500 / 100):
                pos = random.randint(0, 499)
                tmp[pos] = basic.rc[tmp[pos]]
            refs.add(Contig("".join(tmp), ref.id + "(" + str(i) + "," + str(i + 500) + ")"))
    refs.print_names(sys.stderr)
    sys.stderr.write("Reading reads\n")
    reads = ReadCollection()
    reads.loadFromFasta(open(reads_file, "r"))
    sys.stderr.write("Aligning reads\n")
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, refs)
    sys.stderr.write("Analysing alignments\n")
    alignments = []
    for read in reads:
        alignments.extend(read.alignments)
    alignments = filter(lambda al: len(al) > 450, alignments)
    alignments = sorted(alignments, key=lambda al: (al.seg_to.contig.id, al.seg_from.contig.id))
    scorer = Scorer()
    scorer.scores.homo_score = 3
    scorer.scores.ins_score = 5
    scorer.scores.del_score = 5
    cnt = 0
    for contig, iter in itertools.groupby(alignments, key=lambda al: al.seg_to.contig):
        iter = list(iter)
        sys.stderr.write(str(contig) + " " + str(len(iter)) + "\n")
        if len(iter) < 150:
            for al in iter:
                print scorer.accurateScore(al.matchingSequence(), params.alignment_correction_radius)
                cnt += 1
                if cnt >= 5000:
                    break
        if cnt >= 5000:
            break
def polish(self, reads, consensus):
    # type: (Iterable[NamedSequence], Contig) -> str
    dir, new_files, same = self.dir_distributor.fillNextDir([([consensus], "ref.fasta"), (reads, "reads.fasta")])
    consensus_file_name = new_files[0]
    reads_file_name = new_files[1]
    args = FakePolishingArgs()
    basic.ensure_dir_existance(os.path.join(dir, "work"))
    job = JobPolishing(args, os.path.join(dir, "work"), os.path.join(dir, "log.info"),
                       [reads_file_name], consensus_file_name, "polish")
    polished_file = job.out_files["contigs"]
    if same and not params.clean and os.path.exists(polished_file):
        sys.stdout.trace("Polishing reused:", polished_file)
    else:
        sys.stdout.trace("Running polishing:", polished_file)
        job.run()
    return list(SeqIO.parse_fasta(open(polished_file, "r")))[0].seq
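# Usage sketch (hedged): polishing a draft contig with its reads. The
# Polisher(aligner, dd) construction mirrors its use elsewhere in this
# codebase; "draft" stands for any Contig.
#
#   polisher = Polisher(aligner, dd)
#   polished_seq = polisher.polish(reads, draft)  # returns the polished sequence string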
def polishMany(self, reads, sequences):
    # type: (Iterable[AlignedRead], List[Contig]) -> List[Contig]
    dir, new_files, same = self.dir_distributor.fillNextDir([(list(sequences), "ref.fasta"), (reads, "reads.fasta")])
    consensus_file_name = new_files[0]
    reads_file_name = new_files[1]
    args = FakePolishingArgs()
    basic.ensure_dir_existance(os.path.join(dir, "work"))
    job = JobPolishing(args, os.path.join(dir, "work"), os.path.join(dir, "log.info"),
                       [reads_file_name], consensus_file_name, "polish")
    polished_file = job.out_files["contigs"]
    if same and not params.clean and os.path.exists(polished_file):
        sys.stdout.trace("Polishing reused:", polished_file)
    else:
        sys.stdout.trace("Running polishing:", polished_file)
        job.run()
    return map(lambda rec: Contig(rec.seq, rec.id), SeqIO.parse_fasta(open(polished_file, "r")))
def align(self, reads, reference, mode):
    # type: (Iterable[NamedSequence], Iterable[Contig], str) -> sam_parser.Samfile
    reference = list(reference)
    dir, new_files, same = self.dir_distributor.fillNextDir([(reference, "contigs.fasta"), (list(reads), "reads.fasta")])
    contigs_file = new_files[0]
    reads_file = new_files[1]
    alignment_dir = os.path.join(dir, "alignment")
    alignment_file = os.path.join(dir, "alignment.sam")
    basic.ensure_dir_existance(dir)
    basic.ensure_dir_existance(alignment_dir)
    if same and not params.clean and os.path.exists(alignment_file):
        sys.stdout.log(common.log_params.LogPriority.alignment_files, "Alignment reused:", alignment_file)
    else:
        if os.path.isfile(alignment_file):
            os.remove(alignment_file)
        self.align_files(contigs_file, [reads_file], self.threads, params.technology, mode, alignment_file)
    return sam_parser.Samfile(open(alignment_file, "r"))
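# Usage sketch (hedged): consuming the SAM records returned by align(). Only
# query_name is shown because it is the field used elsewhere in this codebase;
# the "local" mode string is an assumption.
#
#   for rec in aligner.align(reads, contigs, "local"):
#       print rec.query_name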
def extractSubgraph(dir, flye_dir, contigs):
    basic.ensure_dir_existance(dir)
    d = parseUPaths(flye_dir)
    edge_ids = []
    for contig in contigs:
        for s in d[contig]:
            edge_ids.append(s)
    graph = SimpleGraph().ReadDot(os.path.join(flye_dir, "20-repeat", "graph_after_rr.gv"))
    vertex_ids = set()
    total_len = 0  # renamed from `len`, which shadowed the builtin
    for eid in edge_ids:
        total_len += graph.e[eid].len
        vertex_ids.add(graph.e[eid].start)
        vertex_ids.add(graph.e[eid].end)
        if total_len > 10000:
            break
    # print "{|}|" + "|".join(["id " + r + "\\\\" for r in edge_ids])
    print "{|}|" + "|".join(["\"" + str(r) + "\"" for r in vertex_ids])
def main(ref_file, contig_size, rlen, cov, dir):
    basic.ensure_dir_existance(dir)
    all_contigs = ContigCollection().loadFromFasta(open(ref_file, "r"), False)
    contig_file_name = os.path.join(dir, "contigs.fasta")
    contig_file = open(contig_file_name, "w")
    reads_file_name = os.path.join(dir, "reads.fasta")
    reads_file = open(reads_file_name, "w")
    for ref in all_contigs.unique():
        if len(ref) < contig_size:
            continue
        SeqIO.write(ref, contig_file, "fasta")
        for i in range(0, len(ref), max(1, rlen / cov)):
            read = ref.segment(i, min(i + rlen, len(ref))).asNamedSequence()
            SeqIO.write(read, reads_file, "fasta")
    reads_file.close()
    contig_file.close()
    print "Done"
    print contig_file_name
    print reads_file_name
def evaluatePI(dir, contigs_file, initial_file, ref_file):
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    initial = ContigStorage().loadFromFasta(open(initial_file, "r"), False)
    ref = ContigStorage().loadFromFasta(open(ref_file, "r"), False)
    segs = []
    for al in aligner.overlapAlign(initial.unique(), contigs):
        if basic.isCanonocal(al.seg_to.contig.id):
            segs.append(al.seg_to)
        else:
            segs.append(al.rc.seg_to)
    segs = sorted(segs, key=lambda seg: basic.Normalize(seg.contig.id))
    interesting = dict()
    print "Interesting segments:"
    for contig in contigs:
        interesting[contig.id] = [contig.asSegment()]
    for contig, segit in itertools.groupby(segs, lambda seg: seg.contig):
        csegs = SegmentStorage().addAll(segit)
        csegs.mergeSegments()
        csegs = csegs.reverse(contig)
        interesting[contig.id] = list(csegs)
        print list(csegs)
    print "Analysis of contigs"
    scorer = Scorer()
    for al in aligner.localAlign(contigs.unique(), ref):
        print al
        for seg in interesting[al.seg_from.contig.id]:
            if al.seg_from.expand(500).contains(seg) or al.seg_from.interSize(seg) > 40000:
                tmp_al = al.reduce(query=al.seg_from.cap(seg))
                # `events` was otherwise undefined here; assuming polyshMatching
                # returns the list of correction events that is printed below.
                events = scorer.polyshMatching(tmp_al.matchingSequence(), params.score_counting_radius)
                print tmp_al.seg_from, tmp_al.seg_to, str(events)
    print ""
    print "Analysis of initial"
    for al in aligner.overlapAlign(initial, ref):
        events = scorer.polyshMatching(al.matchingSequence(), params.score_counting_radius)
        print al.seg_from, al.seg_to, str(events)
def main(ref_file, segment, dir):
    ref = ContigCollection().loadFromFasta(open(ref_file, "r"), False)
    chr1 = ref["chr1"]
    if segment[0] < 0:
        segment = (-segment[0], -segment[1])
        chr1 = chr1.rc
    reads = ReadCollection()
    reads_list = []
    for i in range(segment[0], segment[1], 500):
        read = reads.addNewRead(Segment(chr1, i, i + 500).asNamedSequence())
        reads_list.append(read)
    chr1.seq = chr1.seq[:segment[0]] + "N" * (segment[1] - segment[0]) + chr1.seq[segment[1]:]
    chr1.rc.seq = basic.RC(chr1.seq)
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, ref)
    out = sys.stdout
    for read in reads_list:
        # print read
        out.write(str(len(read.alignments)) + " " +
                  str(max([0] + map(lambda al: al.percentIdentity(), read.alignments))) + "\n")
    out.close()
def main(k, dir, contigs_file, reads_file):
    # type: (int, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"), False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
        # if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
        # else:
        #     contigs.add(Contig(c1.seq, str(cnt)))
        #     print cnt, c1.id
        #     cnt += 1
        #     contigs.add(Contig(c2.seq, str(cnt)))
        #     print cnt, c2.id
        #     cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type: AlignedRead
            read.addAlignment(al)
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        if len(read.alignments) > 1:
            SeqIO.write(read, res, "fasta")
    res.close()
def main(flye_dir, rf, dir, edge_id, k):
    params.technology = "nano"
    params.k = k
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    print "Reading graph"
    graph = SimpleGraph().ReadGFA(os.path.join(flye_dir, "assembly_graph.gfa"))
    print "Parsing edge mapping"
    id_map = parseUPaths(flye_dir)
    edge_ids = edge_id.split(",")
    print "Extracting relevant graph component"
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    unique = dict()
    for eid in edge_ids:
        for e in graph.v[graph.e[eid].start].inc:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "l"] = NamedSequence(basic.RC(e.seq[:5000]), e.id[1:] + "l")
                else:
                    unique[e.id + "r"] = NamedSequence(e.seq[-5000:], e.id + "r")
        for e in graph.v[graph.e[eid].end].out:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "r"] = NamedSequence(basic.RC(e.seq[-5000:]), e.id[1:] + "r")
                else:
                    unique[e.id + "l"] = NamedSequence(e.seq[:5000], e.id + "l")
    for c in unique.values():
        print c.id
        SeqIO.write(c, res, "fasta")
    res.close()
    old_ids = []
    for eid in edge_ids:
        for olde in id_map[eid[len("edge_"):]]:
            old_ids.append(basic.Normalize(olde))
    print "Finding reads that align to", edge_ids
    print "Old ids:", old_ids
    relevant_read_ids = set()
    for s in open(os.path.join(flye_dir, "20-repeat", "read_alignment_dump"), "r").readlines():
        s = s.split()
        if s[0] != "Aln":
            continue
        if s[6].split("_")[1] in old_ids:
            relevant_read_ids.add(s[2][1:])
            print s[2][1:], s[6].split("_")[1]
    print "Reading reads"
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in SeqIO.parse_fasta(open(rf, "r")):
        if read.id in relevant_read_ids and len(read) > k * 1.2:
            SeqIO.write(read, res, "fasta")
    res.close()
import sys
import os

sys.path.append("py")
from common.SimpleGraph import SimpleGraph
from common import basic

g = SimpleGraph()
g.ReadDot(sys.argv[1])
basic.ensure_dir_existance(sys.argv[2])
args = sys.argv[3:]
if "merge" in args:
    g = g.Merge()
cnt = 0
oppa = []
for comp in g.Split(1000000000):
    if len(comp) < 3:
        if len(g.v[comp[0]].inc) + len(g.v[comp[0]].out) + len(g.v[comp[-1]].inc) + len(g.v[comp[-1]].out) <= 2:
            # Trivial isolated path: draw it on its own.
            pass
        else:
            # Accumulate small connected components and draw them in batches.
            oppa.extend(comp)
            if len(oppa) > 30:
                comp = list(oppa)
                oppa = []
            else:
                continue
    print cnt, len(comp)
    f = open(os.path.join(sys.argv[2], str(cnt) + ".dot"), "w")
    g.Draw(comp, f)
    f.close()
    cnt += 1  # assumed missing increment; without it every component would overwrite 0.dot
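# Invocation sketch (hedged; the script name is hypothetical): arguments are
# the input .dot graph, an output directory, and optionally "merge" to
# collapse non-branching paths before splitting.
#
#   python split_graph.py graph.dot out_components merge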
def main(contigs_file, contig_name, reads_file, dir, k):
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    contig = contigs[contig_name]
    contigs = ContigStorage()
    contigs.add(contig)
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    als = list(aligner.localAlign(reads.unique(), contigs))
    tmp = []
    for al in als:
        if al.seg_to.contig != contig:
            al = al.rc
        tmp.append(al)
    als = tmp
    als = sorted(als, key=lambda al: al.seg_to.left / 50 * 1000000 + al.seg_to.right - al.seg_to.left)
    counts = dict()
    for al in als:
        counts[al.seg_from.contig.id] = 0
    for al in als:
        if len(al) > k:
            counts[al.seg_from.contig.id] += 1
    w = 20
    f = open(os.path.join(dir, "reads.fasta"), "w")
    over = set()
    inter = set()
    for al in als:
        if len(al) < k:
            continue
        inter.add(basic.Normalize(al.seg_from.contig.id))
        if not al.contradictingRTC():
            over.add(basic.Normalize(al.seg_from.contig.id))
        m = al.matchingSequence(True)
        # Bucket the matching positions into windows of w contig bases.
        tmp = []
        for i in range(len(contig) / w + 1):
            tmp.append([])
        for a, b in m.matches:
            tmp[b / w].append((a, b))
        for i in range(len(contig) / w):
            if i + 1 < len(tmp) and len(tmp[i + 1]) > 0:
                tmp[i].append(tmp[i + 1][0])
        # Per-window glyphs: B/E = alignment break near the read start/end,
        # I/i and D/d = strong/weak insertion or deletion surplus,
        # 0-8 = unmatched-position count, * = no alignment in the window.
        for i in range(len(contig) / w):
            seg = contig.segment(i * w, i * w + w)
            if al.seg_to.inter(seg):
                if al.seg_to.left >= seg.left and al.seg_from.left > params.bad_end_length:
                    sys.stdout.write("B")
                elif al.seg_to.right <= seg.right and al.rc.seg_from.left > params.bad_end_length:
                    sys.stdout.write("E")
                else:
                    if len(tmp[i]) == 0:
                        sys.stdout.write("*")
                    else:
                        a = tmp[i][-1][0] - tmp[i][0][0]
                        b = tmp[i][-1][1] - tmp[i][0][1]
                        if a - b > 30:
                            sys.stdout.write("I")
                        elif a - b > 15:
                            sys.stdout.write("i")
                        elif a - b < -30:
                            sys.stdout.write("D")
                        elif a - b < -15:
                            sys.stdout.write("d")
                        else:
                            sys.stdout.write(str(min(8, max(a, b) + 1 - len(tmp[i]))))
            else:
                sys.stdout.write("*")
        print " ", al.seg_from.contig.id, counts[al.seg_from.contig.id], al.contradictingRTC()
    print inter
    for rid in inter:
        SeqIO.write(reads[rid], f, "fasta")
        print rid, reads[rid]
    f.close()
    f = open(os.path.join(dir, "reads_over.fasta"), "w")
    for rid in over:
        SeqIO.write(reads[rid], f, "fasta")
    f.close()
def main(flye_dir, output_dir, diploid):
    basic.ensure_dir_existance(output_dir)
    CreateLog(output_dir)
    print "Version:", subprocess.check_output(["git", "rev-parse", "HEAD"])
    print "Modifications:"
    print subprocess.check_output(["git", "diff"])
    graph_file = os.path.join(flye_dir, "20-repeat", "graph_before_rr.gv")
    edge_file = os.path.join(flye_dir, "20-repeat", "graph_before_rr.fasta")
    dump_file = os.path.join(flye_dir, "20-repeat", "read_alignment_dump")
    if diploid:
        calculator = DipolidCalculator(150000)
    else:
        calculator = HaploidCalculator(150000)
    print "Reading graph from", graph_file
    graph = SimpleGraph()
    graph.ReadDot(graph_file)
    print "Reading sequences from", edge_file
    graph.FillSeq(edge_file, True)
    print "Splitting graph", edge_file
    componentRecords, edgecomp = constructComponentRecords(graph, calculator)
    print "Reading alignment dump from", dump_file
    rcnt = 0
    for rid, eids in AlignmentDumpParser(dump_file).parse():
        compids = set()
        eids = map(basic.Normalize, eids)
        for eid in eids:
            for compid in edgecomp[eid]:
                compids.add(compid)
        for compid in compids:
            comp_eids = [eid for eid in eids if eid in componentRecords[compid].component.e]
            if len(comp_eids) == 0:
                print "GOPA", compid, compids, rid, eids
            componentRecords[compid].addRead(rid, eids)
        rcnt += 1
        if rcnt % 100000 == 0:
            print "Processed", rcnt, "reads"
    print "Filling flye repeat resolution results"
    flye_next = FillFlyeNext(componentRecords, os.path.join(flye_dir, "flye.log"))
    for compRec in componentRecords:
        half = compRec.half()
        for norm_eid in compRec.unique:
            for eid in [norm_eid, basic.Reverse(norm_eid)]:
                if eid not in compRec.component.e:
                    assert not basic.isCanonocal(eid)
                    assert basic.Reverse(eid) in compRec.component.e
                    continue
                if compRec.component.e[eid].end in half:
                    if compRec.component.isBorder(compRec.component.e[eid].end):
                        compRec.out += 1
                    if compRec.component.isBorder(compRec.component.e[eid].start):
                        compRec.inc += 1
                    if not compRec.component.isBorder(compRec.component.e[eid].end):
                        if flye_next[eid] is None:
                            compRec.unresolved_connections += 1
                        else:
                            compRec.resolved_connections.append((eid, flye_next[eid]))
                            if flye_next[eid] not in compRec.component.e:
                                compRec.outside_connections += 1
    basic.ensure_dir_existance(output_dir)
    print "Printing components to disk"
    subdataset_dir = os.path.join(output_dir, "subdatasets")
    basic.ensure_dir_existance(subdataset_dir)
    order = range(len(componentRecords))
    order = sorted(order, key=lambda i: componentRecords[i].score())
    componentRecords = [componentRecords[order[i]] for i in range(len(order))]
    basic.ensure_dir_existance(os.path.join(output_dir, "pics"))
    for i, component in enumerate(componentRecords):
        comp_dir = os.path.join(subdataset_dir, str(i))
        component.dump(comp_dir)
        fig_name = os.path.join(comp_dir, "graph.dot")
        component.draw(fig_name, calculator)
        if len(component.component) <= 100:
            fig_file = os.path.join(output_dir, "pics", str(i) + ".dot")
            component.draw(fig_file, calculator)
    table_file = os.path.join(output_dir, "table.txt")
    print "Printing table to file", table_file
    f = open(table_file, "w")
    # "overcovered" added to the header as an assumption: the rows below write
    # 15 fields, one more than the original 14-column header named.
    f.write("Id v e unique inc out repeats unresolved resolved outside zero hub badborder overcovered score\n")
    for i, compRec in enumerate(componentRecords):
        comp = compRec.component
        f.write(" ".join([str(i), str(len(comp.v)), str(len(comp.e)), str(len(compRec.unique) * 2),
                          str(compRec.inc), str(compRec.out), str(compRec.repeat_edges),
                          str(compRec.unresolved_connections), str(len(compRec.resolved_connections)),
                          str(compRec.outside_connections), str(compRec.zero), str(compRec.red),
                          str(compRec.bad_border), str(compRec.overcovered_edges),
                          str(compRec.score())]) + "\n")
    f.close()
    table_file = os.path.join(output_dir, "list.txt")
    f = open(table_file, "w")
    for a in range(len(componentRecords)):
        f.write(str(a) + "\n")
    f.close()
def main(flye_dir, rf, dir, edge_id, to_resolve, min_contig_length):
    params.technology = "nano"
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    print " ".join(sys.argv)
    print "Reading graph"
    graph = SimpleGraph().ReadDot(os.path.join(flye_dir, "20-repeat", "graph_before_rr.gv"))
    graph.FillSeq(os.path.join(flye_dir, "20-repeat", "graph_before_rr.fasta"), True)
    print "Extracting relevant graph component"
    edge_ids = edge_id.split(",")
    to_resolve = to_resolve.split(",")
    to_resolve = [(a, int(b)) for a, b in zip(to_resolve[0::2], to_resolve[1::2])]
    unique = uniqueNeighbours(edge_ids, graph, min_contig_length)
    if rf == "none":
        return
    print "Finding reads that align to", edge_ids
    reads_to_resolve = dict()  # type: Dict[str, List[str]]
    for eid, mult in to_resolve:
        reads_to_resolve[eid] = []
    for unique_edge, initial in unique:
        reads_to_resolve[initial] = []
    relevant_read_ids = set()
    for rid, eid in parseReadDump(os.path.join(flye_dir, "20-repeat", "read_alignment_dump")):
        if eid in edge_ids:
            relevant_read_ids.add(rid)
            print rid, eid
    for rid, eid in parseReadDump(os.path.join(flye_dir, "20-repeat", "read_alignment_dump")):
        if rid in relevant_read_ids and eid in reads_to_resolve:
            reads_to_resolve[eid].append(rid)
    for eid in reads_to_resolve:
        reads_to_resolve[eid] = list(set(reads_to_resolve[eid]))
    print "Reading reads"
    res_reads = ContigStorage()
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in SeqIO.parse_by_name(rf):
        if read.id in relevant_read_ids:
            res_reads.add(Contig(read.seq, read.id))
            SeqIO.write(read, res, "fasta")
    res.close()
    random_down = open(os.path.join(dir, "random_down.fasta"), "w")
    cnt = 0
    for read in res_reads:
        if cnt % 5 == 0:
            SeqIO.write(read, random_down, "fasta")
        cnt += 1
    random_down.close()
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    lcf = open(os.path.join(dir, "contigs.lc"), "w")
    for eid, mult in to_resolve:
        repeat_reads = [res_reads[rid] for rid in reads_to_resolve[eid]]
        print reads_to_resolve[eid]
        print map(str, repeat_reads)
        split_contigs = splitRepeat(aligner, graph.e[eid].seq, mult, repeat_reads, min_contig_length)
        if split_contigs is None:
            print "Failed to resolve edge", eid, "Aborting"
            continue  # assumed: without skipping, the loop below would iterate over None
        print "Edge", eid, "was split into", mult, "copies"
        for contig, contig_reads in split_contigs:
            print contig.id
            SeqIO.write(contig, res, "fasta")
            lcf.write(contig.id + "\n")
            lcf.write(" ".join([r.id for r in contig_reads]) + "\n")
    # Unique edges go into the same contigs.fasta handle; reopening the file
    # in "w" mode here would discard the split contigs written above.
    for unique_edge, initial in unique:
        print unique_edge.id
        SeqIO.write(unique_edge, res, "fasta")
        lcf.write(unique_edge.id + "\n")
        lcf.write(" ".join(reads_to_resolve[initial]) + "\n")
    res.close()
    lcf.close()
def main(model_file, k, dir, contigs_file, reads_file):
    # type: (str, int, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.scores = ComplexScores()
    params.scores.load(open(model_file, "r"))
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"), False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
        # if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
        # else:
        #     contigs.add(Contig(c1.seq, str(cnt)))
        #     print cnt, c1.id
        #     cnt += 1
        #     contigs.add(Contig(c2.seq, str(cnt)))
        #     print cnt, c2.id
        #     cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type: AlignedRead
            read.addAlignment(al)
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        cnt = 0
        al0 = None
        others = []
        for al in read.alignments:
            if not al.contradictingRTC():
                cnt += 1
                al0 = al
            else:
                others.append(al)
        if cnt != 1 or len(others) == 0:
            continue
        print al0
        print others
        seg = al0.seg_from
        for al in others:
            if al.seg_from.interSize(seg) < k:
                seg = None
                break
            else:
                seg = al.seg_from.cap(seg)
        print seg
        if seg is None:
            continue
        al0 = al0.reduce(query=seg)
        others = [al.reduce(query=seg) for al in others]
        scorer = Scorer(params.scores)
        for al in others:
            a, b, c = scorer.scoreCommon(al0, al)
            print "win", a, b, c, len(seg)
        if len(seg) > 1000:
            for i in range(len(seg) / 1000):
                seg1 = seg.prefix(length=i * 1000 + 1000).suffix(length=1000)
                for al in others:
                    a, b, c = scorer.scoreCommon(al0.reduce(query=seg1), al.reduce(query=seg1))
                    print "win1000", a, b, c, len(seg1)
        for al1 in others:
            for al2 in others:
                if al1 == al2:
                    continue
                a, b, c = scorer.scoreCommon(al1, al2)
                print "draw", a, b, c, len(seg)
def __init__(self, dir):
    basic.ensure_dir_existance(dir)
    self.dir = dir
    self.cur_dir = 0
def main(contigs_file, contig_name, reads_file, dir, k, initial_reads1, initial_reads2):
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    # contig = contigs[contig_name].asSegment().prefix(length=2000).asContig()
    contig = contigs[contig_name]
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    reads1 = ContigStorage()
    reads2 = ContigStorage()
    cnt = 0
    for read in reads.unique():
        cnt += 1
        # if cnt % 2 == 0:
        if read.id in initial_reads1:
            reads1.add(read)
        elif read.id in initial_reads2:
            reads2.add(read)
    polisher = Polisher(aligner, dd)
    contig1 = contig
    contig2 = contig
    scorer = Scorer()
    for i in range(3):
        diff = 0
        print "Iteration", i
        als1 = fixAlDir(aligner.overlapAlign(reads1.unique(), ContigStorage([contig])), contig)
        als2 = fixAlDir(aligner.overlapAlign(reads2.unique(), ContigStorage([contig])), contig)
        contig1 = Contig(polisher.polishSmallSegment(contig.asSegment(), als1).seg_from.Seq(), "1")
        contig2 = Contig(polisher.polishSmallSegment(contig.asSegment(), als2).seg_from.Seq(), "2")
        al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
        als1 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig1])), contig1)
        als1 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als1)
        als2 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig2])), contig2)
        als2 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als2)
        als1 = sorted(als1, key=lambda al: al.seg_from.contig.id)
        als2 = sorted(als2, key=lambda al: al.seg_from.contig.id)
        reads1 = ContigStorage()
        reads2 = ContigStorage()
        dp = scorer.accurateScore(al.matchingSequence(), 10)  # 1 - al.percentIdentity()
        als_map = dict()
        for al in als1:
            als_map[al.seg_from.contig.id] = [al]
        for al in als2:
            if al.seg_from.contig.id in als_map:
                als_map[al.seg_from.contig.id].append(al)
        com_res = []
        diffs = []
        for tmp_als in als_map.values():
            if len(tmp_als) != 2:
                continue
            al1 = tmp_als[0]
            al2 = tmp_als[1]
            print al1, al2
            assert al1.seg_from.contig == al2.seg_from.contig
            pi1 = scorer.accurateScore(al1.matchingSequence(), 10)  # al1.percentIdentity()
            pi2 = scorer.accurateScore(al2.matchingSequence(), 10)  # al2.percentIdentity()
            com_res.append((al1, al2, pi1 - pi2))
            diffs.append(pi1 - pi2)
        diffs = sorted(diffs)
        th1 = diffs[len(diffs) / 4]
        th2 = diffs[len(diffs) * 3 / 4]
        print "Thresholds:", th1, th2
        for al1, al2, diff in com_res:
            if diff < th1:
                reads1.add(al1.seg_from.contig)
            elif diff > th2:
                reads2.add(al2.seg_from.contig)
        # if pi1 > pi2 + dp / 4:
        #     reads1.add(al1.seg_from.contig)
        # elif pi2 > pi1 + dp / 4:
        #     reads2.add(al2.seg_from.contig)
        # diff += abs(pi1 - pi2)
        print float(diff) / len(als1), len(reads1) / 2, len(reads2) / 2
        al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
        print al
        print "\n".join(al.asMatchingStrings2())
        for read in reads1:
            if read.id in initial_reads1:
                sys.stdout.write(read.id + " ")
        print ""
        for read in reads2:
            if read.id in initial_reads2:
                sys.stdout.write(read.id + " ")
        print ""
        contig1 = prolong(aligner, polisher, contig1, reads1)
        contig2 = prolong(aligner, polisher, contig2, reads2)
        contig1.id = "1"
        contig2.id = "2"
    out = open(os.path.join(dir, "copies.fasta"), "w")
    SeqIO.write(contig1, out, "fasta")
    SeqIO.write(contig2, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads1.fasta"), "w")
    for read in reads1.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads2.fasta"), "w")
    for read in reads2.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    print "Finished"
import os
import shutil
import sys

from common import basic

in_dir = sys.argv[1]
out_dir = sys.argv[2]
basic.ensure_dir_existance(out_dir)
for comp_name in os.listdir(in_dir):
    fig_dir = os.path.join(in_dir, comp_name, "pictures")
    nums = map(basic.parseNumber, os.listdir(fig_dir))
    last = max(nums)
    f_name = None
    for fig in os.listdir(fig_dir):
        if basic.parseNumber(fig) == last:
            f_name = fig
            break
    # Copy the latest picture of each component. The inner loop variable must
    # not reuse the outer one, or the destination name below would be wrong.
    shutil.copy(os.path.join(fig_dir, f_name), os.path.join(out_dir, comp_name + ".dot"))