def main(contigs_file, parts_file, dir):
    contigs = ContigCollection().loadFromFasta(open(contigs_file, "r"))
    parts = ContigCollection().loadFromFasta(open(parts_file, "r"))
    basic.CreateLog(dir)
    aligner = Aligner(DirDistributor(dir))
    # Bucket alignments by target contig id; also store the mirrored alignment for the reverse complement.
    res = dict()
    for al in aligner.localAlign(parts, contigs):
        if al.seg_to.contig.id not in res:
            res[al.seg_to.contig.id] = []
            res[al.seg_to.contig.rc.id] = []
        res[al.seg_to.contig.id].append(al)
        res[al.seg_to.contig.rc.id].append(al.rc)
    # For every contig print its sufficiently long alignments, ordered by start position.
    for cname, arr in res.items():
        print cname
        arr = filter(lambda al: len(al.seg_to) > min(len(al.seg_to.contig) - 1000, 5000), arr)
        arr = sorted(arr, key=lambda al: al.seg_to.left)
        print arr
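
# A possible command-line entry point for the function above (a sketch, not part of the original
# script; it assumes sys is imported and the three arguments are passed positionally).
if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2], sys.argv[3])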
def main(flye_dir, rf, dir, edge_id, k):
    params.technology = "nano"
    params.k = k
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    print "Reading graph"
    graph = SimpleGraph().ReadGFA(os.path.join(flye_dir, "assembly_graph.gfa"))
    print "Parsing edge mapping"
    id_map = parseUPaths(flye_dir)
    edge_ids = edge_id.split(",")
    print "Extracting relevant graph component"
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    # Collect edges adjacent to the requested edges; long edges contribute only a 5 kb tip.
    unique = dict()
    for eid in edge_ids:
        for e in graph.v[graph.e[eid].start].inc:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "l"] = NamedSequence(basic.RC(e.seq[:5000]), e.id[1:] + "l")
                else:
                    unique[e.id + "r"] = NamedSequence(e.seq[-5000:], e.id + "r")
        for e in graph.v[graph.e[eid].end].out:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "r"] = NamedSequence(basic.RC(e.seq[-5000:]), e.id[1:] + "r")
                else:
                    unique[e.id + "l"] = NamedSequence(e.seq[:5000], e.id + "l")
    for c in unique.values():
        print c.id
        SeqIO.write(c, res, "fasta")
    res.close()
    # Map the requested edge ids back to the pre-repeat-resolution edge ids used by Flye.
    old_ids = []
    for eid in edge_ids:
        for olde in id_map[eid[len("edge_"):]]:
            old_ids.append(basic.Normalize(olde))
    print "Finding reads that align to", edge_ids
    print "Old ids:", old_ids
    # Pick reads that Flye aligned to any of the old edge ids.
    relevant_read_ids = set()
    for s in open(os.path.join(flye_dir, "20-repeat", "read_alignment_dump"), "r").readlines():
        s = s.split()
        if s[0] != "Aln":
            continue
        if s[6].split("_")[1] in old_ids:
            relevant_read_ids.add(s[2][1:])
            print s[2][1:], s[6].split("_")[1]
    print "Reading reads"
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in SeqIO.parse_fasta(open(rf, "r")):
        if read.id in relevant_read_ids and len(read) > k * 1.2:
            SeqIO.write(read, res, "fasta")
    res.close()
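
# A possible command-line entry point for the function above (a sketch, not part of the original
# script): Flye output dir, reads file, output dir, comma-separated edge ids, and integer k.
if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], int(sys.argv[5]))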
def main(contigs_file, contig_name, reads_file, dir, k, initial_reads1, initial_reads2):
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    # contig = contigs[contig_name].asSegment().prefix(length=2000).asContig()
    contig = contigs[contig_name]
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    # Seed the two read groups with the initially known read ids.
    reads1 = ContigStorage()
    reads2 = ContigStorage()
    cnt = 0
    for read in reads.unique():
        cnt += 1
        # if cnt % 2 == 0:
        if read.id in initial_reads1:
            reads1.add(read)
        elif read.id in initial_reads2:
            reads2.add(read)
    polisher = Polisher(aligner, dd)
    contig1 = contig
    contig2 = contig
    scorer = Scorer()
    # Iteratively polish two candidate copies of the contig and reassign each read to the copy it fits better.
    for i in range(3):
        diff = 0
        print "Iteration", i
        als1 = fixAlDir(aligner.overlapAlign(reads1.unique(), ContigStorage([contig])), contig)
        als2 = fixAlDir(aligner.overlapAlign(reads2.unique(), ContigStorage([contig])), contig)
        contig1 = Contig(polisher.polishSmallSegment(contig.asSegment(), als1).seg_from.Seq(), "1")
        contig2 = Contig(polisher.polishSmallSegment(contig.asSegment(), als2).seg_from.Seq(), "2")
        al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
        als1 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig1])), contig1)
        als1 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als1)
        als2 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig2])), contig2)
        als2 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als2)
        als1 = sorted(als1, key=lambda al: al.seg_from.contig.id)
        als2 = sorted(als2, key=lambda al: al.seg_from.contig.id)
        reads1 = ContigStorage()
        reads2 = ContigStorage()
        dp = scorer.accurateScore(al.matchingSequence(), 10)  # 1 - al.percentIdentity()
        # Pair up the alignments of every read to the two polished copies.
        als_map = dict()
        for al in als1:
            als_map[al.seg_from.contig.id] = [al]
        for al in als2:
            if al.seg_from.contig.id in als_map:
                als_map[al.seg_from.contig.id].append(al)
        com_res = []
        diffs = []
        for tmp_als in als_map.values():
            if len(tmp_als) != 2:
                continue
            al1 = tmp_als[0]
            al2 = tmp_als[1]
            print al1, al2
            assert al1.seg_from.contig == al2.seg_from.contig
            pi1 = scorer.accurateScore(al1.matchingSequence(), 10)  # al1.percentIdentity()
            pi2 = scorer.accurateScore(al2.matchingSequence(), 10)  # al2.percentIdentity()
            com_res.append((al1, al2, pi1 - pi2))
            diffs.append(pi1 - pi2)
        # Use the quartiles of the score differences as thresholds for assigning reads to a copy.
        diffs = sorted(diffs)
        th1 = diffs[len(diffs) / 4]
        th2 = diffs[len(diffs) * 3 / 4]
        print "Thresholds:", th1, th2
        for al1, al2, diff in com_res:
            if diff < th1:
                reads1.add(al1.seg_from.contig)
            elif diff > th2:
                reads2.add(al2.seg_from.contig)
            # if pi1 > pi2 + dp / 4:
            #     reads1.add(al1.seg_from.contig)
            # elif pi2 > pi1 + dp / 4:
            #     reads2.add(al2.seg_from.contig)
            # diff += abs(pi1 - pi2)
        print float(diff) / len(als1), len(reads1) / 2, len(reads2) / 2
        al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
        print al
        print "\n".join(al.asMatchingStrings2())
        # Report which of the initially assigned reads ended up in each group.
        for read in reads1:
            if read.id in initial_reads1:
                sys.stdout.write(read.id + " ")
        print ""
        for read in reads2:
            if read.id in initial_reads2:
                sys.stdout.write(read.id + " ")
        print ""
        contig1 = prolong(aligner, polisher, contig1, reads1)
        contig2 = prolong(aligner, polisher, contig2, reads2)
        contig1.id = "1"
        contig2.id = "2"
    # Write the two resolved copies and the read groups supporting them.
    out = open(os.path.join(dir, "copies.fasta"), "w")
    SeqIO.write(contig1, out, "fasta")
    SeqIO.write(contig2, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads1.fasta"), "w")
    for read in reads1.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads2.fasta"), "w")
    for read in reads2.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    print "Finished"
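
# A possible command-line entry point for the function above (a sketch, not part of the original
# script). Passing initial_reads1/initial_reads2 as comma-separated read id lists is an assumption.
if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], int(sys.argv[5]),
         sys.argv[6].split(","), sys.argv[7].split(","))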
    # type: (List[AlignmentPiece]) -> List[AlignmentPiece]
    als = filter(lambda al: not al.contradictingRTC(tail_size=params.bad_end_length), als)
    return self.filterLocal(als)


if __name__ == "__main__":
    dir = sys.argv[1]
    query = sys.argv[2]
    target = sys.argv[3]
    extra_params = sys.argv[4:]
    # Optional flags selecting which alignments to keep.
    contra = "contra" in extra_params
    over = "over" in extra_params
    long = "long" in extra_params
    start = "start" in extra_params
    forward = "forward" in extra_params
    aln = Aligner(DirDistributor(dir))
    basic.CreateLog(dir)
    contigs = ContigCollection().loadFromFasta(open(target, "r"), False)
    for al in aln.localAlign(ReadCollection().loadFromFile(query), contigs):
        if start:
            if al.seg_to.contig.id.startswith("-"):
                al = al.rc
            if al.seg_to.left > 50:
                continue
        if over and al.contradictingRTC():
            continue
        if forward:
            if al.seg_to.contig.id.startswith("-"):
                al = al.rc
        if contra and (len(al) < 8000 or not al.contradictingRTC()):
            continue
        if long and len(al) < 5000:
            continue  # assumption: the source is truncated here; short alignments are skipped like in the other filters
def main(flye_dir, rf, dir, edge_id, to_resolve, min_contig_length):
    params.technology = "nano"
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    print " ".join(sys.argv)
    print "Reading graph"
    graph = SimpleGraph().ReadDot(os.path.join(flye_dir, "20-repeat", "graph_before_rr.gv"))
    graph.FillSeq(os.path.join(flye_dir, "20-repeat", "graph_before_rr.fasta"), True)
    print "Extracting relevant graph component"
    edge_ids = edge_id.split(",")
    # to_resolve is a comma-separated list of alternating edge ids and multiplicities.
    to_resolve = to_resolve.split(",")
    to_resolve = [(a, int(b)) for a, b in zip(to_resolve[0::2], to_resolve[1::2])]
    unique = uniqueNeighbours(edge_ids, graph, min_contig_length)
    if rf == "none":
        return
    print "Finding reads that align to", edge_ids
    reads_to_resolve = dict()  # type: Dict[str, List[str]]
    for eid, mult in to_resolve:
        reads_to_resolve[eid] = []
    for unique_edge, initial in unique:
        reads_to_resolve[initial] = []
    # First pass over the dump: collect ids of reads that Flye aligned to the repeat edges.
    relevant_read_ids = set()
    for rid, eid in parseReadDump(os.path.join(flye_dir, "20-repeat", "read_alignment_dump")):
        if eid in edge_ids:
            relevant_read_ids.add(rid)
            print rid, eid
    # Second pass: attach those reads to the edges they should help resolve.
    for rid, eid in parseReadDump(os.path.join(flye_dir, "20-repeat", "read_alignment_dump")):
        if rid in relevant_read_ids and eid in reads_to_resolve:
            reads_to_resolve[eid].append(rid)
    for eid in reads_to_resolve:
        reads_to_resolve[eid] = list(set(reads_to_resolve[eid]))
    print "Reading reads"
    res_reads = ContigStorage()
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in SeqIO.parse_by_name(rf):
        if read.id in relevant_read_ids:
            res_reads.add(Contig(read.seq, read.id))
            SeqIO.write(read, res, "fasta")
    res.close()
    # Write every fifth relevant read as a rough downsampled subset.
    random_down = open(os.path.join(dir, "random_down.fasta"), "w")
    cnt = 0
    for read in res_reads:
        if cnt % 5 == 0:
            SeqIO.write(read, random_down, "fasta")
        cnt += 1
    random_down.close()
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    lcf = open(os.path.join(dir, "contigs.lc"), "w")
    for eid, mult in to_resolve:
        repeat_reads = [res_reads[rid] for rid in reads_to_resolve[eid]]
        print reads_to_resolve[eid]
        print map(str, repeat_reads)
        split_contigs = splitRepeat(aligner, graph.e[eid].seq, mult, repeat_reads, min_contig_length)
        if split_contigs is None:
            print "Failed to resolve edge", eid, "Aborting"
            return  # stop here: iterating over a None result below would fail
        print "Edge", eid, "was split into", mult, "copies"
        for contig, contig_reads in split_contigs:
            print contig.id
            SeqIO.write(contig, res, "fasta")
            lcf.write(contig.id + "\n")
            lcf.write(" ".join([r.id for r in contig_reads]) + "\n")
    # Keep writing unique neighbour edges to the already-open contigs.fasta;
    # re-opening it in "w" mode would discard the repeat copies written above.
    for unique_edge, initial in unique:
        print unique_edge.id
        SeqIO.write(unique_edge, res, "fasta")
        lcf.write(unique_edge.id + "\n")
        lcf.write(" ".join(reads_to_resolve[initial]) + "\n")
    res.close()
    lcf.close()
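
# A possible command-line entry point for the function above (a sketch, not part of the original
# script): Flye output dir, reads file, output dir, comma-separated edge ids,
# "edge,multiplicity" pairs as one comma-separated string, and an integer minimal contig length.
if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], int(sys.argv[6]))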
def main(contigs_file, contig_name, reads_file, dir, k):
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    contig = contigs[contig_name]
    contigs = ContigStorage()
    contigs.add(contig)
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    als = list(aligner.localAlign(reads.unique(), contigs))
    # Orient every alignment towards the selected contig.
    tmp = []
    for al in als:
        if al.seg_to.contig != contig:
            al = al.rc
        tmp.append(al)
    als = tmp
    als = sorted(als, key=lambda al: al.seg_to.left / 50 * 1000000 + al.seg_to.right - al.seg_to.left)
    # Count, for every read, how many of its alignments are longer than k.
    counts = dict()
    for al in als:
        counts[al.seg_from.contig.id] = 0
    for al in als:
        if len(al) > k:
            counts[al.seg_from.contig.id] += 1
    w = 20
    f = open(os.path.join(dir, "reads.fasta"), "w")
    over = set()
    inter = set()
    # For every sufficiently long alignment print one character per w-bp window of the contig:
    # B/E - the alignment starts/ends in this window leaving a long unaligned read tail,
    # I/i - large/small insertion, D/d - large/small deletion,
    # digit - number of window positions not covered by exact matches (capped at 8),
    # * - window not covered by the alignment.
    for al in als:
        if len(al) < k:
            continue
        inter.add(basic.Normalize(al.seg_from.contig.id))
        if not al.contradictingRTC():
            over.add(basic.Normalize(al.seg_from.contig.id))
        m = al.matchingSequence(True)
        tmp = []
        for i in range(len(contig) / w + 1):
            tmp.append([])
        for a, b in m.matches:
            tmp[b / w].append((a, b))
        for i in range(len(contig) / w):
            if i + 1 < len(tmp) and len(tmp[i + 1]) > 0:
                tmp[i].append(tmp[i + 1][0])
        for i in range(len(contig) / w):
            seg = contig.segment(i * w, i * w + w)
            if al.seg_to.inter(seg):
                if al.seg_to.left >= seg.left and al.seg_from.left > params.bad_end_length:
                    sys.stdout.write("B")
                elif al.seg_to.right <= seg.right and al.rc.seg_from.left > params.bad_end_length:
                    sys.stdout.write("E")
                else:
                    if len(tmp[i]) == 0:
                        sys.stdout.write("*")
                    else:
                        a = tmp[i][-1][0] - tmp[i][0][0]
                        b = tmp[i][-1][1] - tmp[i][0][1]
                        if a - b > 30:
                            sys.stdout.write("I")
                        elif a - b > 15:
                            sys.stdout.write("i")
                        elif a - b < -30:
                            sys.stdout.write("D")
                        elif a - b < -15:
                            sys.stdout.write("d")
                        else:
                            sys.stdout.write(str(min(8, max(a, b) + 1 - len(tmp[i]))))
            else:
                sys.stdout.write("*")
        print " ", al.seg_from.contig.id, counts[al.seg_from.contig.id], al.contradictingRTC()
    print inter
    # Dump all intersecting reads and, separately, the reads with non-contradicting alignments.
    for rid in inter:
        SeqIO.write(reads[rid], f, "fasta")
        print rid, reads[rid]
    f.close()
    f = open(os.path.join(dir, "reads_over.fasta"), "w")
    for rid in over:
        SeqIO.write(reads[rid], f, "fasta")
    f.close()
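
# A possible command-line entry point for the function above (a sketch, not part of the original
# script): contigs file, contig name, reads file, output dir, and integer k.
if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], int(sys.argv[5]))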