def printSegs(f, segs):
    c = ContigStorage().loadFromFasta(open(f, "r"), False)
    for seg in segs:
        if seg[2] == 0:
            seg[2] = len(c[seg[0]])
        SeqIO.write(c[seg[0]].segment(seg[1], seg[2]).asContig(), sys.stdout, "fasta")
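# Usage sketch (not from the original source; file and contig names are
# hypothetical). Segments are mutable [contig_id, left, right] lists; a right
# coordinate of 0 is replaced by the contig length, i.e. "to the end":
# printSegs("contigs.fasta", [["contig_1", 100, 500], ["contig_1", 1000, 0]])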
def printKnottedToFasta(self, handler):
    # type: (BinaryIO) -> None
    printed = set()
    cnt = 1
    for chain in self.chains():
        if chain[0].rc.id in printed:
            continue
        for line in chain:
            printed.add(line.id)
        seq = []
        id = []
        if chain[-1].knot is not None:
            id.append("Circular")
        for line in chain:
            id.append(line.id)
            if line.knot is not None:
                id.append(str(line.knot.gap))
                if line.knot.gap < 0:
                    seq.append(line.seq[:line.knot.gap])
                else:
                    seq.append(line.seq)
                    seq.append(line.knot.gap_seq)
            else:
                seq.append(line.seq)
        sys.stdout.trace(cnt, ":", ";".join(id))
        SeqIO.write(NamedSequence("".join(seq), "contig_" + str(cnt)), handler, "fasta")
        cnt += 1
def simulate1(dir, mutation, genome):
    print "Simulating", genome
    ds = dataset_simulation.TestDataset(genome, 5000, mutation_rate=mutation)
    genome_seq = ds.mutate(ds.genome, mutation / 2)[0]
    f = open(os.path.join(dir, genome + str(mutation * 100) + ".fasta"), "w")
    SeqIO.write(NamedSequence(genome_seq, genome), f, "fasta")
    f.close()
def prepare_disjointigs_file(disjointigs_file, disjointigs_file_list):
    recs = []
    for fn in disjointigs_file_list:
        for rec in SeqIO.parse_fasta(open(fn, "r")):
            recs.append(rec)
    h = open(disjointigs_file, "w")
    for rec in recs:
        SeqIO.write(rec, h, "fasta")
    h.close()
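# Usage sketch (hypothetical paths): concatenate several disjointig FASTA
# files into a single output file.
# prepare_disjointigs_file("disjointigs.fasta", ["d1.fasta", "d2.fasta"])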
def collect_contigs(dataset, barcodes_dir, output_base, format):
    output = open(output_base + "." + format, "w")
    for barcode in dataset:
        file = os.path.join(barcodes_dir, barcode.id, "truseq_long_reads." + format)
        if os.path.exists(file):
            contigs = SeqIO.parse(open(file), format)
            for contig in contigs:
                contig.id = barcode.id + "-" + contig.id
                SeqIO.write(contig, output, format)
    output.close()
def PrintResults(recs, reference, references_file, coordinates_file):
    aln = open(coordinates_file, "w")
    fasta = open(references_file, "w")
    for rec in recs:
        aln.write(str(rec) + "\n")
        sequence = reference[rec.rname][rec.left:rec.right]
        rec_id = str(rec.rname) + "_(" + str(rec.left) + "-" + str(rec.right) + ")"
        SeqIO.write(SeqIO.SeqRecord(sequence, rec_id), fasta, "fasta")
    aln.close()
    fasta.close()
def simulate2(dir, mutation, error_rate, genome):
    print "Simulating", genome
    ds = dataset_simulation.TestDataset(genome, 4000, mutation_rate=mutation)
    genome_seq = ds.mutate(ds.genome, mutation / 2)[0]
    total = 0
    f = open(os.path.join(dir, genome + ".fasta"), "w")
    SeqIO.write(NamedSequence(genome_seq, genome), f, "fasta")
    f.close()
    f = open(os.path.join(dir, "reads.fasta"), "w")
    cnt = 0
    while total < len(genome_seq) * 30:
        l = random.randint(3000, 3500)
        pos = random.randint(0, len(genome_seq) - l)
        seq = ds.mutate(genome_seq[pos:pos + l], error_rate)[0]
        SeqIO.write(NamedSequence(seq, str(cnt)), f, "fasta")
        cnt += 1
        total += len(seq)
    f.close()
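# Usage sketch (hypothetical arguments; TestDataset semantics assumed):
# simulate a genome named "test_genome" with 1% mutations between copies and
# a 5% read error rate at ~30x coverage, writing "test_genome.fasta" and
# "reads.fasta" into "sim_out".
# simulate2("sim_out", 0.01, 0.05, "test_genome")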
def main(ref_file, contig_size, rlen, cov, dir):
    basic.ensure_dir_existance(dir)
    all_contigs = ContigCollection().loadFromFasta(open(ref_file, "r"), False)
    contig_file_name = os.path.join(dir, "contigs.fasta")
    contig_file = open(contig_file_name, "w")
    reads_file_name = os.path.join(dir, "reads.fasta")
    reads_file = open(reads_file_name, "w")
    for ref in all_contigs.unique():
        if len(ref) < contig_size:
            continue
        SeqIO.write(ref, contig_file, "fasta")
        for i in range(0, len(ref), max(1, rlen / cov)):
            read = ref.segment(i, min(i + rlen, len(ref))).asNamedSequence()
            SeqIO.write(read, reads_file, "fasta")
    reads_file.close()
    contig_file.close()
    print "Done"
    print contig_file_name
    print reads_file_name
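# Usage sketch (hypothetical arguments): cut every reference contig of at
# least 10kb into 2kb reads; step rlen / cov = 100bp gives ~20x coverage.
# main("ref.fasta", 10000, 2000, 20, "sim_out")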
def main(k, dir, contigs_file, reads_file):
    # type: (int, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"), False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
        # if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
        # else:
        #     contigs.add(Contig(c1.seq, str(cnt)))
        #     print cnt, c1.id
        #     cnt += 1
        #     contigs.add(Contig(c2.seq, str(cnt)))
        #     print cnt, c2.id
        #     cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type: AlignedRead
            read.addAlignment(al)
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        if len(read.alignments) > 1:
            SeqIO.write(read, res, "fasta")
    res.close()
import sys
import os

sys.path.append("py")  # must precede the common.* imports below
import common.seq_records
import common.SeqIO as SeqIO

read_len = int(sys.argv[2])
cov = float(sys.argv[3])
for seq in SeqIO.parse_fasta(open(sys.argv[1], "r")):
    sys.stderr.write(seq.id + " " + str(len(seq)) + " " + str(int(len(seq) * cov / read_len)) + "\n")
    # if len(seq) > 100000000 or len(seq) < 10000000:
    #     continue
    cur = 100000
    for i in range(0, len(seq), int(read_len / cov)):
        if i > cur:
            sys.stderr.write(str(cur) + "\n")
            cur = cur * 3 / 2
        SeqIO.write(common.seq_records.SeqRecord(seq.seq[i:min(len(seq), i + read_len)],
                                                 seq.id + "_" + str(i)),
                    sys.stdout, "fasta")
    break  # only the first sequence is processed
def writeToFasta(self, handler):
    for contig in self.unique():
        SeqIO.write(contig, handler, "fasta")
def print_fasta(self, handler):
    # type: (BinaryIO) -> None
    SeqIO.write(self, handler, "fasta")
def main(flye_dir, rf, dir, edge_id, to_resolve, min_contig_length):
    params.technology = "nano"
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    print " ".join(sys.argv)
    print "Reading graph"
    graph = SimpleGraph().ReadDot(os.path.join(flye_dir, "20-repeat", "graph_before_rr.gv"))
    graph.FillSeq(os.path.join(flye_dir, "20-repeat", "graph_before_rr.fasta"), True)
    print "Extracting relevant graph component"
    edge_ids = edge_id.split(",")
    to_resolve = to_resolve.split(",")
    to_resolve = [(a, int(b)) for a, b in zip(to_resolve[0::2], to_resolve[1::2])]
    unique = uniqueNeighbours(edge_ids, graph, min_contig_length)
    if rf == "none":
        return
    print "Finding reads that align to", edge_ids
    reads_to_resolve = dict()  # type: Dict[str, List[str]]
    for eid, mult in to_resolve:
        reads_to_resolve[eid] = []
    for unique_edge, initial in unique:
        reads_to_resolve[initial] = []
    relevant_read_ids = set()
    for rid, eid in parseReadDump(os.path.join(flye_dir, "20-repeat", "read_alignment_dump")):
        if eid in edge_ids:
            relevant_read_ids.add(rid)
            print rid, eid
    for rid, eid in parseReadDump(os.path.join(flye_dir, "20-repeat", "read_alignment_dump")):
        if rid in relevant_read_ids and eid in reads_to_resolve:
            reads_to_resolve[eid].append(rid)
    for eid in reads_to_resolve:
        reads_to_resolve[eid] = list(set(reads_to_resolve[eid]))
    print "Reading reads"
    res_reads = ContigStorage()
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in SeqIO.parse_by_name(rf):
        if read.id in relevant_read_ids:
            res_reads.add(Contig(read.seq, read.id))
            SeqIO.write(read, res, "fasta")
    res.close()
    random_down = open(os.path.join(dir, "random_down.fasta"), "w")
    cnt = 0
    for read in res_reads:
        if cnt % 5 == 0:
            SeqIO.write(read, random_down, "fasta")
        cnt += 1
    random_down.close()
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    lcf = open(os.path.join(dir, "contigs.lc"), "w")
    for eid, mult in to_resolve:
        repeat_reads = [res_reads[rid] for rid in reads_to_resolve[eid]]
        print reads_to_resolve[eid]
        print map(str, repeat_reads)
        split_contigs = splitRepeat(aligner, graph.e[eid].seq, mult, repeat_reads, min_contig_length)
        if split_contigs is None:
            print "Failed to resolve edge", eid, "Aborting"
            continue
        print "Edge", eid, "was split into", mult, "copies"
        for contig, contig_reads in split_contigs:
            print contig.id
            SeqIO.write(contig, res, "fasta")
            lcf.write(contig.id + "\n")
            lcf.write(" ".join([r.id for r in contig_reads]) + "\n")
    for unique_edge, initial in unique:
        print unique_edge.id
        SeqIO.write(unique_edge, res, "fasta")
        lcf.write(unique_edge.id + "\n")
        lcf.write(" ".join(reads_to_resolve[initial]) + "\n")
    res.close()
    lcf.close()
def main(contigs_file, contig_name, reads_file, dir, k):
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    contig = contigs[contig_name]
    contigs = ContigStorage()
    contigs.add(contig)
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    als = list(aligner.localAlign(reads.unique(), contigs))
    tmp = []
    for al in als:
        if al.seg_to.contig != contig:
            al = al.rc
        tmp.append(al)
    als = tmp
    als = sorted(als, key=lambda al: al.seg_to.left / 50 * 1000000 + al.seg_to.right - al.seg_to.left)
    counts = dict()
    for al in als:
        counts[al.seg_from.contig.id] = 0
    for al in als:
        if len(al) > k:
            counts[al.seg_from.contig.id] += 1
    w = 20
    f = open(os.path.join(dir, "reads.fasta"), "w")
    over = set()
    inter = set()
    for al in als:
        if len(al) < k:
            continue
        inter.add(basic.Normalize(al.seg_from.contig.id))
        if not al.contradictingRTC():
            over.add(basic.Normalize(al.seg_from.contig.id))
        m = al.matchingSequence(True)
        tmp = []
        for i in range(len(contig) / w + 1):
            tmp.append([])
        for a, b in m.matches:
            tmp[b / w].append((a, b))
        for i in range(len(contig) / w):
            if i + 1 < len(tmp) and len(tmp[i + 1]) > 0:
                tmp[i].append(tmp[i + 1][0])
        for i in range(len(contig) / w):
            seg = contig.segment(i * w, i * w + w)
            if al.seg_to.inter(seg):
                if al.seg_to.left >= seg.left and al.seg_from.left > params.bad_end_length:
                    sys.stdout.write("B")
                elif al.seg_to.right <= seg.right and al.rc.seg_from.left > params.bad_end_length:
                    sys.stdout.write("E")
                else:
                    if len(tmp[i]) == 0:
                        sys.stdout.write("*")
                    else:
                        a = tmp[i][-1][0] - tmp[i][0][0]
                        b = tmp[i][-1][1] - tmp[i][0][1]
                        if a - b > 30:
                            sys.stdout.write("I")
                        elif a - b > 15:
                            sys.stdout.write("i")
                        elif a - b < -30:
                            sys.stdout.write("D")
                        elif a - b < -15:
                            sys.stdout.write("d")
                        else:
                            sys.stdout.write(str(min(8, max(a, b) + 1 - len(tmp[i]))))
            else:
                sys.stdout.write("*")
        print " ", al.seg_from.contig.id, counts[al.seg_from.contig.id], al.contradictingRTC()
    print inter
    for rid in inter:
        SeqIO.write(reads[rid], f, "fasta")
        print rid, reads[rid]
    f.close()
    f = open(os.path.join(dir, "reads_over.fasta"), "w")
    for rid in over:
        SeqIO.write(reads[rid], f, "fasta")
    f.close()
def main(contigs_file, contig_name, reads_file, dir, k, initial_reads1, initial_reads2):
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    # contig = contigs[contig_name].asSegment().prefix(length=2000).asContig()
    contig = contigs[contig_name]
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    reads1 = ContigStorage()
    reads2 = ContigStorage()
    cnt = 0
    for read in reads.unique():
        cnt += 1
        # if cnt % 2 == 0:
        if read.id in initial_reads1:
            reads1.add(read)
        elif read.id in initial_reads2:
            reads2.add(read)
    polisher = Polisher(aligner, dd)
    contig1 = contig
    contig2 = contig
    scorer = Scorer()
    for i in range(3):
        diff = 0
        print "Iteration", i
        als1 = fixAlDir(aligner.overlapAlign(reads1.unique(), ContigStorage([contig])), contig)
        als2 = fixAlDir(aligner.overlapAlign(reads2.unique(), ContigStorage([contig])), contig)
        contig1 = Contig(polisher.polishSmallSegment(contig.asSegment(), als1).seg_from.Seq(), "1")
        contig2 = Contig(polisher.polishSmallSegment(contig.asSegment(), als2).seg_from.Seq(), "2")
        al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
        als1 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig1])), contig1)
        als1 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als1)
        als2 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig2])), contig2)
        als2 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als2)
        als1 = sorted(als1, key=lambda al: al.seg_from.contig.id)
        als2 = sorted(als2, key=lambda al: al.seg_from.contig.id)
        reads1 = ContigStorage()
        reads2 = ContigStorage()
        dp = scorer.accurateScore(al.matchingSequence(), 10)  # 1 - al.percentIdentity()
        als_map = dict()
        for al in als1:
            als_map[al.seg_from.contig.id] = [al]
        for al in als2:
            if al.seg_from.contig.id in als_map:
                als_map[al.seg_from.contig.id].append(al)
        com_res = []
        diffs = []
        for tmp_als in als_map.values():
            if len(tmp_als) != 2:
                continue
            al1 = tmp_als[0]
            al2 = tmp_als[1]
            print al1, al2
            assert al1.seg_from.contig == al2.seg_from.contig
            pi1 = scorer.accurateScore(al1.matchingSequence(), 10)  # al1.percentIdentity()
            pi2 = scorer.accurateScore(al2.matchingSequence(), 10)  # al2.percentIdentity()
            com_res.append((al1, al2, pi1 - pi2))
            diffs.append(pi1 - pi2)
        diffs = sorted(diffs)
        th1 = diffs[len(diffs) / 4]
        th2 = diffs[len(diffs) * 3 / 4]
        print "Thresholds:", th1, th2
        for al1, al2, diff in com_res:
            if diff < th1:
                reads1.add(al1.seg_from.contig)
            elif diff > th2:
                reads2.add(al2.seg_from.contig)
            # if pi1 > pi2 + dp / 4:
            #     reads1.add(al1.seg_from.contig)
            # elif pi2 > pi1 + dp / 4:
            #     reads2.add(al2.seg_from.contig)
            # diff += abs(pi1 - pi2)
        print float(diff) / len(als1), len(reads1) / 2, len(reads2) / 2
    al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
    print al
    print "\n".join(al.asMatchingStrings2())
    for read in reads1:
        if read.id in initial_reads1:
            sys.stdout.write(read.id + " ")
    print ""
    for read in reads2:
        if read.id in initial_reads2:
            sys.stdout.write(read.id + " ")
    print ""
    contig1 = prolong(aligner, polisher, contig1, reads1)
    contig2 = prolong(aligner, polisher, contig2, reads2)
    contig1.id = "1"
    contig2.id = "2"
    out = open(os.path.join(dir, "copies.fasta"), "w")
    SeqIO.write(contig1, out, "fasta")
    SeqIO.write(contig2, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads1.fasta"), "w")
    for read in reads1.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads2.fasta"), "w")
    for read in reads2.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    print "Finished"
def printToFasta(self, handler):
    # type: (BinaryIO) -> None
    for line in UniqueList(self.items.values()):
        SeqIO.write(line, handler, "fasta")
def OutputResults(output_file, format, result):
    output = open(output_file + "." + format, "w")
    for contig in result:
        SeqIO.write(contig, output, format)
    output.close()
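# Usage sketch (hypothetical names): write all contigs in `result_contigs`
# to "assembly.fasta".
# OutputResults("assembly", "fasta", result_contigs)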
def main(args):
    flye_dir = sys.argv[1]
    repeats, starts, ends = parse(sys.argv[2])
    graph_file, unique_file, disjointigs_file, rep_dir, tmp, their_file = cl_params.parseFlyeDir(flye_dir)
    dump = os.path.join(rep_dir, "read_alignment_dump")
    reads_file = sys.argv[3]
    dir = sys.argv[4]
    CreateLog(dir)
    print " ".join(args)
    print "Printing contigs"
    edges_file = os.path.join(rep_dir, "graph_before_rr.fasta")
    edges = ContigStorage().loadFromFasta(open(edges_file, "r"))
    unique = open(os.path.join(dir, "contigs"), "w")
    for l in starts:
        seq = "".join(map(lambda eid: edges[eid].seq, l))
        if len(seq) > 15000:
            seq = seq[-15000:]
        SeqIO.write(NamedSequence(seq, "(" + "_".join(l) + ")"), unique, "fasta")
    for l in ends:
        seq = "".join(map(lambda eid: edges[eid].seq, l))
        if len(seq) > 15000:
            seq = seq[:15000]
        SeqIO.write(NamedSequence(basic.RC(seq), "(" + "_".join(l) + ")"), unique, "fasta")
    unique.close()
    print "Selecting reads"
    reads = set()
    cur_read = None
    als = []
    for s in open(dump).readlines():
        if s.startswith("Chain"):
            if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e":
                print als
            for al in als:
                if al in repeats:
                    if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e":
                        print "oppa"
                    reads.add(cur_read)
                    break
            als = []
        else:
            s = s.split()
            cur_read = s[2][1:]
            eid = s[6].split("_")[1]
            if s[6][0] == "-":
                eid = "-" + eid
            if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e":
                print eid
            als.append(eid)
    print "Selected", len(reads), "reads"
    print "\n".join(reads)
    print "Reading and printing reads"
    freads = open(os.path.join(dir, "reads.fasta"), "w")
    cnt = 0
    for read in SeqIO.parse_by_name(reads_file):
        cnt += 1
        if cnt % 10000 == 0:
            print cnt
        if read.id in reads:
            SeqIO.write(read, freads, "fasta")
    freads.close()
import sys

sys.path.append("py")
from common import SeqIO

rf = sys.argv[1]
outf = sys.argv[2]
total = int(sys.argv[3])
print "Reading reads"
reads = list(SeqIO.parse_fasta(open(rf, "r")))
print "Sorting reads"
reads = sorted(reads, key=lambda read: -len(read))
print "Printing reads"
out = open(outf, "w")
for read in reads:
    if total <= 0:
        break
    total -= len(read)
    SeqIO.write(read, out, "fasta")
out.close()
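# Usage sketch (hypothetical script name): keep the longest reads until their
# total length reaches ~1Gbp.
# python downsample_longest.py reads.fasta longest.fasta 1000000000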
def main(argv):
    sys.stdout.write("Started\n")
    dot_file = argv[1]
    edge_sequences = argv[2]
    reference_file = argv[3]
    alignment_file = argv[4]
    edges = ParseVertices(argv[5])
    output_file = argv[6]
    dist = 20000  # NOTE: `dist` was never defined in the original fragment; an assumed radius is used here
    sys.stdout.write("Loading dot\n")
    dot = DotParser(open(dot_file, "r")).parse()
    edge_collection = ContigCollection().loadFromFasta(open(edge_sequences, "r"), True)
    graph = Graph().loadFromDot(edge_collection, dot)
    vertices = [graph.E[id].start.id for id in edges]
    graph.printToFile(sys.stdout)
    print vertices
    ref = ContigCollection().loadFromFasta(open(reference_file, "r"), False)
    print "Looking for relevant"
    pq = PriorityQueue()
    for v in graph.V.values():
        if v.id in vertices:
            pq.push((0, v))
    visited = []
    while not pq.empty():
        d, v = pq.pop()
        if v in visited:
            continue
        visited.append(v)
        for e in v.inc:
            print e.id, e.start.id, e.end.id
            if d + len(e) < dist:
                pq.push((d + len(e), e.start))
        for e in v.out:
            print e.id, e.start.id, e.end.id
            if d + len(e) < dist:
                pq.push((d + len(e), e.end))
    print "Visited", len(visited)
    print map(str, list(visited))
    relevant = []
    edge_alignments = ReadCollection().loadFromFasta(open(edge_sequences, "r")).addAllRC()
    for edge in graph.E.values():
        if edge.start in visited or edge.start.rc in visited:
            relevant.append(edge_alignments[edge.id])
    print "Loading sam"
    edge_alignments.fillFromSam(Samfile(open(alignment_file, "r")), ref)
    for rel in relevant:
        print rel.__str__()
    print "Collecting segments"
    segments = []
    chr1 = ref["chr1"]
    for edge in relevant:
        for al in edge.alignments:
            print al
            if al.seg_from.inter(edge.prefix(dist)):
                l = dist - al.seg_from.left
                contig = al.seg_to.contig
                start = al.seg_to.left
                segments.append(Segment(contig, start, min(start + l, len(contig))))
                print segments[-1]
    tmp = []
    print "Rotating"
    for seg in segments:
        if seg.contig != chr1:
            seg = seg.RC()
        if seg.contig != chr1:
            print "WARNING", seg
        tmp.append(seg)
    segments = sorted(tmp, key=lambda seg: seg.left)
    print "All relevant segments"
    print "\n".join(map(str, segments))
    cur_seg = None
    interesting_segments = []
    print "Gluing"
    for seg in segments:
        if cur_seg is None:
            cur_seg = seg.copy()
            continue
        if cur_seg.right + 20000 < seg.left:
            interesting_segments.append(cur_seg.copy())
            cur_seg = seg.copy()
        else:
            cur_seg.right = max(cur_seg.right, seg.right)
    if cur_seg is not None:
        interesting_segments.append(cur_seg.copy())
    alignments = []
    for edge in edge_alignments:
        for al in edge.alignments:
            for seg in interesting_segments:
                if al.seg_to.inter(seg):
                    alignments.append(al)
                    break  # add each alignment at most once
    alignments = sorted(alignments, key=lambda al: al.seg_to.left)
    print "All relevant alignments"
    print "\n".join(map(str, alignments))
    print "Interesting segments:", len(interesting_segments), sum(map(len, interesting_segments))
    for seg in interesting_segments:
        print seg
    f = open(output_file, "w")
    tmp = []
    for seg in interesting_segments:
        SeqIO.write(SeqIO.SeqRecord(seg.Seq(), seg.__str__()), f, "fasta")
        tmp.append(seg.Seq())
    f.close()
    f1 = open(output_file + "1", "w")
    SeqIO.write(SeqIO.SeqRecord(("N" * 20000).join(tmp), "concat"), f1, "fasta")
    f1.close()
# NOTE: this fragment assumes the usual script prologue (sys, common.seq_records,
# SeqIO, RC on the path) and an `interest` set of repeat ids defined elsewhere.
import sys

sys.path.append("py")
import common.seq_records
from common import SeqIO
from common.basic import RC

cur = None
repeat = None
dump = open(sys.argv[2]).readlines()
d = dict()
for s in dump:
    s = s.strip()
    if s == "":
        continue
    if s.startswith("#"):
        s = s[1:].split()
        if s[0] == "Repeat":
            repeat = s[1]
        if s[0] in ["All", "Input", "Output"]:
            cur = s[1]
    else:
        if repeat not in interest:
            continue
        sign = s[0]
        s = s[1:]
        if s not in d:
            d[s] = []
        d[s].append((repeat, cur, sign))
for rec in SeqIO.parse_fasta(open(sys.argv[1])):
    id = rec.id.split()[0]
    if id in d:
        tmp = d[id]
        if ("reads", "-") in [(a[1], a[2]) for a in tmp]:
            rec.seq = RC(rec.seq)
        SeqIO.write(common.seq_records.SeqRecord(rec.seq, id + "_" + str(d[id])), sys.stdout, "fasta")
        sys.stderr.write(id + "_" + str(d[id]) + "\n")
def main(flye_dir, rf, dir, edge_id, k):
    params.technology = "nano"
    params.k = k
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    print "Reading graph"
    graph = SimpleGraph().ReadGFA(os.path.join(flye_dir, "assembly_graph.gfa"))
    print "Parsing edge mapping"
    id_map = parseUPaths(flye_dir)
    edge_ids = edge_id.split(",")
    print "Extracting relevant graph component"
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    unique = dict()
    for eid in edge_ids:
        for e in graph.v[graph.e[eid].start].inc:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "l"] = NamedSequence(basic.RC(e.seq[:5000]), e.id[1:] + "l")
                else:
                    unique[e.id + "r"] = NamedSequence(e.seq[-5000:], e.id + "r")
        for e in graph.v[graph.e[eid].end].out:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "r"] = NamedSequence(basic.RC(e.seq[-5000:]), e.id[1:] + "r")
                else:
                    unique[e.id + "l"] = NamedSequence(e.seq[:5000], e.id + "l")
    for c in unique.values():
        print c.id
        SeqIO.write(c, res, "fasta")
    res.close()
    old_ids = []
    for eid in edge_ids:
        for olde in id_map[eid[len("edge_"):]]:
            old_ids.append(basic.Normalize(olde))
    print "Finding reads that align to", edge_ids
    print "Old ids:", old_ids
    relevant_read_ids = set()
    for s in open(os.path.join(flye_dir, "20-repeat", "read_alignment_dump"), "r").readlines():
        s = s.split()
        if s[0] != "Aln":
            continue
        if s[6].split("_")[1] in old_ids:
            relevant_read_ids.add(s[2][1:])
            print s[2][1:], s[6].split("_")[1]
    print "Reading reads"
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in SeqIO.parse_fasta(open(rf, "r")):
        if read.id in relevant_read_ids and len(read) > k * 1.2:
            SeqIO.write(read, res, "fasta")
    res.close()
def OutputBroken(self, output_file):
    output = open(output_file, "w")
    for contig in self.contigs:
        for subcontig in self.Break(contig):
            SeqIO.write(subcontig, output, "fasta")
    output.close()
import sys sys.path.append("py") from common import basic, SeqIO from common.SimpleGraph import SimpleGraph graph = SimpleGraph().ReadGFA(sys.argv[1]) for e_id in graph.e: if basic.isCanonocal(e_id): SeqIO.write(graph.e[e_id], sys.stdout, "fasta")
import sys
import os

sys.path.append("py")
import common.SeqIO as SeqIO
from common.basic import RC

names = sys.argv[2:]
tmp = dict()
for s in names:
    s = s.split(",")
    tmp[s[0]] = s
names = tmp
for rec in SeqIO.parse_fasta(open(sys.argv[1], "r")):
    if rec.id in names:
        s = names[rec.id]
        if "RC" in s:
            rec.seq = RC(rec.seq)
        if "end" in s:
            rec.seq = rec.seq[-1000:]
        if "start" in s:
            rec.seq = rec.seq[:1000]
        SeqIO.write(rec, sys.stdout, "fasta")
# NOTE: this fragment assumes `names` (a list of contig names, optionally with
# an "RC" suffix) and the imports used below (os, sys, SeqIO, RC, sam_parser,
# make_alignment) are defined earlier in the script.
tmp = dict()
for s in names:
    if s.endswith("RC"):
        tmp[s[:-2]] = True
    else:
        tmp[s] = False
names = tmp
contigs_file = os.path.join(sys.argv[4], "contigs.fasta")
contigs_handler = open(contigs_file, "w")
for rec in SeqIO.parse_fasta(open(sys.argv[2], "r")):
    if rec.id in names:
        if names[rec.id]:
            rec.seq = RC(rec.seq)
            rec.id += "RC"
        SeqIO.write(rec, contigs_handler, "fasta")
contigs_handler.close()
alignment_dir = os.path.join(sys.argv[4], "alignment")
if not os.path.exists(alignment_dir):
    os.makedirs(alignment_dir)
alignment = os.path.join(sys.argv[4], "alignment.sam")
make_alignment(contigs_file, [sys.argv[3]], 8, alignment_dir, "pacbio", alignment)
aligned = set()
for rec in sam_parser.Samfile(open(alignment, "r")):
    if rec.is_unmapped:
        continue
    aligned.add(rec.query_name)
def WriteSequences(self, reads, reads_file):
    # type: (Iterable[NamedSequence], str) -> None
    f = open(reads_file, "w")
    for read in reads:
        SeqIO.write(read, f, "fasta")
    f.close()
dir = sys.argv[1]
# NOTE: argv[2] and argv[3] are not named in the original fragment; they are
# assumed to be the reads and consensus FASTA files, in that order.
reads_file = sys.argv[2]
consensus_file = sys.argv[3]
extra_params = sys.argv[4:]
CreateLog(dir)
dd = DirDistributor(dir)
aligner = Aligner(dd)
polisher = Polisher(aligner, dd)
reads = ContigStorage().loadFromFasta(open(reads_file, "r"), num_names=False)
ref = ContigStorage().loadFromFasta(open(consensus_file, "r"), num_names=False)
if "accurate" in extra_params:
    res = []
    als = sorted(aligner.overlapAlign(reads, ref), key=lambda al: al.seg_to.contig.id)
    for rid, rals in itertools.groupby(als, key=lambda al: al.seg_to.contig.id):
        if basic.isCanonocal(rid):
            contig = ref[rid]
            corrected_seq = polisher.polishSegment(contig.asSegment(), list(rals)).seg_from.Seq()
            res.append(Contig(corrected_seq, rid))
else:
    res = polisher.polishMany(reads, list(ref.unique()))
res_file = os.path.join(dir, "res.fasta")
rf = open(res_file, "w")
for c in res:
    SeqIO.write(c, rf, "fasta")
rf.close()
aligner.align_files(res_file, [reads_file], 16, "pacbio", "overlap", os.path.join(dir, "res.sam"))
def PrintFasta(self, out):
    for e in self.e.values():
        SeqIO.write(e, out, "fasta")