def loadFromFile(self, fname, num_names=True):
    # type: (str, bool) -> ContigCollection
    """Fill this collection with contigs read from the file fname and return self.

    When num_names is True (the default) each record id is normalized through
    basic.parseNegativeNumberAndMod before being used as the contig name;
    otherwise the raw record id is kept as-is.
    """
    for record in SeqIO.parse_by_name(fname):
        if num_names:
            name = str(basic.parseNegativeNumberAndMod(record.id))
        else:
            name = record.id
        self.add(Contig(record.seq, name))
    # Returning self allows fluent use: ContigCollection().loadFromFile(f)
    return self
def main(args):
    """Extract reads aligned to repeat edges of a Flye repeat graph.

    Command line (read from sys.argv, not from args):
        argv[1] - Flye output directory
        argv[2] - file describing repeats, parsed by parse() into
                  (repeats, starts, ends)
        argv[3] - reads file
        argv[4] - output directory

    Writes to the output directory:
        "contigs"     - flanking sequences (up to 15kb) entering/leaving each repeat
        "reads.fasta" - all reads that have at least one alignment to a repeat edge
    """
    flye_dir = sys.argv[1]
    repeats, starts, ends = parse(sys.argv[2])
    graph_file, unique_file, disjointigs_file, rep_dir, tmp, their_file = cl_params.parseFlyeDir(flye_dir)
    dump = os.path.join(rep_dir, "read_alignment_dump")
    reads_file = sys.argv[3]
    dir = sys.argv[4]
    CreateLog(dir)
    print " ".join(args)
    print "Printing contigs"
    edges_file = os.path.join(rep_dir, "graph_before_rr.fasta")
    edges = ContigStorage().loadFromFasta(open(edges_file, "r"))
    unique = open(os.path.join(dir, "contigs"), "w")
    # For each path entering a repeat keep only the last 15kb (the part
    # adjacent to the repeat); name it "(eid1_eid2_...)".
    for l in starts:
        seq = "".join(map(lambda eid: edges[eid].seq, l))
        if len(seq) > 15000:
            seq = seq[-15000:]
        SeqIO.write(NamedSequence(seq, "(" + "_".join(l) + ")"), unique, "fasta")
    # For each path leaving a repeat keep the first 15kb and write its
    # reverse complement, so all flanks point towards the repeat.
    for l in ends:
        seq = "".join(map(lambda eid: edges[eid].seq, l))
        if len(seq) > 15000:
            seq = seq[:15000]
        SeqIO.write(NamedSequence(basic.RC(seq), "(" + "_".join(l) + ")"), unique, "fasta")
    unique.close()
    print "Selecting reads"
    # Scan the Flye read-alignment dump. Records for one read are terminated
    # by a line starting with "Chain"; als accumulates the (signed) edge ids
    # the current read aligns to.
    reads = set()
    cur_read = None
    als = []
    for s in open(dump).readlines():
        if s.startswith("Chain"):
            # NOTE(review): the hard-coded read id below looks like leftover
            # debug tracing for one specific read - presumably safe to remove,
            # but kept since it is runtime behavior.
            if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e":
                print als
            # A read is selected as soon as any of its alignments hits a
            # repeat edge.
            for al in als:
                if al in repeats:
                    if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e":
                        print "oppa"
                    reads.add(cur_read)
                    break
            als = []
        else:
            # Alignment line format (whitespace-separated): field 2 holds the
            # read id prefixed by one character; field 6 holds the edge,
            # "<sign?>..._<eid>" - a leading "-" marks reverse orientation.
            s = s.split()
            cur_read = s[2][1:]
            eid = s[6].split("_")[1]
            if s[6][0] == "-":
                eid = "-" + eid
            if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e":
                print eid
            als.append(eid)
    print "Selected", len(reads), "reads"
    print "\n".join(reads)
    print "Reading and printing reads"
    # Stream the reads file once, keeping only the selected reads.
    freads = open(os.path.join(dir, "reads.fasta"), "w")
    cnt = 0
    for read in SeqIO.parse_by_name(reads_file):
        cnt += 1
        if cnt % 10000 == 0:
            print cnt  # progress indicator for large read sets
        if read.id in reads:
            SeqIO.write(read, freads, "fasta")
    freads.close()
def main(flye_dir, rf, dir, edge_id, to_resolve, min_contig_length): params.technology = "nano" basic.ensure_dir_existance(dir) basic.CreateLog(dir) dd = DirDistributor(os.path.join(dir, "alignments")) aligner = Aligner(dd) print " ".join(sys.argv) print "Reading graph" graph = SimpleGraph().ReadDot( os.path.join(flye_dir, "20-repeat", "graph_before_rr.gv")) graph.FillSeq(os.path.join(flye_dir, "20-repeat", "graph_before_rr.fasta"), True) print "Extracting relevant graph component" edge_ids = edge_id.split(",") to_resolve = to_resolve.split(",") to_resolve = [(a, int(b)) for a, b in zip(to_resolve[0::2], to_resolve[1::2])] unique = uniqueNeighbours(edge_ids, graph, min_contig_length) if rf == "none": return print "Finding reads that align to", edge_ids reads_to_resolve = dict() # type: Dict[str, List[str]] for eid, mult in to_resolve: reads_to_resolve[eid] = [] for unique_edge, initial in unique: reads_to_resolve[initial] = [] relevant_read_ids = set() for rid, eid in parseReadDump( os.path.join(flye_dir, "20-repeat", "read_alignment_dump")): if eid in edge_ids: relevant_read_ids.add(rid) print rid, eid for rid, eid in parseReadDump( os.path.join(flye_dir, "20-repeat", "read_alignment_dump")): if rid in relevant_read_ids and eid in reads_to_resolve: reads_to_resolve[eid].append(rid) for eid in reads_to_resolve: reads_to_resolve[eid] = list(set(reads_to_resolve[eid])) print "Reading reads" res_reads = ContigStorage() res = open(os.path.join(dir, "reads.fasta"), "w") for read in SeqIO.parse_by_name(rf): if read.id in relevant_read_ids: res_reads.add(Contig(read.seq, read.id)) SeqIO.write(read, res, "fasta") res.close() random_down = open(os.path.join(dir, "random_down.fasta"), "w") cnt = 0 for read in res_reads: if cnt % 5 == 0: SeqIO.write(read, random_down, "fasta") cnt += 1 random_down.close() res = open(os.path.join(dir, "contigs.fasta"), "w") lcf = open(os.path.join(dir, "contigs.lc"), "w") for eid, mult in to_resolve: repeat_reads = [res_reads[rid] for rid in 
reads_to_resolve[eid]] print reads_to_resolve[eid] print map(str, repeat_reads) split_contigs = splitRepeat(aligner, graph.e[eid].seq, mult, repeat_reads, min_contig_length) if split_contigs is None: print "Failed to resove edge", eid, "Aborting" print "Edge", eid, "was split into", mult, "copies" for contig, contig_reads in split_contigs: print contig.id SeqIO.write(contig, res, "fasta") lcf.write(contig.id + "\n") lcf.write(" ".join([r.id for r in contig_reads]) + "\n") res = open(os.path.join(dir, "contigs.fasta"), "w") for unique_edge, initial in unique: print unique_edge.id SeqIO.write(unique_edge, res, "fasta") lcf.write(unique_edge.id + "\n") lcf.write(" ".join(reads_to_resolve[initial]) + "\n") res.close()