def export_read_units(self, pos2read):
    """Export read units covering each genomic position to FASTA files.

    For every position in ``pos2read``, writes two files under
    ``<outdir>/pos_<pos>/``:
      * ``read_units.fasta``       -- all read units covering the position
      * ``median_read_unit.fasta`` -- one unit of median length (template)

    Returns a dict mapping pos -> (units_fn, median_read_unit_fn).

    Fix: removed the unused local ``median_r_ids`` from the original.
    """
    filenames = {}
    for pos in pos2read:
        outdir = os.path.join(self.params.outdir, f'pos_{pos}')
        units_fn = os.path.join(outdir, 'read_units.fasta')
        median_read_unit_fn = \
            os.path.join(outdir, 'median_read_unit.fasta')
        smart_makedirs(outdir)

        seqs = {}
        for (r_id, p) in pos2read[pos]:
            r_al = self.motif_alignments[r_id][p].r_al
            # Drop gap symbols to recover the raw read unit sequence.
            r_al = r_al.upper().replace('-', '')
            seqs[f'gen_pos={pos}|r_id={r_id}|r_pos={p}'] = r_al

        # median_high always returns an observed length, so the search
        # below is guaranteed to find a matching unit.
        r_units_lens = [len(seq) for seq in seqs.values()]
        med_len = statistics.median_high(r_units_lens)
        median_read_unit, template_read = "", None
        # Sorted iteration makes the chosen template deterministic.
        for r_id in sorted(seqs.keys()):
            r_al = seqs[r_id]
            if len(r_al) == med_len:
                median_read_unit = r_al
                template_read = r_id
                break
        assert len(seqs[template_read]) == med_len
        assert len(median_read_unit) == med_len

        write_bio_seqs(units_fn, seqs)
        write_bio_seqs(median_read_unit_fn,
                       {template_read: median_read_unit})
        filenames[pos] = (units_fn, median_read_unit_fn)
    return filenames
def main():
    """Entry point: detect rare unique k-mers and their distance edges."""
    args = parse_args()
    smart_makedirs(args.outdir)

    report = NCRF_Report(args.ncrf)
    rare_kmers = get_rare_kmers(report,
                                k=args.k,
                                bottom=args.bottom,
                                top=args.top,
                                coverage=args.coverage,
                                kmer_survival_rate=args.kmer_survival_rate,
                                max_nonuniq=args.max_nonuniq,
                                verbose=args.verbose)
    kmer_clouds = get_reads_kmer_clouds(report,
                                        n=1,
                                        k=args.k,
                                        genomic_kmers=rare_kmers)
    dist_cnt, kmer_index = get_kmer_dist_map(kmer_clouds,
                                             rare_kmers,
                                             min_n=args.min_nreads,
                                             max_n=args.max_nreads,
                                             min_d=args.min_distance,
                                             max_d=args.max_distance,
                                             verbose=args.verbose)
    unique_kmers_ind, dist_edges = filter_dist_tuples(
        dist_cnt, min_coverage=args.min_coverage)
    output_results(kmer_index=kmer_index,
                   min_coverage=args.min_coverage,
                   unique_kmers_ind=unique_kmers_ind,
                   dist_edges=dist_edges,
                   outdir=args.outdir)
def from_read_db_and_assembly(cls, gr_reads, assembly, outdir=None):
    """Build a 3-color graph from a read DB graph and an assembly.

    The assembly DB graph is constructed with the same k as ``gr_reads``
    and combined with it. When ``outdir`` is given, dot/pickle exports
    of both the assembly graph and the 3-color graph are written there.
    """
    k = gr_reads.k
    gr_assembly, _ = sequence_graph.idb_graph.get_db_monostring_set(
        assembly, k=k, outdir=None, mode='assembly')
    color3graph = cls.from_db_graphs(gr_assembly=gr_assembly,
                                     gr_reads=gr_reads)
    if outdir is None:
        return color3graph

    smart_makedirs(outdir)

    def _path(fn):
        # All exports share the same output directory.
        return os.path.join(outdir, fn)

    gr_assembly.write_dot(outfile=_path(f'db_asm_k{k}.dot'),
                          export_pdf=False)
    gr_assembly.write_dot(outfile=_path(f'db_asm_k{k}_compact.dot'),
                          export_pdf=True,
                          compact=True)
    gr_assembly.pickle_dump(_path(f'db_asm_k{k}.pickle'))

    color3graph.write_dot(outfile=_path(f'c3g_k{k}.dot'),
                          export_pdf=True,
                          compact=True)
    color3graph.pickle_dump(_path(f'c3g_k{k}.pickle'))
    return color3graph
def iterative_graph(monostrings, min_k, max_k, outdir, min_mult=5, step=1,
                    starting_graph=None, verbose=True):
    """Iteratively build De Bruijn graphs for k in [min_k, max_k] with step.

    At each iteration the frequent k-mers are extracted from the reads plus
    copies of the contigs from the previous iteration (each contig added
    min_mult times so its k-mers pass the multiplicity threshold), and
    (k+1)-mers passing through complex nodes of the previous graph are
    carried over.

    Returns (all_contigs, dbs, all_frequent_kmers,
    all_frequent_kmers_read_pos), each keyed by k.
    """
    smart_makedirs(outdir)
    dbs, all_contigs = {}, {}
    all_frequent_kmers, all_frequent_kmers_read_pos = {}, {}
    # Raw read strings; monostrings values are assumed to expose .string
    # (an iterable of characters) -- TODO confirm against caller.
    strings = {k: ''.join(v.string) for k, v in monostrings.items()}
    input_strings = strings.copy()
    complex_kp1mers = {}
    if starting_graph is not None:
        # Seed the first iteration with contigs of a previous graph.
        contigs, contig_paths = starting_graph.get_contigs()
        for i in range(len(contigs)):
            # Add each contig min_mult times so its k-mers are kept.
            for j in range(min_mult):
                input_strings[f'contig_k{min_k}_i{i}_j{j}'] = contigs[i]
        complex_kp1mers = get_paths_thru_complex_nodes(starting_graph,
                                                       strings)
    for k in range(min_k, max_k + 1, step):
        frequent_kmers, frequent_kmers_read_pos = \
            get_frequent_kmers(input_strings, k=k, min_mult=min_mult)
        # Carry over (k+1)-mers through complex nodes of the previous graph.
        frequent_kmers.update(complex_kp1mers)
        if verbose:
            print(f'\nk={k}')
            print(f'#frequent kmers = {len(frequent_kmers)}')
        all_frequent_kmers[k] = frequent_kmers
        all_frequent_kmers_read_pos[k] = frequent_kmers_read_pos
        db = DeBruijnGraph(k=k)
        db.add_kmers(frequent_kmers, coverage=frequent_kmers)
        db.collapse_nonbranching_paths()
        if verbose and nx.number_weakly_connected_components(db.graph) > 1:
            # Report fragmentation of the graph.
            print(f'#cc = {nx.number_weakly_connected_components(db.graph)}')
            for cc in nx.weakly_connected_components(db.graph):
                print(len(cc))
            # break
        dbs[k] = db
        dot_file = os.path.join(outdir, f'db_k{k}.dot')
        # pdf_file = os.path.join(outdir, f'db_k{k}.pdf')
        nx.drawing.nx_pydot.write_dot(db.graph, dot_file)
        # os.system(f"dot -Tpdf {dot_file} -o {pdf_file}")
        contigs, contig_paths = db.get_contigs()
        all_contigs[k] = contigs
        # Rebuild the input for the next k: reads + min_mult contig copies.
        input_strings = strings.copy()
        for i in range(len(contigs)):
            for j in range(min_mult):
                input_strings[f'contig_k{k}_i{i}_j{j}'] = contigs[i]
        complex_kp1mers = get_paths_thru_complex_nodes(db, strings)
    return all_contigs, dbs, all_frequent_kmers, all_frequent_kmers_read_pos
def __init__(self, params):
    """Set up the NCRF report, cloud contig, and optional k-mer set."""
    self.params = params
    self.ncrf_report = NCRF_Report(params.ncrf)
    self.cloud_contig = CloudContig(params.min_cloud_kmer_freq)
    if params.genomic_kmers is None:
        self.genomic_kmers = None
    else:
        # Input file lists one genomic k-mer per line.
        with open(params.genomic_kmers) as f:
            self.genomic_kmers = {line.strip() for line in f}
    smart_makedirs(params.outdir)
    self.position_outfile = os.path.join(self.params.outdir,
                                         'read_positions.csv')
def toDB(self, outdir=None, assembly=None):
    """Convert this graph into a DeBruijnGraph.

    Rebuilds the edge set as a networkx MultiDiGraph with (k-1)-mer node
    labels and uniform coverage 1 (true coverage is unknown here), then
    optionally exports dot/pickle files and a 3-color comparison against
    an assembly.

    Fix: the original called ``db.write_dot(outfile=dot_file,
    export_pdf=False)`` twice; the duplicate call is removed.
    """
    nx_graph = nx.MultiDiGraph()
    nodeindex2label = {}
    nodelabel2index = {}
    for i, (u, v, key) in self.index2edge.items():
        seq = tuple(self.edge2seq[i])
        u_label = seq[:self.k - 1]
        v_label = seq[-self.k + 1:]
        nodelabel2index[u_label] = u
        nodelabel2index[v_label] = v
        nodeindex2label[u] = u_label
        nodeindex2label[v] = v_label
        edge_len = len(seq) - self.k + 1
        # Coverage is unknown at this point; assume uniform coverage 1.
        cov = [1] * edge_len
        mean_cov = np.mean(cov)
        label = f'index={i}\nlen={edge_len}\ncov={mean_cov:0.2f}'
        nx_graph.add_edge(u, v, key=key,
                          coverage=cov,
                          edge_index=i,
                          edge_len=edge_len,
                          label=label,
                          string=seq,
                          color='black')
    db = DeBruijnGraph(k=self.k,
                       nx_graph=nx_graph,
                       nodeindex2label=nodeindex2label,
                       nodelabel2index=nodelabel2index)
    if outdir is not None:
        smart_makedirs(outdir)
        dot_file = os.path.join(outdir, f'db_K{self.k}.dot')
        db.write_dot(outfile=dot_file, export_pdf=False)
        dot_compact_file = os.path.join(outdir, f'db_K{self.k}_compact.dot')
        db.write_dot(outfile=dot_compact_file,
                     export_pdf=True,
                     compact=True)
        pickle_file = os.path.join(outdir, f'db_K{self.k}.pickle')
        db.pickle_dump(pickle_file)
        if assembly is not None:
            DeBruijnGraph3Color.from_read_db_and_assembly(
                gr_reads=db, assembly=assembly, outdir=outdir)
    return db
def main():
    """Cluster read units by length and polish a median unit with Flye.

    Fixes: removed dead commented-out code (unused ``--reads`` option and
    read loading) and a spurious ``f`` prefix on the plain string
    ``'--nano-raw'``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input",
                        help="Directory with read units",
                        required=True)
    parser.add_argument("-o", "--outdir",
                        help="Output directory",
                        required=True)
    parser.add_argument("-b", "--bin-size",
                        help="bin size",
                        type=int,
                        default=50)
    params = parser.parse_args()
    smart_makedirs(params.outdir)

    units = get_units(params.input)
    unit_lens = sorted(len(unit) for unit in units.values())
    periods, bin_convs, bin_left, bin_right = \
        get_period_info(unit_lens, bin_size=params.bin_size)

    # Currently support only one cluster
    filt_units = {k: v for k, v in units.items()
                  if bin_left <= len(v) <= bin_right}
    filt_units_fn = os.path.join(params.outdir, 'cluster_units.fasta')
    write_bio_seqs(filt_units_fn, filt_units)

    median_unit_id, median_unit, median_len = select_median_seq(filt_units)
    median_read_unit_fn = os.path.join(params.outdir,
                                       'median_read_unit.fasta')
    write_bio_seqs(median_read_unit_fn, {median_unit_id: median_unit})

    # Polish the median unit against the clustered units with Flye.
    cmd = [
        'flye',
        '--nano-raw', filt_units_fn,
        '--polish-target', median_read_unit_fn,
        '-i', 2,
        '-t', 50,
        '-o', params.outdir
    ]
    cmd = [str(x) for x in cmd]
    subprocess.check_call(cmd)
def __init__(self, params):
    """Load the unit, NCRF report, and read placements; derive bounds."""
    self.params = params
    if not os.path.isfile(params.unit):
        raise FileNotFoundError(f"File {params.unit} is not found")
    self.unit = read_bio_seq(params.unit)
    self.ncrf_report = NCRF_Report(params.ncrf)
    self.motif_alignments = self.ncrf_report.get_motif_alignments()
    smart_makedirs(params.outdir)
    self.read_placement = read_reported_positions(params.read_placement)
    self.min_pos = self.params.min_pos
    self.max_pos = self.params.max_pos
    if self.max_pos == math.inf:
        # No explicit bound given: use the rightmost end over all
        # placed reads (0 if nothing is placed).
        placed_ends = (pos + len(self.motif_alignments[r_id])
                       for r_id, pos in self.read_placement.items()
                       if pos is not None)
        self.max_pos = max(placed_ends, default=0)
def run_on_read(seq, seq_id, k, bin_size, outdir):
    """Detect the tandem-repeat period of one read and polish one unit.

    Finds repetitive k-mers, estimates the unit period from the k-mer
    convolution, splits the read by a 'hook' k-mer, picks a median-length
    split as polishing template, runs Flye on it, and saves a convolution
    histogram. Returns early (None) when no hook k-mer is found.

    Fixes: median/template variables are explicitly initialized and the
    spurious ``f`` prefix on the plain string ``'--nano-raw'`` is removed.
    """
    print("Getting repetitive kmers")
    rep_kmers = get_repetitive_kmers(seq, k)
    print("Getting union convolution")
    conv, union_conv = get_convolution(rep_kmers)
    print("Getting periods")
    periods, bin_convs, bin_left, bin_right = \
        get_period_info(union_conv, bin_size=bin_size)
    print(f"Selected period = {periods[0]}")
    print("Getting hook")
    hook = get_hook_kmer(conv, bin_left, bin_right)
    if hook is None:
        return
    print("Splitting by hook")
    splits = split_by_hook(seq, hook)

    # median_high returns an observed length, so a match always exists;
    # sorted iteration makes the chosen template deterministic.
    med_len = \
        statistics.median_high([len(x) for x in splits.values()])
    median_read_unit, template_read = None, None
    for r_id in sorted(splits.keys()):
        r_al = splits[r_id]
        if len(r_al) == med_len:
            median_read_unit = r_al
            template_read = r_id
            break

    read_outdir = os.path.join(outdir, seq_id[:8])
    smart_makedirs(read_outdir)
    splits_outfile = os.path.join(read_outdir, 'splits.fasta')
    median_read_unit_fn = os.path.join(read_outdir, 'median_read_unit.fasta')
    write_bio_seqs(splits_outfile, splits)
    write_bio_seqs(median_read_unit_fn, {template_read: median_read_unit})

    print("Running Flye")
    cmd = [
        'flye',
        '--nano-raw', splits_outfile,
        '--polish-target', median_read_unit_fn,
        '-i', 2,
        '-t', 50,
        '-o', read_outdir
    ]
    cmd = [str(x) for x in cmd]
    subprocess.check_call(cmd)

    plt.hist(union_conv, bins=100)
    plt.title(f'Tandem read convolution, {seq_id[:8]}, period={periods[0]}')
    plt.savefig(os.path.join(read_outdir, f'{seq_id[:8]}.pdf'), format='pdf')
    plt.close()
def main():
    """Polish the unit sequence using the most frequent read k-mers."""
    params = parse_args()
    outdir = os.path.dirname(params.output)
    smart_makedirs(outdir)

    ncrf_report = NCRF_Report(params.reads_ncrf)
    unit_seq = read_bio_seq(params.unit)
    kmer_counts_reads, most_frequent_kmers = get_most_frequent_kmers(
        ncrf_report, k=params.k, unit_seq=unit_seq)
    polished_unit = get_polished_unit(
        k=params.k,
        most_frequent_kmers=most_frequent_kmers,
        kmer_counts_reads=kmer_counts_reads,
        unit_seq=unit_seq)
    write_bio_seqs(params.output, {'DXZ1*': polished_unit})
def output_results(tr, left_flanked_tr, flanked_tr, all_muts, output_dir):
    """Write simulated repeats, the mutation table, and a short log."""
    smart_makedirs(output_dir)

    # (filename, sequence id, sequence) triples for the FASTA exports.
    fasta_exports = [
        ('tandem_repeat.fasta', 'sim_tr', tr),
        ('left_flanked_tandem_repeat.fasta',
         'left_flanked_sim_tr', left_flanked_tr),
        ('flanked_tandem_repeat.fasta', 'flanked_sim_tr', flanked_tr),
    ]
    for fn, seq_id, seq in fasta_exports:
        write_bio_seqs(os.path.join(output_dir, fn), {seq_id: seq})

    with open(os.path.join(output_dir, 'all_muts.json'), 'w') as f:
        all_muts = dict(all_muts)
        all_muts = stringify_keys(all_muts)
        print(json.dumps(all_muts), file=f)

    with open(os.path.join(output_dir, 'simulation.log'), 'w') as f:
        total_n_mut = sum(len(x) for x in all_muts.values())
        print(f'full_tr_len = {len(tr)}', file=f)
        print(f'total_n_mut = {total_n_mut}', file=f)
        for pos, muts in all_muts.items():
            print(f'{pos} : {len(muts)}', file=f)
def main():
    """Extract per-unit sequences (with flanking buffers) from an NCRF run.

    For every motif alignment of every read, cuts the corresponding
    ungapped sequence out of the input sequence, pads it with ``buf``
    bases on both sides, and writes one FASTA per read id.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--ncrf", help="Input NCRF", required=True)
    parser.add_argument("--seq", help="Input sequence", required=True)
    parser.add_argument("--buf",
                        help="Buffer on the sides to include",
                        type=int,
                        default=20)
    parser.add_argument("--outdir", help="Output dir", required=True)
    params = parser.parse_args()
    smart_makedirs(params.outdir)
    ncrf_report = NCRF_Report(params.ncrf)
    input_seq = read_bio_seq(params.seq)
    all_mas = ncrf_report.get_motif_alignments()
    for seq_id, mas in all_mas.items():
        record = ncrf_report.records[seq_id]
        units = {}
        coords = {}  # NOTE(review): populated but never used downstream
        al_start = record.r_st
        # NOTE(review): `alignment` is computed but unused.
        alignment = record.r_al.replace('-', '')
        start = 0  # running offset inside the ungapped alignment
        for ma in mas:
            ma_st = ma.start
            ma_en = ma.end
            seq_al = record.r_al[ma_st:ma_en]
            # Ungapped unit sequence for this motif alignment.
            seq = seq_al.replace('-', '')
            end = start + len(seq)
            # Flanking context taken directly from the input sequence;
            # slices shorter than `buf` can occur at sequence boundaries.
            seq_st = input_seq[al_start + start - params.buf:al_start + start]
            seq_en = input_seq[al_start + end:end + al_start + params.buf]
            seq = seq_st + seq + seq_en
            ma_id = f'{seq_id}|st_{start + al_start}|en_{end - 1 + al_start}'
            units[ma_id] = seq
            coords[ma_id] = (start + al_start, end + al_start)
            # print(input_seq[start+al_start:end+al_start])
            # print(seq[params.buf:-params.buf])
            # Sanity check: the padded unit matches the input sequence.
            assert input_seq[start + al_start - len(seq_st):
                             end + al_start + len(seq_en)] == seq
            start = end
        outfile = os.path.join(params.outdir, f'{seq_id}.fasta')
        write_bio_seqs(outfile, units)
def main():
    """Run per-read tandem-period detection on every read in the input."""
    parser = argparse.ArgumentParser()
    # Declarative option table keeps the CLI definition in one place.
    for flags, options in [
        (("-i", "--input"), {'help': "input reads", 'required': True}),
        (("-o", "--outdir"), {'help': "Output directory",
                              'required': True}),
        (("-k",), {'help': "kmer len", 'type': int, 'default': 15}),
        (("-b", "--bin-size"), {'help': "bin size",
                                'type': int,
                                'default': 10}),
    ]:
        parser.add_argument(*flags, **options)
    params = parser.parse_args()
    smart_makedirs(params.outdir)

    reads = read_bio_seqs(params.input)
    for read_id, read_seq in reads.items():
        run_on_read(read_seq,
                    seq_id=read_id,
                    k=params.k,
                    bin_size=params.bin_size,
                    outdir=params.outdir)
def get_idb(string_set, mink, maxk, outdir, mode='ont', assembly=None,
            get_min_mult=None, get_frequent_kmers=None, all_kmer_index=None,
            ignored_chars=None, step=1):
    """Build an iterative De Bruijn graph (IDB) for k in [mink, maxk].

    At each k the frequent k-mers of the input strings are extended with
    k-mers of the previous iteration's contigs and with (k+1)-mers passing
    through complex nodes of the previous graph. When ``outdir`` is given,
    dot/pickle exports (and an optional 3-color comparison against
    ``assembly``) are written per k.

    Returns (dbs, all_frequent_kmers), each keyed by k.
    """
    if outdir is not None:
        logger.info(f'IDB will be saved to {outdir}')
        smart_makedirs(outdir)
    else:
        logger.info('IDB will not be saved — outdir is None')
    assert mode in ['ont', 'hifi', 'assembly']
    # Fall back to the module defaults for the pluggable strategies.
    if get_min_mult is None:
        get_min_mult = def_get_min_mult
    if get_frequent_kmers is None:
        get_frequent_kmers = def_get_frequent_kmers
    if all_kmer_index is None:
        all_kmer_index = get_kmer_index(seqs=string_set,
                                        mink=mink,
                                        maxk=maxk,
                                        ignored_chars=ignored_chars)
    else:
        # A precomputed index must cover every k we will iterate over.
        assert all(k in all_kmer_index.keys()
                   for k in range(mink, maxk+1, step))
    dbs = {}
    all_frequent_kmers = {}
    contig_kmers = {}      # k-mer counts of previous iteration's contigs
    complex_kp1mers = {}   # (k+1)-mers through complex nodes, carried over
    for k in range(mink, maxk+1, step):
        min_mult = get_min_mult(k=k, mode=mode)
        kmer_index = all_kmer_index[k]
        frequent_kmers = get_frequent_kmers(kmer_index=kmer_index,
                                            string_set=string_set,
                                            min_mult=min_mult)
        # extending frequent kmers with contig kmers
        for kmer, cnt in contig_kmers.items():
            if kmer not in frequent_kmers:
                frequent_kmers[kmer] = cnt
        # extending frequent kmers with k+1-mers that pass through complex
        # nodes
        for kmer, cnt in complex_kp1mers.items():
            if kmer in frequent_kmers:
                # Carried-over counts must agree with freshly computed ones.
                assert cnt == frequent_kmers[kmer]
        frequent_kmers.update(complex_kp1mers)
        all_frequent_kmers[k] = frequent_kmers
        logger.info(f'k={k}')
        logger.info(f'#frequent kmers = {len(frequent_kmers)}')
        logger.info(f'min_mult = {min_mult}')
        db = DeBruijnGraph.from_kmers(kmers=frequent_kmers.keys(),
                                      kmer_coverages=frequent_kmers,
                                      min_tip_cov=min_mult)
        ncc = nx.number_weakly_connected_components(db.nx_graph)
        logger.info(f'#cc = {ncc}')
        for i, cc in enumerate(nx.weakly_connected_components(db.nx_graph)):
            logger.info(f'{i}-th cc is of size = {len(cc)}')
        if outdir is not None:
            dot_file = os.path.join(outdir, f'db_k{k}.dot')
            db.write_dot(outfile=dot_file, export_pdf=False)
            dot_compact_file = os.path.join(outdir, f'db_k{k}_compact.dot')
            db.write_dot(outfile=dot_compact_file,
                         export_pdf=True,
                         compact=True)
            pickle_file = os.path.join(outdir, f'db_k{k}.pickle')
            db.pickle_dump(pickle_file)
            if assembly is not None:
                sequence_graph.db_graph_3col.DeBruijnGraph3Color.\
                    from_read_db_and_assembly(gr_reads=db,
                                              assembly=assembly,
                                              outdir=outdir)
        dbs[k] = db
        if k < maxk:
            # Prepare the carried-over state for the next iteration.
            contigs, _ = db.get_contigs()
            contig_kmers = Counter()
            for contig in contigs:
                for i in range(len(contig)-(k+1)+1):
                    kmer = contig[i:i+k+1]
                    contig_kmers[kmer] += 1
            complex_kp1mers = \
                db.get_paths_thru_complex_nodes(all_kmer_index[k+1])
    return dbs, all_frequent_kmers
def main():
    """End-to-end cen6 pipeline: graph building, scaffolding, polishing."""
    params = parse_args()
    smart_makedirs(params.outdir)
    print('Reading report')
    sd_report = SD_Report(SD_report_fn=params.sd_report,
                          monomers_fn=params.monomers)
    print('Error correcting monoreads')
    ec_monostrings = error_correction(sd_report.monostrings,
                                      verbose=True,
                                      inplace=False)
    print('Building the graph')
    contigs, dbs, all_frequent_kmers, all_frequent_kmers_read_pos = \
        iterative_graph(ec_monostrings,
                        min_k=params.min_k,
                        max_k=params.max_k,
                        outdir=os.path.join(params.outdir, 'idb'),
                        min_mult=params.min_mult)
    # Work with the graph built at the largest k.
    db = dbs[params.max_k]
    print('Mapping reads to the graph')
    mappings = db.map_reads(ec_monostrings, verbose=False)
    print('Scaffolding')
    scaffolds, edge_scaffolds = scaffolding(db, mappings)
    # Manual connection of two scaffolds for cen6
    # TODO
    # The first k-1 characters of the second scaffold overlap the first.
    cen6_scaffold = scaffolds[0] + scaffolds[1][db.k - 1:]
    cen6_edge_scaffold = edge_scaffolds[0] + edge_scaffolds[1]
    print('Mapping reads to scaffolds')
    r2s = read2scaffolds(db, [cen6_edge_scaffold], mappings, ec_monostrings)
    print('Covering scaffolds with reads')
    scaf_read_coverage = cover_scaffolds_w_reads(r2s,
                                                 mappings,
                                                 [cen6_scaffold],
                                                 ec_monostrings,
                                                 k=db.k)
    print('Extracting pseudounits and reads covering them')
    pseudounits, read_pseudounits = \
        extract_read_pseudounits(scaf_read_coverage,
                                 [cen6_scaffold],
                                 monostrings=ec_monostrings)
    print('Reading centromeric reads')
    centromeric_reads = read_bio_seqs(params.centromeric_reads)
    monomers = read_bio_seqs(params.monomers)
    print('Polishing')
    polish(scaffolds=[cen6_scaffold],
           pseudounits=pseudounits,
           read_pseudounits=read_pseudounits,
           reads=centromeric_reads,
           monomers=monomers,
           outdir=os.path.join(params.outdir, 'polishing'),
           n_iter=params.polish_n_iter,
           n_threads=params.polish_n_threads)
def main():
    """Read a DBG run, iteratively increase k, and export the final graph.

    Recovers the initial k from the first line of dbg.log (the value after
    the '-k' flag, plus one), runs the multi-k transformation until
    saturation, then writes active connections, unique edges, the edge
    FASTA, a dot rendering, and a plain-text .graph file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--dbg",
                        required=True,
                        help="Directory with DBG output")
    parser.add_argument("-o", "--outdir", required=True)
    parser.add_argument("--ref")
    parser.add_argument("--refhpc", action='store_true')
    parser.add_argument("--no_export_pdf", action='store_true')
    parser.add_argument("-K", type=int, default=40002)
    params = parser.parse_args()
    params.dbg = expandpath(params.dbg)
    params.outdir = expandpath(params.outdir)
    smart_makedirs(params.outdir)
    logfn = os.path.join(params.outdir, 'inc_k.log')
    # Module-level logger is (re)bound here for the rest of the run.
    global logger
    logger = get_logger(logfn, logger_name='centroFlye: inc_k')
    logger.info(f'cmd: {sys.argv}')
    logger.info(f'git hash: {get_git_revision_short_hash()}')
    db_fn = os.path.join(params.dbg, 'graph.fasta')
    align_fn = os.path.join(params.dbg, 'alignments.txt')
    dbg_log_fn = os.path.join(params.dbg, 'dbg.log')
    with open(dbg_log_fn) as f:
        # The first log line is the original DBG command; scan it for
        # the '-k' flag to recover the k it was built with.
        cmd = f.readline().strip().split(' ')
        i = 0
        while cmd[i] != '-k':
            i += 1
        k = int(cmd[i+1]) + 1
    logger.info(f'init k = {k}')
    logger.info(f'Reading DBG output from {params.dbg}')
    lpdb = PathMultiKGraph.fromDR(db_fn=db_fn,
                                  align_fn=align_fn,
                                  k=k,
                                  K=params.K)
    logger.info(f'# vertices = {nx.number_of_nodes(lpdb.nx_graph)}')
    logger.info(f'# edges = {nx.number_of_edges(lpdb.nx_graph)}')
    logger.info(f'Finished reading DBG output')
    logger.info(f'Starting increasing k')
    lpdb.transform_fast_until_saturated()
    logger.info(f'Finished increasing k')
    logger.info(f'# vertices = {nx.number_of_nodes(lpdb.nx_graph)}')
    logger.info(f'# edges = {nx.number_of_edges(lpdb.nx_graph)}')
    outac = os.path.join(params.outdir, f'active_connections.txt')
    logger.info(f'Active connections output to {outac}')
    with open(outac, 'w') as f:
        ac = lpdb.idb_mappings.get_active_connections()
        ac = sorted(list(ac))
        for i, j in ac:
            print(f'{i} {j}', file=f)
    outuniquedges = os.path.join(params.outdir, f'unique_edges.txt')
    logger.info(f'Unique edges output to {outuniquedges}')
    with open(outuniquedges, 'w') as f:
        for index in sorted(list(lpdb.unique_edges)):
            print(index, file=f)
    outdot = os.path.join(params.outdir,
                          f'dbg_{k}-{lpdb.init_k+lpdb.niter}')
    logger.info(f'Writing final graph to {outdot}')
    outfasta = outdot + '.fasta'
    logger.info(f'Writing graph edges to {outfasta}')
    edges = {key: ''.join(edge) for key, edge in lpdb.edge2seq.items()}
    write_bio_seqs(outfasta, edges)
    lpdb.write_dot(params.outdir,
                   compact=True,
                   reffn=params.ref,
                   refhpc=params.refhpc,
                   export_pdf=not params.no_export_pdf)
    logger.info(f'Finished writing final graph (dot)')
    # Plain-text .graph export: one record per edge with its endpoint
    # node lengths, followed by the edge sequence.
    out = open(outdot + ".graph", "w")
    for edge in lpdb.nx_graph.edges(keys=True):
        index = lpdb.edge2index[edge]
        seq = lpdb.edge2seq[index]
        out.write(">" + "_".join([str(index),
                                  str(edge[0]),
                                  str(lpdb.node2len[edge[0]]),
                                  str(edge[1]),
                                  str(lpdb.node2len[edge[1]])]) + "\n")
        out.write("".join(seq))
        out.write("\n")
    out.close()
def polish(scaffolds, pseudounits, read_pseudounits, reads, monomers,
           outdir, n_iter, n_threads, flye_bin='flye'):
    """Polish every scaffold pseudounit with Flye and assemble scaffolds.

    For each scaffold and each of its pseudounits: collects the covering
    read segments (reverse-complemented for '-' strand), picks a
    median-length segment as the polishing template, runs Flye, and
    concatenates the polished pseudounits into one FASTA per scaffold.

    Bug fixed: on FileNotFoundError (Flye produced no polished output)
    the original appended the undefined name ``template`` (its assignment
    was commented out), raising NameError; it now falls back to the
    unpolished template read.
    """
    def get_template(scaffold, st, en):
        # Concatenate the monomer sequences spanning scaffold[st..en].
        return ''.join(monomers[m_id] for m_id in scaffold[st:en + 1])

    # Keep forward-strand monomers only (ids ending with "'" are reverse
    # complements), keyed by the first character of the id.
    monomers = {
        m_id[0]: monomer
        for m_id, monomer in monomers.items() if m_id[-1] != "'"
    }
    smart_makedirs(outdir)
    for i, (scaffold, scaf_pseudounits) in enumerate(zip(scaffolds,
                                                         pseudounits)):
        scaf_outdir = os.path.join(outdir, f'scaffold_{i}')
        smart_makedirs(scaf_outdir)
        polished_scaffold = []
        for j, (s_st, s_en) in enumerate(scaf_pseudounits):
            pseudounit_outdir = os.path.join(scaf_outdir, f'pseudounit_{j}')
            smart_makedirs(pseudounit_outdir)
            pseudounit_reads = {}
            for r_id, (r_st, r_en, strand) in read_pseudounits[i][j].items():
                read_segm_id = f's_{i}_t_{j}_{r_id[0]}_{r_st}_{r_en+1}'
                pseudounit_read = reads[r_id[0]][r_st:r_en + 1]
                if strand == '-':
                    pseudounit_read = RC(pseudounit_read)
                pseudounit_reads[read_segm_id] = pseudounit_read
            reads_fn = os.path.join(pseudounit_outdir, 'reads.fasta')
            write_bio_seqs(reads_fn, pseudounit_reads)

            # Pick a median-length read segment as the polishing template;
            # median_high guarantees the length is observed in the data.
            template_fn = os.path.join(pseudounit_outdir, 'template.fasta')
            template_id, template_read = "", None
            r_units_lens = [len(read) for read in pseudounit_reads.values()]
            med_len = statistics.median_high(r_units_lens)
            for r_id in sorted(pseudounit_reads.keys()):
                read = pseudounit_reads[r_id]
                if len(read) == med_len:
                    template_id = r_id
                    template_read = read
                    break
            assert len(pseudounit_reads[template_id]) == med_len
            assert len(template_read) == med_len
            write_bio_seqs(template_fn, {template_id: template_read})

            cmd = [
                flye_bin,
                '--nano-raw', reads_fn,
                '--polish-target', template_fn,
                '-i', n_iter,
                '-t', n_threads,
                '-o', pseudounit_outdir
            ]
            cmd = [str(x) for x in cmd]
            print(' '.join(cmd))
            subprocess.check_call(cmd)
            try:
                polished_pseudounit_fn = \
                    os.path.join(pseudounit_outdir,
                                 f'polished_{n_iter}.fasta')
                polished_pseudounit = read_bio_seq(polished_pseudounit_fn)
                polished_scaffold.append(polished_pseudounit)
            except FileNotFoundError:
                # Flye produced no polished output for this pseudounit:
                # fall back to the unpolished template read.
                polished_scaffold.append(template_read)
        polished_scaffold = ''.join(polished_scaffold)
        polished_scaffold_fn = os.path.join(scaf_outdir,
                                            f'scaffold_{i}.fasta')
        write_bio_seqs(polished_scaffold_fn,
                       {f'scaffold_{i}_niter_{n_iter}': polished_scaffold})
def main():
    """Run NCRF on centromeric reads in parallel chunks and merge reports.

    Fixes: removed a spurious ``f`` prefix on the grep command string and
    added ``p1.stdout.close()`` after wiring each pipeline so the upstream
    ``cat`` receives SIGPIPE if the downstream process exits early
    (recommended by the subprocess documentation).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--reads",
                        help="Path to centromeric reads in fasta format",
                        required=True)
    parser.add_argument("--repeat",
                        help="Path to the unit sequence",
                        required=True)
    parser.add_argument("-t", "--threads",
                        help="Number of threads",
                        type=int,
                        default=30)
    parser.add_argument("-o", "--outdir",
                        help="Output directory",
                        required=True)
    parser.add_argument("--ncrf-bin",
                        help="Path to binary of NCRF",
                        default='NCRF')
    params = parser.parse_args()
    smart_makedirs(params.outdir)

    repeat = read_bio_seq(params.repeat)
    reads = read_bio_seqs(params.reads)

    # Split the reads into one chunk per thread and write each chunk.
    reads_split = chunks2(list(reads.keys()), params.threads)
    reads_chunks_fn = {}
    for i in range(len(reads_split)):
        reads_chunk = {k: reads[k] for k in reads_split[i]}
        outdir = os.path.join(params.outdir, 'split_reads')
        smart_makedirs(outdir)
        reads_fn = os.path.join(outdir, f'split_reads_{i}.fasta')
        reads_chunks_fn[i] = reads_fn
        write_bio_seqs(reads_fn, reads_chunk)

    # Launch one NCRF process per chunk: cat <chunk> | NCRF unit:<repeat>.
    ps = []
    ncrf_reports_fn = []
    for i, fn in reads_chunks_fn.items():
        outdir = os.path.join(params.outdir, 'ncrf_report')
        smart_makedirs(outdir)
        ncrf_report_fn = os.path.join(outdir, f'report_{i}.ncrf')
        with open(ncrf_report_fn, 'w') as f:
            p1 = Popen(['cat', fn], stdout=PIPE)
            p2 = Popen([params.ncrf_bin, f'unit:{repeat}'],
                       stdin=p1.stdout,
                       stdout=f)
            # Close our copy so `cat` gets SIGPIPE if NCRF exits early.
            p1.stdout.close()
            ps.append(p2)
            ncrf_reports_fn.append(ncrf_report_fn)
    for p in ps:
        p.wait()

    # Concatenate per-chunk reports, dropping end-of-file marker lines.
    final_report_fn = os.path.join(params.outdir, 'report.ncrf')
    with open(final_report_fn, 'w') as f:
        cmd1 = ['cat'] + ncrf_reports_fn
        p1 = Popen(cmd1, stdout=PIPE)
        cmd2 = "grep -v -E end-of-file".split(' ')
        p2 = Popen(cmd2, stdin=p1.stdout, stdout=f)
        p1.stdout.close()
        p2.wait()

    # Replace the 'unit' placeholder with the actual repeat sequence.
    cmd = f'sed -i s/unit/{repeat}/g {final_report_fn}'
    call(cmd.split(' '))
def map_strings(self, string_set, overlap_penalty, neutral_symbs,
                only_unique_paths=False, outdir=None,
                n_threads=config['common']['threads'], min_len=None):
    """Map monostrings onto the graph and return per-string edge paths.

    Filters out strings shorter than ``min_len``, computes overlaps and
    chains, optionally dumps mapping diagnostics to ``outdir``, and
    returns a dict r_id -> list of (edge path, e_st, e_en). When
    ``only_unique_paths`` is set, only strings with exactly one path are
    kept (unwrapped from the list).

    Bug fixed: with the default ``min_len=None`` the filter below raised
    ``TypeError: '>=' not supported between 'int' and 'NoneType'``;
    ``None`` is now normalized to 0 (no filtering).
    """
    logger.info('Mapping monostrings to graph')
    logger.info('Computing overlaps')
    if min_len is None:
        logger.info('No min len parameter. All strings will be aligned')
        # len(s) >= 0 always holds, so 0 disables filtering while keeping
        # the comparison below well-defined.
        min_len = 0
    else:
        logger.info(f'Only strings longer than {min_len} will be aligned')
    total_reads = len(string_set)
    string_set = {
        s_id: string
        for s_id, string in string_set.items() if len(string) >= min_len
    }
    long_reads = len(string_set)
    logger.info(f'{long_reads} / {total_reads} longer than {min_len}')
    overlaps, excessive_overlaps = \
        find_overlaps(graph=self,
                      string_set=string_set,
                      overlap_penalty=overlap_penalty,
                      neutral_symbs=neutral_symbs,
                      n_threads=n_threads)
    logger.info('Computing chains')
    chains = get_chains(graph=self, overlaps=overlaps, n_threads=n_threads)
    unmapped = {
        s_id for s_id, s_chains in chains.items() if len(s_chains) == 0
    }
    logger.info(f'{len(unmapped)} strings are unmapped')
    logger.info(f'That includes {len(excessive_overlaps)} reads '
                f'with too many overlaps (see config)')
    unique_mapping = {
        s_id for s_id, s_chains in chains.items() if len(s_chains) == 1
    }
    logger.info(f'{len(unique_mapping)} strings are uniquely mapped')
    if outdir is not None:
        # Dump mapping diagnostics for offline inspection.
        smart_makedirs(outdir)
        unmapped_fn = os.path.join(outdir, 'unmapped.txt')
        with open(unmapped_fn, 'w') as f:
            for s_id in unmapped:
                print(s_id, file=f)
        excessive_fn = os.path.join(outdir, 'excessive.txt')
        with open(excessive_fn, 'w') as f:
            print('s_id', '#overlaps', file=f)
            for s_id, s_overlaps in excessive_overlaps.items():
                print(s_id, len(s_overlaps), file=f)
        chains_fn = os.path.join(outdir, 'chains.txt')
        with open(chains_fn, 'w') as f:
            for s_id, s_chains in chains.items():
                for chain in s_chains:
                    print(s_id, chain, file=f)
        unique_fn = os.path.join(outdir, 'unique.txt')
        with open(unique_fn, 'w') as f:
            for s_id in unique_mapping:
                print(s_id, file=f)
    # Collapse each chain into (edge path, entry offset, exit offset).
    paths = defaultdict(list)
    for r_id, chains_r_id in chains.items():
        for chain in chains_r_id:
            path = [overlap.edge for overlap in chain.overlap_list]
            e_st = chain.overlap_list[0].e_st
            e_en = chain.overlap_list[-1].e_en
            paths[r_id].append((path, e_st, e_en))
    if only_unique_paths:
        paths = {
            r_id: paths_r_id[0]
            for r_id, paths_r_id in paths.items() if len(paths_r_id) == 1
        }
    return paths