def export_read_units(self, pos2read):
    filenames = {}
    for pos in pos2read:
        outdir = os.path.join(self.params.outdir, f'pos_{pos}')
        units_fn = os.path.join(outdir, 'read_units.fasta')
        median_read_unit_fn = \
            os.path.join(outdir, 'median_read_unit.fasta')
        smart_makedirs(outdir)
        seqs = {}
        median_read_unit, template_read = "", None
        for (r_id, p) in pos2read[pos]:
            r_al = self.motif_alignments[r_id][p].r_al
            r_al = r_al.upper().replace('-', '')
            seqs[f'gen_pos={pos}|r_id={r_id}|r_pos={p}'] = r_al
        # median_high always returns an element of the data, so a unit of
        # exactly this length is guaranteed to exist
        r_units_lens = [len(seq) for seq in seqs.values()]
        med_len = statistics.median_high(r_units_lens)
        for r_id in sorted(seqs.keys()):
            r_al = seqs[r_id]
            if len(r_al) == med_len:
                median_read_unit = r_al
                template_read = r_id
                break
        assert len(seqs[template_read]) == med_len
        assert len(median_read_unit) == med_len
        write_bio_seqs(units_fn, seqs)
        write_bio_seqs(median_read_unit_fn,
                       {template_read: median_read_unit})
        filenames[pos] = (units_fn, median_read_unit_fn)
    return filenames
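# The median-length template selection above recurs in several functions in
# this codebase (see also run_on_read and polish below). A minimal factored-out
# sketch; `select_median_template` is a hypothetical helper, not part of the
# original code:
def select_median_template(seqs):
    """Return (seq_id, seq) for a sequence whose length equals the high
    median of all sequence lengths in `seqs` (dict: id -> sequence)."""
    med_len = statistics.median_high([len(s) for s in seqs.values()])
    for seq_id in sorted(seqs):  # sorted keys make tie-breaking deterministic
        if len(seqs[seq_id]) == med_len:
            return seq_id, seqs[seq_id]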
def write_dot(self, outdir, reffn=None, refhpc=False,
              compact=False, export_pdf=True):
    if reffn is not None:
        # TODO make a parameter
        exact_matcher_bin = '/Poppy/abzikadze/DR/bin/exact_matcher'
        ref = read_bio_seq(reffn)
        if refhpc:
            ref = compress_homopolymer(ref)
        reffn_outfn = os.path.join(outdir, 'ref.fasta')
        write_bio_seqs(reffn_outfn, {'ref': ref})
        exact_matcher_outfn = os.path.join(outdir, 'edge_matching.tsv')
        edges_fn = os.path.join(
            outdir,
            f'dbg_{self.init_k}-{self.init_k+self.niter}.fasta')
        exact_matcher_cmd = \
            f'{exact_matcher_bin} --output {exact_matcher_outfn} ' \
            f'--reference {reffn_outfn} --query {edges_fn}'
        logger.info(f'Running exact matcher. Cmd: {exact_matcher_cmd}')
        exact_matcher_cmd = exact_matcher_cmd.split(' ')
        subprocess.call(exact_matcher_cmd)

        # mult[index] counts exact matches of edge `index` to the reference:
        # [forward-strand hits, reverse-strand hits]
        mult = defaultdict(lambda: [0, 0])
        with open(exact_matcher_outfn) as f:
            f.readline()  # skip header
            for line in f:
                line = line.strip().split('\t')
                _, index, pos, strand = line
                index, pos = int(index), int(pos)
                strand = strand != '+'  # strand == '+' => 0, strand == '-' => 1
                mult[index][strand] += 1

    outfile = os.path.join(outdir,
                           f'dbg_{self.init_k}-{self.init_k+self.niter}')
    graph = nx.MultiDiGraph()
    for node in self.nx_graph.nodes():
        graph.add_node(node, label=f'{node} len={self.node2len[node]}')
    for edge in self.nx_graph.edges(keys=True):
        index = self.edge2index[edge]
        seq = self.edge2seq[index] if not compact else None
        seqlen = len(self.edge2seq[index])
        label = f'index={index}\nlen={seqlen}'
        if reffn is not None:
            # print(mult[index], mult_est[index])
            # assert mult[index] == 0 or mult[index] >= mult_est[index]
            if mult[index] == [0, 0]:
                logger.info(f'Warning: edge {index} has [0, 0] coverage')
            label += f'\nmult_real={mult[index]}'
        graph.add_edge(*edge, label=label, seq=seq)

    dotfile = f'{outfile}.dot'
    nx.drawing.nx_pydot.write_dot(graph, dotfile)
    if export_pdf and self.nx_graph.size() < 500:
        pdffile = f'{outfile}.pdf'
        # https://stackoverflow.com/a/3516106
        cmd = ['dot', '-Tpdf', dotfile, '-o', pdffile]
        call(cmd)
def export_results(self, final_sequences):
    for i in range(1, self.params.num_iters + 1):
        final_sequence = final_sequences[i]
        final_sequence_hpc = compress_homopolymer(final_sequence)
        final_fn = os.path.join(self.params.outdir,
                                f'final_sequence_{i}.fasta')
        write_bio_seqs(final_fn,
                       {f'polished_repeat_{i}': final_sequence})
        final_hpc_fn = os.path.join(self.params.outdir,
                                    f'final_sequence_hpc_{i}.fasta')
        write_bio_seqs(final_hpc_fn,
                       {f'polished_repeat_{i}': final_sequence_hpc})
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input",
                        help="Directory with read units",
                        required=True)
    # parser.add_argument("-r",
    #                     "--reads",
    #                     help="Input reads",
    #                     required=True)
    parser.add_argument("-o", "--outdir",
                        help="Output directory",
                        required=True)
    parser.add_argument("-b", "--bin-size",
                        help="bin size",
                        type=int,
                        default=50)
    params = parser.parse_args()
    smart_makedirs(params.outdir)

    # reads = read_bio_seqs(params.reads)
    units = get_units(params.input)
    unit_lens = sorted(len(unit) for unit in units.values())
    periods, bin_convs, bin_left, bin_right = \
        get_period_info(unit_lens, bin_size=params.bin_size)

    # Currently support only one cluster
    filt_units = \
        {k: v for k, v in units.items() if bin_left <= len(v) <= bin_right}
    filt_units_fn = os.path.join(params.outdir, 'cluster_units.fasta')
    write_bio_seqs(filt_units_fn, filt_units)

    median_unit_id, median_unit, median_len = select_median_seq(filt_units)
    median_read_unit_fn = os.path.join(params.outdir,
                                       'median_read_unit.fasta')
    write_bio_seqs(median_read_unit_fn, {median_unit_id: median_unit})

    cmd = ['flye',
           '--nano-raw', filt_units_fn,
           '--polish-target', median_read_unit_fn,
           '-i', 2,
           '-t', 50,
           '-o', params.outdir]
    cmd = [str(x) for x in cmd]
    subprocess.check_call(cmd)
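# For reference, the subprocess call above corresponds to running Flye's
# polishing mode from the shell (filenames are the ones written above;
# the output directory is illustrative):
#
#   flye --nano-raw cluster_units.fasta \
#        --polish-target median_read_unit.fasta \
#        -i 2 -t 50 -o <outdir>
#
# With `--polish-target`, Flye skips assembly and only polishes the given
# template using the supplied reads; the same invocation pattern is reused
# in run_on_read and polish below.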
def main():
    params = parse_args()
    outdir = os.path.dirname(params.output)
    smart_makedirs(outdir)
    reads_ncrf_report = NCRF_Report(params.reads_ncrf)
    unit_seq = read_bio_seq(params.unit)
    kmer_counts_reads, most_frequent_kmers = \
        get_most_frequent_kmers(reads_ncrf_report,
                                k=params.k,
                                unit_seq=unit_seq)
    new_unit = get_polished_unit(k=params.k,
                                 most_frequent_kmers=most_frequent_kmers,
                                 kmer_counts_reads=kmer_counts_reads,
                                 unit_seq=unit_seq)
    write_bio_seqs(params.output, {'DXZ1*': new_unit})
def run_on_read(seq, seq_id, k, bin_size, outdir):
    print("Getting repetitive kmers")
    rep_kmers = get_repetitive_kmers(seq, k)
    print("Getting union convolution")
    conv, union_conv = get_convolution(rep_kmers)
    print("Getting periods")
    periods, bin_convs, bin_left, bin_right = \
        get_period_info(union_conv, bin_size=bin_size)
    print(f"Selected period = {periods[0]}")
    print("Getting hook")
    hook = get_hook_kmer(conv, bin_left, bin_right)
    if hook is None:
        return
    print("Splitting by hook")
    splits = split_by_hook(seq, hook)

    # Use the split of median (median_high) length as the polishing template
    med_len = \
        statistics.median_high([len(x) for x in splits.values()])
    for r_id in sorted(splits.keys()):
        r_al = splits[r_id]
        if len(r_al) == med_len:
            median_read_unit = r_al
            template_read = r_id
            break

    read_outdir = os.path.join(outdir, seq_id[:8])
    smart_makedirs(read_outdir)
    splits_outfile = os.path.join(read_outdir, 'splits.fasta')
    median_read_unit_fn = os.path.join(read_outdir, 'median_read_unit.fasta')
    write_bio_seqs(splits_outfile, splits)
    write_bio_seqs(median_read_unit_fn, {template_read: median_read_unit})

    print("Running Flye")
    cmd = ['flye',
           '--nano-raw', splits_outfile,
           '--polish-target', median_read_unit_fn,
           '-i', 2,
           '-t', 50,
           '-o', read_outdir]
    cmd = [str(x) for x in cmd]
    subprocess.check_call(cmd)

    plt.hist(union_conv, bins=100)
    plt.title(f'Tandem read convolution, {seq_id[:8]}, period={periods[0]}')
    plt.savefig(os.path.join(read_outdir, f'{seq_id[:8]}.pdf'), format='pdf')
    plt.close()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--ncrf", help="Input NCRF", required=True)
    parser.add_argument("--seq", help="Input sequence", required=True)
    parser.add_argument("--buf",
                        help="Buffer on the sides to include",
                        type=int,
                        default=20)
    parser.add_argument("--outdir", help="Output dir", required=True)
    params = parser.parse_args()
    smart_makedirs(params.outdir)

    ncrf_report = NCRF_Report(params.ncrf)
    input_seq = read_bio_seq(params.seq)
    all_mas = ncrf_report.get_motif_alignments()
    for seq_id, mas in all_mas.items():
        record = ncrf_report.records[seq_id]
        units = {}
        coords = {}
        al_start = record.r_st
        alignment = record.r_al.replace('-', '')
        start = 0
        for ma in mas:
            ma_st = ma.start
            ma_en = ma.end
            seq_al = record.r_al[ma_st:ma_en]
            seq = seq_al.replace('-', '')
            end = start + len(seq)
            seq_st = input_seq[al_start + start - params.buf:al_start + start]
            seq_en = input_seq[al_start + end:end + al_start + params.buf]
            seq = seq_st + seq + seq_en
            ma_id = f'{seq_id}|st_{start + al_start}|en_{end - 1 + al_start}'
            units[ma_id] = seq
            coords[ma_id] = (start + al_start, end + al_start)
            # print(input_seq[start+al_start:end+al_start])
            # print(seq[params.buf:-params.buf])
            assert input_seq[start + al_start - len(seq_st):
                             end + al_start + len(seq_en)] == seq
            start = end
        outfile = os.path.join(params.outdir, f'{seq_id}.fasta')
        write_bio_seqs(outfile, units)
def output_results(tr, left_flanked_tr, flanked_tr, all_muts, output_dir):
    smart_makedirs(output_dir)
    write_bio_seqs(os.path.join(output_dir, 'tandem_repeat.fasta'),
                   {'sim_tr': tr})
    write_bio_seqs(os.path.join(output_dir,
                                'left_flanked_tandem_repeat.fasta'),
                   {'left_flanked_sim_tr': left_flanked_tr})
    write_bio_seqs(os.path.join(output_dir, 'flanked_tandem_repeat.fasta'),
                   {'flanked_sim_tr': flanked_tr})
    with open(os.path.join(output_dir, 'all_muts.json'), 'w') as f:
        all_muts = dict(all_muts)
        all_muts = stringify_keys(all_muts)
        print(json.dumps(all_muts), file=f)
    with open(os.path.join(output_dir, 'simulation.log'), 'w') as f:
        total_n_mut = sum(len(x) for x in all_muts.values())
        print(f'full_tr_len = {len(tr)}', file=f)
        print(f'total_n_mut = {total_n_mut}', file=f)
        for pos, muts in all_muts.items():
            print(f'{pos} : {len(muts)}', file=f)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--reads",
                        help="Path to centromeric reads in fasta format",
                        required=True)
    parser.add_argument("--repeat",
                        help="Path to the unit sequence",
                        required=True)
    parser.add_argument("-t", "--threads",
                        help="Number of threads",
                        type=int,
                        default=30)
    parser.add_argument("-o", "--outdir",
                        help="Output directory",
                        required=True)
    parser.add_argument("--ncrf-bin",
                        help="Path to binary of NCRF",
                        default='NCRF')
    params = parser.parse_args()
    smart_makedirs(params.outdir)

    repeat = read_bio_seq(params.repeat)
    reads = read_bio_seqs(params.reads)

    # Split reads into one chunk per thread and write each chunk to disk
    reads_split = chunks2(list(reads.keys()), params.threads)
    reads_chunks_fn = {}
    for i in range(len(reads_split)):
        reads_chunk = {k: reads[k] for k in reads_split[i]}
        outdir = os.path.join(params.outdir, 'split_reads')
        smart_makedirs(outdir)
        reads_fn = os.path.join(outdir, f'split_reads_{i}.fasta')
        reads_chunks_fn[i] = reads_fn
        write_bio_seqs(reads_fn, reads_chunk)

    # Launch one NCRF process per chunk and wait for all of them
    ps = []
    ncrf_reports_fn = []
    for i, fn in reads_chunks_fn.items():
        outdir = os.path.join(params.outdir, 'ncrf_report')
        smart_makedirs(outdir)
        ncrf_report_fn = os.path.join(outdir, f'report_{i}.ncrf')
        with open(ncrf_report_fn, 'w') as f:
            p1 = Popen(['cat', fn], stdout=PIPE)
            p2 = Popen([params.ncrf_bin, f'unit:{repeat}'],
                       stdin=p1.stdout, stdout=f)
            ps.append(p2)
            ncrf_reports_fn.append(ncrf_report_fn)
    for p in ps:
        p.wait()

    # Concatenate per-chunk reports, dropping NCRF end-of-file markers,
    # and restore the unit sequence in the merged report
    final_report_fn = os.path.join(params.outdir, 'report.ncrf')
    with open(final_report_fn, 'w') as f:
        cmd1 = ['cat'] + ncrf_reports_fn
        p1 = Popen(cmd1, stdout=PIPE)
        cmd2 = 'grep -v -E end-of-file'.split(' ')
        p2 = Popen(cmd2, stdin=p1.stdout, stdout=f)
        p2.wait()
    cmd = f'sed -i s/unit/{repeat}/g {final_report_fn}'
    call(cmd.split(' '))
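# `chunks2` is assumed to split a list into at most n contiguous, roughly
# equal chunks (one per thread). A minimal sketch of that assumed behavior;
# `chunks2_sketch` is a hypothetical reimplementation, not the original helper:
def chunks2_sketch(lst, n):
    """Split lst into min(n, len(lst)) contiguous, near-equal chunks."""
    if not lst:
        return []
    n = min(n, len(lst))
    q, r = divmod(len(lst), n)  # the first r chunks get one extra element
    chunks, start = [], 0
    for i in range(n):
        end = start + q + (1 if i < r else 0)
        chunks.append(lst[start:end])
        start = end
    return chunks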
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--dbg",
                        required=True,
                        help="Directory with DBG output")
    parser.add_argument("-o", "--outdir", required=True)
    parser.add_argument("--ref")
    parser.add_argument("--refhpc", action='store_true')
    parser.add_argument("--no_export_pdf", action='store_true')
    parser.add_argument("-K", type=int, default=40002)
    params = parser.parse_args()
    params.dbg = expandpath(params.dbg)
    params.outdir = expandpath(params.outdir)
    smart_makedirs(params.outdir)

    logfn = os.path.join(params.outdir, 'inc_k.log')
    global logger
    logger = get_logger(logfn, logger_name='centroFlye: inc_k')
    logger.info(f'cmd: {sys.argv}')
    logger.info(f'git hash: {get_git_revision_short_hash()}')

    db_fn = os.path.join(params.dbg, 'graph.fasta')
    align_fn = os.path.join(params.dbg, 'alignments.txt')
    dbg_log_fn = os.path.join(params.dbg, 'dbg.log')
    # Recover the initial k from the command line recorded in dbg.log
    with open(dbg_log_fn) as f:
        cmd = f.readline().strip().split(' ')
        i = 0
        while cmd[i] != '-k':
            i += 1
        k = int(cmd[i+1]) + 1
    logger.info(f'init k = {k}')

    logger.info(f'Reading DBG output from {params.dbg}')
    lpdb = PathMultiKGraph.fromDR(db_fn=db_fn, align_fn=align_fn,
                                  k=k, K=params.K)
    logger.info(f'# vertices = {nx.number_of_nodes(lpdb.nx_graph)}')
    logger.info(f'# edges = {nx.number_of_edges(lpdb.nx_graph)}')
    logger.info('Finished reading DBG output')

    logger.info('Starting increasing k')
    lpdb.transform_fast_until_saturated()
    logger.info('Finished increasing k')
    logger.info(f'# vertices = {nx.number_of_nodes(lpdb.nx_graph)}')
    logger.info(f'# edges = {nx.number_of_edges(lpdb.nx_graph)}')

    outac = os.path.join(params.outdir, 'active_connections.txt')
    logger.info(f'Active connections output to {outac}')
    with open(outac, 'w') as f:
        ac = lpdb.idb_mappings.get_active_connections()
        ac = sorted(list(ac))
        for i, j in ac:
            print(f'{i} {j}', file=f)

    outuniquedges = os.path.join(params.outdir, 'unique_edges.txt')
    logger.info(f'Unique edges output to {outuniquedges}')
    with open(outuniquedges, 'w') as f:
        for index in sorted(list(lpdb.unique_edges)):
            print(index, file=f)

    outdot = os.path.join(params.outdir,
                          f'dbg_{k}-{lpdb.init_k+lpdb.niter}')
    logger.info(f'Writing final graph to {outdot}')
    outfasta = outdot + '.fasta'
    logger.info(f'Writing graph edges to {outfasta}')
    edges = {key: ''.join(edge) for key, edge in lpdb.edge2seq.items()}
    write_bio_seqs(outfasta, edges)
    lpdb.write_dot(params.outdir,
                   compact=True,
                   reffn=params.ref,
                   refhpc=params.refhpc,
                   export_pdf=not params.no_export_pdf)
    logger.info('Finished writing final graph (dot)')

    # Each record: >index_startnode_startlen_endnode_endlen, then the edge seq
    with open(outdot + ".graph", "w") as out:
        for edge in lpdb.nx_graph.edges(keys=True):
            index = lpdb.edge2index[edge]
            seq = lpdb.edge2seq[index]
            out.write(">" + "_".join([str(index),
                                      str(edge[0]),
                                      str(lpdb.node2len[edge[0]]),
                                      str(edge[1]),
                                      str(lpdb.node2len[edge[1]])]) + "\n")
            out.write("".join(seq))
            out.write("\n")
def polish(scaffolds, pseudounits, read_pseudounits, reads, monomers,
           outdir, n_iter, n_threads, flye_bin='flye'):
    def get_template(scaffold, st, en):
        return ''.join(monomers[m_id] for m_id in scaffold[st:en + 1])

    monomers = {m_id[0]: monomer
                for m_id, monomer in monomers.items()
                if m_id[-1] != "'"}
    smart_makedirs(outdir)
    for i, (scaffold, scaf_pseudounits) in \
            enumerate(zip(scaffolds, pseudounits)):
        scaf_outdir = os.path.join(outdir, f'scaffold_{i}')
        smart_makedirs(scaf_outdir)
        polished_scaffold = []
        for j, (s_st, s_en) in enumerate(scaf_pseudounits):
            pseudounit_outdir = os.path.join(scaf_outdir, f'pseudounit_{j}')
            smart_makedirs(pseudounit_outdir)
            # template = get_template(scaffold, s_st, s_en)
            # template_id = f'scaffold_{i}_template_{j}_{scaffold[s_st:s_en+1]}'
            # write_bio_seqs(template_fn, {template_id: template})
            pseudounit_reads = {}
            for r_id, (r_st, r_en, strand) in read_pseudounits[i][j].items():
                read_segm_id = f's_{i}_t_{j}_{r_id[0]}_{r_st}_{r_en+1}'
                pseudounit_read = reads[r_id[0]][r_st:r_en + 1]
                if strand == '-':
                    pseudounit_read = RC(pseudounit_read)
                pseudounit_reads[read_segm_id] = pseudounit_read
            reads_fn = os.path.join(pseudounit_outdir, 'reads.fasta')
            write_bio_seqs(reads_fn, pseudounit_reads)

            # Use the read segment of median (median_high) length as the
            # polishing template
            template_fn = os.path.join(pseudounit_outdir, 'template.fasta')
            template_id, template_read = "", None
            r_units_lens = [len(read) for read in pseudounit_reads.values()]
            med_len = statistics.median_high(r_units_lens)
            for r_id in sorted(pseudounit_reads.keys()):
                read = pseudounit_reads[r_id]
                if len(read) == med_len:
                    template_id = r_id
                    template_read = read
                    break
            assert len(pseudounit_reads[template_id]) == med_len
            assert len(template_read) == med_len
            write_bio_seqs(template_fn, {template_id: template_read})

            cmd = [flye_bin,
                   '--nano-raw', reads_fn,
                   '--polish-target', template_fn,
                   '-i', n_iter,
                   '-t', n_threads,
                   '-o', pseudounit_outdir]
            cmd = [str(x) for x in cmd]
            print(' '.join(cmd))
            subprocess.check_call(cmd)
            try:
                polished_pseudounit_fn = \
                    os.path.join(pseudounit_outdir,
                                 f'polished_{n_iter}.fasta')
                polished_pseudounit = read_bio_seq(polished_pseudounit_fn)
                polished_scaffold.append(polished_pseudounit)
            except FileNotFoundError:
                # Fall back to the unpolished template if Flye produced
                # no output for this pseudounit
                polished_scaffold.append(template_read)
        polished_scaffold = ''.join(polished_scaffold)
        polished_scaffold_fn = os.path.join(scaf_outdir,
                                            f'scaffold_{i}.fasta')
        write_bio_seqs(polished_scaffold_fn,
                       {f'scaffold_{i}_niter_{n_iter}': polished_scaffold})
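# Hypothetical usage sketch for polish(), with argument shapes inferred from
# the code above (not part of the original):
#   scaffolds        -- list of monomer-id sequences, one per scaffold
#   pseudounits      -- per-scaffold list of (start, end) monomer ranges
#   read_pseudounits -- [i][j] -> {read_id: (r_st, r_en, strand)}
#   reads, monomers  -- dicts: id -> sequence
#
#   polish(scaffolds, pseudounits, read_pseudounits, reads, monomers,
#          outdir='polishing', n_iter=2, n_threads=16)
#
# Each pseudounit is polished independently with Flye, and the polished
# pseudounits are concatenated into scaffold_<i>.fasta.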