def main():
    """Command-line entry point: build a mutated tandem repeat and write it out.

    The repeat unit is read from ``--unit`` when given; otherwise a random
    unit of ``--unit-len`` characters is generated. ``--seed``, when
    supplied, seeds NumPy's global random generator before anything else
    happens.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--unit", help="Unit of tandem repeat. Default: random")
    cli.add_argument("--unit-len",
                     help="Unit length used in case no unit is provided",
                     type=int,
                     default=200)
    cli.add_argument("--multiplicity",
                     help="Multiplicity of the repeat to generate",
                     required=True,
                     type=int)
    cli.add_argument("--div-rate",
                     help="Average divergence rate between blocks",
                     type=float,
                     required=True)
    cli.add_argument("-o", "--output", help="Output directory", required=True)
    cli.add_argument("--seed", help="Seed", type=int)
    args = cli.parse_args()

    # Seed first so that a randomly generated unit is reproducible as well.
    if args.seed is not None:
        np.random.seed(args.seed)

    if args.unit is not None:
        unit = read_bio_seq(args.unit)
    else:
        unit = gen_random_seq(length=args.unit_len)

    generated = generate_mutations(unit, args.multiplicity, args.div_rate)
    tr, left_flanked_tr, flanked_tr, all_muts = generated
    output_results(tr, left_flanked_tr, flanked_tr, all_muts, args.output)
def get_units(input_dir):
    """Collect the ``polished_2.fasta`` sequence of every sub-directory.

    Args:
        input_dir: Directory whose immediate sub-directories are scanned.
            Non-directory entries are ignored.

    Returns:
        A dict mapping each sub-directory's base name to the value returned
        by ``read_bio_seq`` for ``<sub-directory>/polished_2.fasta``.
    """
    subdirs = (entry for entry in os.scandir(input_dir) if entry.is_dir())
    return {
        os.path.basename(entry.path):
            read_bio_seq(os.path.join(entry.path, 'polished_2.fasta'))
        for entry in subdirs
    }
def write_dot(self, outdir, reffn=None, refhpc=False,
              compact=False, export_pdf=True,
              exact_matcher_bin='/Poppy/abzikadze/DR/bin/exact_matcher'):
    """Write the graph as a Graphviz ``.dot`` file and optionally as a PDF.

    Args:
        outdir: Directory receiving all output files.
        reffn: Optional path to a reference sequence. When given, the
            external exact matcher is run with ``ref.fasta`` as reference
            and ``dbg_<k>-<k+niter>.fasta`` (expected to already exist in
            ``outdir``) as query, and each edge label gets a
            ``mult_real=[a, b]`` line with the per-strand hit counts.
        refhpc: If true, the reference is passed through
            ``compress_homopolymer`` before being written to ``ref.fasta``.
        compact: If true, edges are stored with ``seq=None`` instead of
            their sequence.
        export_pdf: If true and ``self.nx_graph.size() < 500``, ``dot`` is
            invoked to render the ``.dot`` file to PDF.
        exact_matcher_bin: Path to the exact matcher executable. Defaults
            to the location that used to be hard-coded here.
    """
    basename = f'dbg_{self.init_k}-{self.init_k+self.niter}'

    if reffn is not None:
        ref = read_bio_seq(reffn)
        if refhpc:
            ref = compress_homopolymer(ref)
        reffn_outfn = os.path.join(outdir, 'ref.fasta')
        write_bio_seqs(reffn_outfn, {'ref': ref})

        exact_matcher_outfn = os.path.join(outdir, 'edge_matching.tsv')
        edges_fn = os.path.join(outdir, f'{basename}.fasta')
        # Build argv as a list: splitting a formatted string on spaces
        # breaks as soon as any path contains a space.
        exact_matcher_cmd = [
            str(exact_matcher_bin),
            '--output', exact_matcher_outfn,
            '--reference', reffn_outfn,
            '--query', edges_fn,
        ]
        cmd_str = ' '.join(exact_matcher_cmd)
        logger.info(f'Running exact matcher. Cmd: {cmd_str}')
        returncode = subprocess.call(exact_matcher_cmd)
        if returncode != 0:
            logger.info(
                f'Warning: exact matcher exited with code {returncode}')

        # mult[index] == [hits with strand '+', hits with any other strand]
        mult = defaultdict(lambda: [0, 0])
        with open(exact_matcher_outfn) as f:
            f.readline()  # first line is discarded (presumably a header)
            for line in f:
                _, index, _, strand = line.strip().split('\t')
                strand_slot = 0 if strand == '+' else 1
                mult[int(index)][strand_slot] += 1

    outfile = os.path.join(outdir, basename)
    graph = nx.MultiDiGraph()
    for node in self.nx_graph.nodes():
        graph.add_node(node, label=f'{node} len={self.node2len[node]}')
    for edge in self.nx_graph.edges(keys=True):
        index = self.edge2index[edge]
        seq = self.edge2seq[index] if not compact else None
        seqlen = len(self.edge2seq[index])
        label = f'index={index}\nlen={seqlen}'
        if reffn is not None:
            if mult[index] == [0, 0]:
                logger.info(f'Warning: edge {index} has [0, 0] coverage')
            label += f'\nmult_real={mult[index]}'
        graph.add_edge(*edge, label=label, seq=seq)

    dotfile = f'{outfile}.dot'
    nx.drawing.nx_pydot.write_dot(graph, dotfile)
    if export_pdf and self.nx_graph.size() < 500:
        pdffile = f'{outfile}.pdf'
        # https://stackoverflow.com/a/3516106
        cmd = ['dot', '-Tpdf', dotfile, '-o', pdffile]
        call(cmd)
def read_polishing(self, read_unit_filenames):
    """Assemble the polished sequence produced by each polishing iteration.

    Args:
        read_unit_filenames: Mapping ``pos -> (units_fn, longest_read_unit_fn)``.
            Only the directory of ``units_fn`` is used; it is where
            ``polished_<i>.fasta`` is looked up. Every integer between the
            smallest and the largest key must be present as a key.

    Returns:
        A dict mapping iteration number ``i`` (``1..self.params.num_iters``)
        to the concatenation, in increasing position order, of the
        sequences read for that iteration.
    """
    positions = read_unit_filenames.keys()
    first_pos, last_pos = min(positions), max(positions)

    seq_by_pos = {}
    assembled = {}
    for iteration in range(1, self.params.num_iters + 1):
        fasta_name = f'polished_{iteration}.fasta'
        for pos, (units_fn, _) in read_unit_filenames.items():
            fasta_fn = os.path.join(os.path.dirname(units_fn), fasta_name)
            seq_by_pos[pos] = read_bio_seq(fasta_fn)
        assembled[iteration] = ''.join(
            [seq_by_pos[pos] for pos in range(first_pos, last_pos + 1)])
    return assembled
def __init__(self, params):
    """Load the unit, the NCRF report and the read placement from ``params``.

    Args:
        params: Object providing ``unit``, ``ncrf``, ``outdir``,
            ``read_placement``, ``max_pos`` and ``min_pos``.

    Raises:
        FileNotFoundError: If ``params.unit`` is not an existing file.
    """
    self.params = params

    unit_fn = params.unit
    if not os.path.isfile(unit_fn):
        raise FileNotFoundError(f"File {params.unit} is not found")
    self.unit = read_bio_seq(unit_fn)

    report = NCRF_Report(params.ncrf)
    self.ncrf_report = report
    self.motif_alignments = report.get_motif_alignments()

    smart_makedirs(params.outdir)
    self.read_placement = read_reported_positions(params.read_placement)

    self.max_pos = params.max_pos
    self.min_pos = params.min_pos

    if self.max_pos == math.inf:
        # No explicit upper bound: use the furthest end reached by any
        # placed read (reads whose position is None are skipped), with a
        # floor of 0.
        placed_ends = [
            pos + len(self.motif_alignments[r_id])
            for r_id, pos in self.read_placement.items()
            if pos is not None
        ]
        self.max_pos = max([0] + placed_ends)
def main():
    """Command-line entry point: polish a unit from k-mer counts and write it.

    The most frequent k-mers are collected from the reads' NCRF report,
    a polished unit is derived from them and written to ``params.output``
    under the record name ``DXZ1*``.
    """
    params = parse_args()

    # os.path.dirname() returns '' for a bare filename such as "unit.fasta";
    # resolving to an absolute path first always yields a real directory
    # to hand to the directory-creation helper.
    outdir = os.path.dirname(os.path.abspath(params.output))
    smart_makedirs(outdir)

    reads_ncrf_report = NCRF_Report(params.reads_ncrf)
    unit_seq = read_bio_seq(params.unit)

    kmer_counts_reads, most_frequent_kmers = \
        get_most_frequent_kmers(reads_ncrf_report,
                                k=params.k,
                                unit_seq=unit_seq)

    new_unit = get_polished_unit(k=params.k,
                                 most_frequent_kmers=most_frequent_kmers,
                                 kmer_counts_reads=kmer_counts_reads,
                                 unit_seq=unit_seq)

    write_bio_seqs(params.output, {'DXZ1*': new_unit})
def main():
    """Command-line entry point: cut flanked units out of a sequence.

    For every sequence id in the NCRF report, each motif alignment is
    converted to coordinates on the input sequence (alignment gaps ``-``
    removed), extended by up to ``--buf`` characters on both sides, and
    written to ``<outdir>/<seq_id>.fasta``. Record ids have the form
    ``<seq_id>|st_<start>|en_<end>``, where ``end`` is inclusive and both
    coordinates refer to the unit without its flanks.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--ncrf", help="Input NCRF", required=True)
    parser.add_argument("--seq", help="Input sequence", required=True)
    parser.add_argument("--buf",
                        help="Buffer on the sides to include",
                        type=int,
                        default=20)
    parser.add_argument("--outdir", help="Output dir", required=True)
    params = parser.parse_args()

    smart_makedirs(params.outdir)
    ncrf_report = NCRF_Report(params.ncrf)
    input_seq = read_bio_seq(params.seq)
    all_mas = ncrf_report.get_motif_alignments()

    for seq_id, mas in all_mas.items():
        record = ncrf_report.records[seq_id]
        al_start = record.r_st
        units = {}
        start = 0  # offset of the current unit within the ungapped alignment
        for ma in mas:
            seq = record.r_al[ma.start:ma.end].replace('-', '')
            end = start + len(seq)
            unit_st = al_start + start
            unit_en = al_start + end

            # Clamp the left flank at 0: a negative slice start would wrap
            # around to the end of the sequence and return an empty or
            # truncated flank for units close to the beginning.
            flank_st = max(0, unit_st - params.buf)
            seq_st = input_seq[flank_st:unit_st]
            seq_en = input_seq[unit_en:unit_en + params.buf]
            seq = seq_st + seq + seq_en

            ma_id = f'{seq_id}|st_{unit_st}|en_{unit_en - 1}'
            units[ma_id] = seq

            # Sanity check: the flanked unit must be a contiguous substring
            # of the input sequence.
            assert input_seq[unit_st - len(seq_st):
                             unit_en + len(seq_en)] == seq
            start = end

        outfile = os.path.join(params.outdir, f'{seq_id}.fasta')
        write_bio_seqs(outfile, units)
def main():
    """Command-line entry point: run NCRF on the reads in parallel chunks.

    The reads are split into chunks with ``chunks2``, one NCRF process is
    started per chunk (all of them concurrently), and the per-chunk
    reports are merged into ``<outdir>/report.ncrf``: lines matching
    ``end-of-file`` are dropped and every occurrence of ``unit`` is
    replaced by the repeat sequence.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--reads",
                        help="Path to centromeric reads in fasta format",
                        required=True)
    parser.add_argument("--repeat",
                        help="Path to the unit sequence",
                        required=True)
    parser.add_argument("-t", "--threads",
                        help="Number of threads",
                        type=int,
                        default=30)
    parser.add_argument("-o", "--outdir",
                        help="Output directory",
                        required=True)
    parser.add_argument("--ncrf-bin",
                        help="Path to binary of NCRF",
                        default='NCRF')
    params = parser.parse_args()
    smart_makedirs(params.outdir)

    repeat = read_bio_seq(params.repeat)
    reads = read_bio_seqs(params.reads)
    reads_split = chunks2(list(reads.keys()), params.threads)

    # Write each chunk of reads to its own fasta file.
    split_dir = os.path.join(params.outdir, 'split_reads')
    smart_makedirs(split_dir)
    reads_chunks_fn = {}
    for i, chunk_ids in enumerate(reads_split):
        reads_fn = os.path.join(split_dir, f'split_reads_{i}.fasta')
        reads_chunks_fn[i] = reads_fn
        write_bio_seqs(reads_fn, {k: reads[k] for k in chunk_ids})

    # Start one NCRF process per chunk. The chunk file is attached
    # directly as stdin, so no helper `cat` process (and no pipe that
    # would have to be closed and reaped) is needed.
    report_dir = os.path.join(params.outdir, 'ncrf_report')
    smart_makedirs(report_dir)
    ps = []
    ncrf_reports_fn = []
    for i, fn in reads_chunks_fn.items():
        ncrf_report_fn = os.path.join(report_dir, f'report_{i}.ncrf')
        with open(fn) as reads_f, open(ncrf_report_fn, 'w') as report_f:
            # The child inherits both descriptors, so closing them in the
            # parent right after the spawn is safe.
            ps.append(Popen([params.ncrf_bin, f'unit:{repeat}'],
                            stdin=reads_f,
                            stdout=report_f))
        ncrf_reports_fn.append(ncrf_report_fn)
    for p in ps:
        p.wait()

    # Merge the per-chunk reports, dropping the end-of-file marker lines.
    final_report_fn = os.path.join(params.outdir, 'report.ncrf')
    with open(final_report_fn, 'w') as f:
        p1 = Popen(['cat'] + ncrf_reports_fn, stdout=PIPE)
        p2 = Popen(['grep', '-v', '-E', 'end-of-file'],
                   stdin=p1.stdout,
                   stdout=f)
        # Drop the parent's copy of the pipe so `cat` receives SIGPIPE if
        # `grep` exits early, then reap both children.
        p1.stdout.close()
        p2.wait()
        p1.wait()

    # argv list rather than a split string, so an output path containing
    # spaces is passed to sed intact.
    call(['sed', '-i', f's/unit/{repeat}/g', final_report_fn])
def polish(scaffolds, pseudounits, read_pseudounits, reads, monomers,
           outdir, n_iter, n_threads, flye_bin='flye'):
    """Polish every pseudounit of every scaffold with Flye and write results.

    For scaffold ``i`` and pseudounit ``j`` the read segments listed in
    ``read_pseudounits[i][j]`` are cut out (reverse-complemented when on
    the ``-`` strand) and written to ``reads.fasta``. The segment of
    median (high) length is written to ``template.fasta`` and used as the
    polishing target. The polished pseudounits are concatenated and
    written to ``<outdir>/scaffold_<i>/scaffold_<i>.fasta``.

    Args:
        scaffolds: Sequence of scaffolds; only its length (via ``zip`` with
            ``pseudounits``) matters here.
        pseudounits: Per scaffold, an iterable with one entry per
            pseudounit.
        read_pseudounits: ``read_pseudounits[i][j]`` maps a read id
            (whose first element is the key into ``reads``) to
            ``(r_st, r_en, strand)`` with ``r_en`` inclusive.
        reads: Mapping from read name to read sequence.
        monomers: Unused; kept so existing callers keep working.
        outdir: Output directory.
        n_iter: Number of Flye polishing iterations.
        n_threads: Number of Flye threads.
        flye_bin: Flye executable.

    Raises:
        subprocess.CalledProcessError: If Flye exits with a non-zero status.
        statistics.StatisticsError: If a pseudounit has no reads.
    """
    smart_makedirs(outdir)
    for i, (_, scaf_pseudounits) in enumerate(zip(scaffolds, pseudounits)):
        scaf_outdir = os.path.join(outdir, f'scaffold_{i}')
        smart_makedirs(scaf_outdir)

        polished_scaffold = []
        for j, _ in enumerate(scaf_pseudounits):
            pseudounit_outdir = os.path.join(scaf_outdir, f'pseudounit_{j}')
            smart_makedirs(pseudounit_outdir)

            # Cut the read segments that cover this pseudounit.
            pseudounit_reads = {}
            for r_id, (r_st, r_en, strand) in read_pseudounits[i][j].items():
                read_segm_id = f's_{i}_t_{j}_{r_id[0]}_{r_st}_{r_en+1}'
                pseudounit_read = reads[r_id[0]][r_st:r_en + 1]
                if strand == '-':
                    pseudounit_read = RC(pseudounit_read)
                pseudounit_reads[read_segm_id] = pseudounit_read
            reads_fn = os.path.join(pseudounit_outdir, 'reads.fasta')
            write_bio_seqs(reads_fn, pseudounit_reads)

            # Template: first segment (in sorted id order) whose length is
            # the high median. median_high always returns one of the
            # observed lengths, so a match is guaranteed to exist.
            r_units_lens = [len(read) for read in pseudounit_reads.values()]
            med_len = statistics.median_high(r_units_lens)
            template_id = next(
                r_id for r_id in sorted(pseudounit_reads)
                if len(pseudounit_reads[r_id]) == med_len)
            template_read = pseudounit_reads[template_id]
            template_fn = os.path.join(pseudounit_outdir, 'template.fasta')
            write_bio_seqs(template_fn, {template_id: template_read})

            cmd = [
                flye_bin, '--nano-raw', reads_fn,
                '--polish-target', template_fn,
                '-i', n_iter,
                '-t', n_threads,
                '-o', pseudounit_outdir
            ]
            cmd = [str(x) for x in cmd]
            print(' '.join(cmd))
            subprocess.check_call(cmd)

            polished_pseudounit_fn = \
                os.path.join(pseudounit_outdir, f'polished_{n_iter}.fasta')
            try:
                polished_pseudounit = read_bio_seq(polished_pseudounit_fn)
            except FileNotFoundError:
                # No polished output: fall back to the unpolished template
                # that was handed to Flye. (The previous fallback used an
                # undefined name and raised NameError.)
                polished_pseudounit = template_read
            polished_scaffold.append(polished_pseudounit)

        polished_scaffold = ''.join(polished_scaffold)
        polished_scaffold_fn = os.path.join(scaf_outdir,
                                            f'scaffold_{i}.fasta')
        write_bio_seqs(polished_scaffold_fn,
                       {f'scaffold_{i}_niter_{n_iter}': polished_scaffold})