示例#1
0
 def export_read_units(self, pos2read):
     """Write per-position read units and a median-length template to FASTA.

     For every position in `pos2read`, dumps all read-unit sequences to
     `pos_<pos>/read_units.fasta` and the single unit whose length equals
     the high median of all unit lengths to
     `pos_<pos>/median_read_unit.fasta`.

     Returns a dict: pos -> (units filename, median unit filename).
     """
     filenames = {}
     for pos, placements in pos2read.items():
         outdir = os.path.join(self.params.outdir, f'pos_{pos}')
         units_fn = os.path.join(outdir, 'read_units.fasta')
         median_read_unit_fn = os.path.join(outdir,
                                            'median_read_unit.fasta')
         smart_makedirs(outdir)
         # Gap-free, upper-cased unit sequences keyed by a unique id.
         seqs = {}
         for r_id, p in placements:
             aligned = self.motif_alignments[r_id][p].r_al
             seqs[f'gen_pos={pos}|r_id={r_id}|r_pos={p}'] = \
                 aligned.upper().replace('-', '')
         # median_high returns an actual element, so the scan below
         # always finds a matching unit.
         med_len = statistics.median_high([len(s) for s in seqs.values()])
         median_read_unit, template_read = "", None
         for seq_id in sorted(seqs):
             if len(seqs[seq_id]) == med_len:
                 median_read_unit = seqs[seq_id]
                 template_read = seq_id
                 break
         assert len(seqs[template_read]) == med_len
         assert len(median_read_unit) == med_len
         write_bio_seqs(units_fn, seqs)
         write_bio_seqs(median_read_unit_fn,
                        {template_read: median_read_unit})
         filenames[pos] = (units_fn, median_read_unit_fn)
     return filenames
示例#2
0
    def write_dot(self, outdir, reffn=None, refhpc=False,
                  compact=False, export_pdf=True):
        """Export the graph to Graphviz dot (and optionally a pdf).

        Parameters:
            outdir: directory for all output files.
            reffn: optional reference fasta; if given, graph edges are
                matched exactly against the reference and per-strand
                match counts are appended to the edge labels.
            refhpc: if True, homopolymer-compress the reference first.
            compact: if True, omit edge sequences from dot attributes.
            export_pdf: also render a pdf via `dot` (skipped when the
                graph has 500 or more edges).
        """
        if reffn is not None:
            # TODO make a parameter
            exact_matcher_bin = '/Poppy/abzikadze/DR/bin/exact_matcher'
            ref = read_bio_seq(reffn)
            if refhpc:
                ref = compress_homopolymer(ref)
            reffn_outfn = os.path.join(outdir, 'ref.fasta')
            write_bio_seqs(reffn_outfn, {'ref': ref})
            exact_matcher_outfn = os.path.join(outdir, 'edge_matching.tsv')
            edges_fn = os.path.join(
                outdir, f'dbg_{self.init_k}-{self.init_k+self.niter}.fasta')
            exact_matcher_cmd = \
                f'{exact_matcher_bin} --output {exact_matcher_outfn} ' \
                f'--reference {reffn_outfn} --query {edges_fn}'
            logger.info(f'Running exact matcher. Cmd: {exact_matcher_cmd}')
            exact_matcher_cmd = exact_matcher_cmd.split(' ')

            subprocess.call(exact_matcher_cmd)

            # mult[edge_index] == [count on '+' strand, count on '-'].
            mult = defaultdict(lambda: [0, 0])
            with open(exact_matcher_outfn) as f:
                f.readline()  # skip the tsv header line
                for line in f:
                    line = line.strip().split('\t')
                    _, index, pos, strand = line
                    index, pos = int(index), int(pos)
                    # '+' -> False (index 0), '-' -> True (index 1).
                    # NOTE(review): a previous comment here claimed
                    # '-' => 0, which contradicts this expression.
                    strand = strand != '+'
                    mult[index][strand] += 1

        outfile = os.path.join(outdir,
                               f'dbg_{self.init_k}-{self.init_k+self.niter}')
        graph = nx.MultiDiGraph()
        for node in self.nx_graph.nodes():
            graph.add_node(node, label=f'{node} len={self.node2len[node]}')
        for edge in self.nx_graph.edges(keys=True):
            index = self.edge2index[edge]
            # In compact mode sequences are omitted to keep the dot small.
            seq = self.edge2seq[index] if not compact else None
            seqlen = len(self.edge2seq[index])
            label = f'index={index}\nlen={seqlen}'
            if reffn is not None:
                # print(mult[index], mult_est[index])
                # assert mult[index] == 0 or mult[index] >= mult_est[index]
                if mult[index] == [0, 0]:
                    logger.info(f'Warning: edge {index} has [0, 0] coverage')
                label += f'\nmult_real={mult[index]}'
            graph.add_edge(*edge,
                           label=label,
                           seq=seq)
        dotfile = f'{outfile}.dot'
        nx.drawing.nx_pydot.write_dot(graph, dotfile)
        if export_pdf and self.nx_graph.size() < 500:
            pdffile = f'{outfile}.pdf'
            # https://stackoverflow.com/a/3516106
            cmd = ['dot', '-Tpdf', dotfile, '-o', pdffile]
            call(cmd)
示例#3
0
    def export_results(self, final_sequences):
        """Write each iteration's polished sequence (raw and HPC) to FASTA."""
        outdir = self.params.outdir
        for it in range(1, self.params.num_iters + 1):
            seq = final_sequences[it]
            seq_id = f'polished_repeat_{it}'

            write_bio_seqs(os.path.join(outdir, f'final_sequence_{it}.fasta'),
                           {seq_id: seq})

            # Homopolymer-compressed version of the same sequence.
            write_bio_seqs(
                os.path.join(outdir, f'final_sequence_hpc_{it}.fasta'),
                {seq_id: compress_homopolymer(seq)})
示例#4
0
def main():
    """Cluster read units by length and polish a median template with Flye.

    Reads units from --input, bins the unit lengths to estimate the
    dominant period, keeps only units whose length falls inside the
    selected bin, writes them out, and runs Flye to polish the
    median-length unit against all kept units.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        "--input",
                        help="Directory with read units",
                        required=True)
    parser.add_argument("-o",
                        "--outdir",
                        help="Output directory",
                        required=True)
    parser.add_argument("-b",
                        "--bin-size",
                        help="bin size",
                        type=int,
                        default=50)
    params = parser.parse_args()
    smart_makedirs(params.outdir)

    units = get_units(params.input)
    unit_lens = sorted(len(unit) for unit in units.values())
    periods, bin_convs, bin_left, bin_right = \
        get_period_info(unit_lens, bin_size=params.bin_size)

    # Currently support only one cluster: keep units inside [bin_left, bin_right].
    filt_units = \
        {k: v for k, v in units.items() if bin_left <= len(v) <= bin_right}
    filt_units_fn = os.path.join(params.outdir, 'cluster_units.fasta')
    write_bio_seqs(filt_units_fn, filt_units)

    median_unit_id, median_unit, median_len = select_median_seq(filt_units)
    median_read_unit_fn = os.path.join(params.outdir, 'median_read_unit.fasta')
    write_bio_seqs(median_read_unit_fn, {median_unit_id: median_unit})

    # '--nano-raw' was an f-string with no placeholders; plain literal.
    cmd = [
        'flye', '--nano-raw', filt_units_fn, '--polish-target',
        median_read_unit_fn, '-i', 2, '-t', 50, '-o', params.outdir
    ]
    cmd = [str(x) for x in cmd]
    subprocess.check_call(cmd)
def main():
    """Polish a unit sequence using the most frequent read k-mers."""
    params = parse_args()
    smart_makedirs(os.path.dirname(params.output))

    ncrf_report = NCRF_Report(params.reads_ncrf)
    unit = read_bio_seq(params.unit)

    # Count k-mers in the reads' NCRF alignments against the unit.
    kmer_counts, top_kmers = get_most_frequent_kmers(ncrf_report,
                                                     k=params.k,
                                                     unit_seq=unit)

    polished_unit = get_polished_unit(k=params.k,
                                      most_frequent_kmers=top_kmers,
                                      kmer_counts_reads=kmer_counts,
                                      unit_seq=unit)

    write_bio_seqs(params.output, {'DXZ1*': polished_unit})
示例#6
0
def run_on_read(seq, seq_id, k, bin_size, outdir):
    """Estimate the tandem period of one read and polish a unit with Flye.

    Finds repetitive k-mers, estimates the repeat period from their
    convolution, splits the read at a "hook" k-mer, then polishes the
    median-length split using all splits as reads.  Outputs (fastas,
    Flye results, a convolution histogram pdf) go to outdir/<seq_id[:8]>.

    Returns early (None) when no hook k-mer is found or the read yields
    no splits.
    """
    print("Getting repetitive kmers")
    rep_kmers = get_repetitive_kmers(seq, k)
    print("Getting union convolution")
    conv, union_conv = get_convolution(rep_kmers)
    print("Getting periods")
    periods, bin_convs, bin_left, bin_right = \
        get_period_info(union_conv, bin_size=bin_size)
    print(f"Selected period = {periods[0]}")
    print("Getting hook")
    hook = get_hook_kmer(conv, bin_left, bin_right)
    if hook is None:
        return
    print("Splitting by hook")
    splits = split_by_hook(seq, hook)
    if not splits:
        # median_high raises StatisticsError on empty data; nothing to do.
        return
    med_len = \
        statistics.median_high([len(x) for x in splits.values()])

    # median_high returns an actual element length, so a template is
    # always found; initialize anyway for clarity.
    median_read_unit, template_read = "", None
    for r_id in sorted(splits.keys()):
        r_al = splits[r_id]
        if len(r_al) == med_len:
            median_read_unit = r_al
            template_read = r_id
            break
    read_outdir = os.path.join(outdir, seq_id[:8])
    smart_makedirs(read_outdir)
    splits_outfile = os.path.join(read_outdir, 'splits.fasta')
    median_read_unit_fn = os.path.join(read_outdir, 'median_read_unit.fasta')
    write_bio_seqs(splits_outfile, splits)
    write_bio_seqs(median_read_unit_fn, {template_read: median_read_unit})

    print("Running Flye")
    # '--nano-raw' was an f-string with no placeholders; plain literal.
    cmd = [
        'flye', '--nano-raw', splits_outfile, '--polish-target',
        median_read_unit_fn, '-i', 2, '-t', 50, '-o', read_outdir
    ]
    cmd = [str(x) for x in cmd]
    subprocess.check_call(cmd)

    plt.hist(union_conv, bins=100)
    plt.title(f'Tandem read convolution, {seq_id[:8]}, period={periods[0]}')
    plt.savefig(os.path.join(read_outdir, f'{seq_id[:8]}.pdf'), format='pdf')
    plt.close()
示例#7
0
def main():
    """Extract unit sequences (with flanking buffers) from an NCRF report.

    For every motif alignment in --ncrf, cuts the matching unit out of
    --seq, extends it by up to --buf bases on each side, and writes one
    fasta per aligned sequence into --outdir.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--ncrf", help="Input NCRF", required=True)
    parser.add_argument("--seq", help="Input sequence", required=True)
    parser.add_argument("--buf",
                        help="Buffer on the sides to include",
                        type=int,
                        default=20)
    parser.add_argument("--outdir", help="Output dir", required=True)
    params = parser.parse_args()

    smart_makedirs(params.outdir)
    ncrf_report = NCRF_Report(params.ncrf)
    input_seq = read_bio_seq(params.seq)
    all_mas = ncrf_report.get_motif_alignments()
    for seq_id, mas in all_mas.items():
        record = ncrf_report.records[seq_id]
        units = {}
        coords = {}  # NOTE(review): collected but never used below
        al_start = record.r_st  # alignment start in input_seq coordinates
        alignment = record.r_al.replace('-', '')
        # Offset of the current unit inside the gap-free alignment.
        start = 0
        for ma in mas:
            ma_st = ma.start
            ma_en = ma.end
            seq_al = record.r_al[ma_st:ma_en]
            seq = seq_al.replace('-', '')
            end = start + len(seq)
            # Flanking context; may be shorter than params.buf at the
            # sequence boundaries (Python slicing clips silently).
            seq_st = input_seq[al_start + start - params.buf:al_start + start]
            seq_en = input_seq[al_start + end:end + al_start + params.buf]
            seq = seq_st + seq + seq_en
            ma_id = f'{seq_id}|st_{start + al_start}|en_{end - 1 + al_start}'
            units[ma_id] = seq
            coords[ma_id] = (start + al_start, end + al_start)
            # print(input_seq[start+al_start:end+al_start])
            # print(seq[params.buf:-params.buf])
            # Sanity check: the flanked unit must match input_seq exactly.
            assert input_seq[start + al_start - len(seq_st):end + al_start +
                             len(seq_en)] == seq
            start = end
        outfile = os.path.join(params.outdir, f'{seq_id}.fasta')
        write_bio_seqs(outfile, units)
示例#8
0
def output_results(tr, left_flanked_tr, flanked_tr, all_muts, output_dir):
    """Write simulated tandem-repeat sequences, mutations, and a log."""
    smart_makedirs(output_dir)

    def _dump(fn, seq_id, seq):
        # One single-record fasta per sequence.
        write_bio_seqs(os.path.join(output_dir, fn), {seq_id: seq})

    _dump('tandem_repeat.fasta', 'sim_tr', tr)
    _dump('left_flanked_tandem_repeat.fasta',
          'left_flanked_sim_tr', left_flanked_tr)
    _dump('flanked_tandem_repeat.fasta', 'flanked_sim_tr', flanked_tr)

    all_muts = stringify_keys(dict(all_muts))
    with open(os.path.join(output_dir, 'all_muts.json'), 'w') as f:
        print(json.dumps(all_muts), file=f)

    with open(os.path.join(output_dir, 'simulation.log'), 'w') as f:
        print(f'full_tr_len = {len(tr)}', file=f)
        total_n_mut = sum(len(x) for x in all_muts.values())
        print(f'total_n_mut = {total_n_mut}', file=f)
        for pos, muts in all_muts.items():
            print(f'{pos} : {len(muts)}', file=f)
示例#9
0
def main():
    """Run NCRF on centromeric reads in parallel chunks and merge reports.

    Splits the reads into one chunk per thread, starts one NCRF process
    per chunk, concatenates the per-chunk reports (dropping end-of-file
    markers), and finally substitutes the real unit sequence back into
    the merged report.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--reads",
                        help="Path to centromeric reads in fasta format",
                        required=True)
    parser.add_argument("--repeat",
                        help="Path to the unit sequence",
                        required=True)
    parser.add_argument("-t",
                        "--threads",
                        help="Number of threads",
                        type=int,
                        default=30)
    parser.add_argument("-o",
                        "--outdir",
                        help="Output directory",
                        required=True)
    parser.add_argument("--ncrf-bin",
                        help="Path to binary of NCRF",
                        default='NCRF')
    params = parser.parse_args()
    smart_makedirs(params.outdir)

    repeat = read_bio_seq(params.repeat)

    reads = read_bio_seqs(params.reads)
    reads_split = chunks2(list(reads.keys()), params.threads)
    reads_chunks_fn = {}
    split_dir = os.path.join(params.outdir, 'split_reads')
    smart_makedirs(split_dir)
    for i, chunk_ids in enumerate(reads_split):
        reads_fn = os.path.join(split_dir, f'split_reads_{i}.fasta')
        write_bio_seqs(reads_fn, {k: reads[k] for k in chunk_ids})
        reads_chunks_fn[i] = reads_fn

    ps = []
    ncrf_reports_fn = []
    report_dir = os.path.join(params.outdir, 'ncrf_report')
    smart_makedirs(report_dir)
    for i, fn in reads_chunks_fn.items():
        ncrf_report_fn = os.path.join(report_dir, f'report_{i}.ncrf')
        with open(ncrf_report_fn, 'w') as f:
            p1 = Popen(['cat', fn], stdout=PIPE)
            p2 = Popen([params.ncrf_bin, f'unit:{repeat}'],
                       stdin=p1.stdout,
                       stdout=f)
            # Close our copy of the pipe so cat gets SIGPIPE if NCRF
            # exits early (see subprocess docs on replacing pipelines).
            p1.stdout.close()
            ps.append(p2)
        ncrf_reports_fn.append(ncrf_report_fn)
    for p in ps:
        p.wait()

    final_report_fn = os.path.join(params.outdir, 'report.ncrf')
    with open(final_report_fn, 'w') as f:
        p1 = Popen(['cat'] + ncrf_reports_fn, stdout=PIPE)
        # Was an f-string with no placeholders; plain literal.
        cmd2 = "grep -v -E end-of-file".split(' ')
        p2 = Popen(cmd2, stdin=p1.stdout, stdout=f)
        p1.stdout.close()
        p2.wait()

    # Replace the 'unit' placeholder with the actual repeat sequence.
    cmd = f'sed -i s/unit/{repeat}/g {final_report_fn}'
    call(cmd.split(' '))
示例#10
0
def main():
    """Increase k of a de Bruijn graph until saturation and export results.

    Reads DBG output (graph.fasta + alignments.txt), recovers the
    initial k from the command line recorded in dbg.log, runs the
    k-increasing transform, then writes active connections, unique
    edges, edge sequences (fasta), a dot rendering, and a .graph dump.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--dbg", required=True,
                        help="Directory with DBG output")
    parser.add_argument("-o", "--outdir", required=True)
    parser.add_argument("--ref")
    parser.add_argument("--refhpc", action='store_true')
    parser.add_argument("--no_export_pdf", action='store_true')
    parser.add_argument("-K", type=int, default=40002)
    params = parser.parse_args()

    params.dbg = expandpath(params.dbg)
    params.outdir = expandpath(params.outdir)
    smart_makedirs(params.outdir)
    logfn = os.path.join(params.outdir, 'inc_k.log')
    global logger
    logger = get_logger(logfn,
                        logger_name='centroFlye: inc_k')
    logger.info(f'cmd: {sys.argv}')
    logger.info(f'git hash: {get_git_revision_short_hash()}')

    db_fn = os.path.join(params.dbg, 'graph.fasta')
    align_fn = os.path.join(params.dbg, 'alignments.txt')
    dbg_log_fn = os.path.join(params.dbg, 'dbg.log')
    with open(dbg_log_fn) as f:
        # Recover k from the recorded command line: value after '-k', +1.
        cmd = f.readline().strip().split(' ')
        i = 0
        while cmd[i] != '-k':
            i += 1
        k = int(cmd[i+1]) + 1
    logger.info(f'init k = {k}')
    logger.info(f'Reading DBG output from {params.dbg}')
    lpdb = PathMultiKGraph.fromDR(db_fn=db_fn, align_fn=align_fn,
                                  k=k, K=params.K)
    logger.info(f'# vertices = {nx.number_of_nodes(lpdb.nx_graph)}')
    logger.info(f'# edges = {nx.number_of_edges(lpdb.nx_graph)}')
    logger.info('Finished reading DBG output')
    logger.info('Starting increasing k')
    lpdb.transform_fast_until_saturated()
    logger.info('Finished increasing k')
    logger.info(f'# vertices = {nx.number_of_nodes(lpdb.nx_graph)}')
    logger.info(f'# edges = {nx.number_of_edges(lpdb.nx_graph)}')

    outac = os.path.join(params.outdir, 'active_connections.txt')
    logger.info(f'Active connections output to {outac}')
    with open(outac, 'w') as f:
        ac = sorted(lpdb.idb_mappings.get_active_connections())
        for i, j in ac:
            print(f'{i} {j}', file=f)

    outuniquedges = os.path.join(params.outdir, 'unique_edges.txt')
    logger.info(f'Unique edges output to {outuniquedges}')
    with open(outuniquedges, 'w') as f:
        for index in sorted(lpdb.unique_edges):
            print(index, file=f)

    outdot = os.path.join(params.outdir, f'dbg_{k}-{lpdb.init_k+lpdb.niter}')
    logger.info(f'Writing final graph to {outdot}')

    outfasta = outdot + '.fasta'
    logger.info(f'Writing graph edges to {outfasta}')
    edges = {key: ''.join(edge) for key, edge in lpdb.edge2seq.items()}
    write_bio_seqs(outfasta, edges)

    lpdb.write_dot(params.outdir, compact=True,
                   reffn=params.ref, refhpc=params.refhpc,
                   export_pdf=not params.no_export_pdf)
    logger.info('Finished writing final graph (dot)')
    # Context manager guarantees the .graph file is closed on error
    # (original used a bare open()/close() pair).
    with open(outdot + ".graph", "w") as out:
        for edge in lpdb.nx_graph.edges(keys=True):
            index = lpdb.edge2index[edge]
            seq = lpdb.edge2seq[index]
            header = "_".join([str(index),
                               str(edge[0]), str(lpdb.node2len[edge[0]]),
                               str(edge[1]), str(lpdb.node2len[edge[1]])])
            out.write(">" + header + "\n")
            out.write("".join(seq))
            out.write("\n")
示例#11
0
def polish(scaffolds,
           pseudounits,
           read_pseudounits,
           reads,
           monomers,
           outdir,
           n_iter,
           n_threads,
           flye_bin='flye'):
    """Polish every scaffold pseudounit with Flye and join the results.

    For each scaffold, each pseudounit gets its own directory containing
    the read segments covering it plus a median-length segment used as
    the polishing template.  If Flye produces no polished output, the
    unpolished template is used as a fallback.  Writes one fasta per
    polished scaffold under outdir/scaffold_<i>/.
    """
    def get_template(scaffold, st, en):
        # Concatenate monomer sequences spanning scaffold[st..en].
        return ''.join(monomers[m_id] for m_id in scaffold[st:en + 1])

    # Keep only non-reverse monomers ("'"-suffixed ids are reverse),
    # keyed by the first character of their id.
    monomers = {
        m_id[0]: monomer
        for m_id, monomer in monomers.items() if m_id[-1] != "'"
    }
    smart_makedirs(outdir)
    for i, (scaffold,
            scaf_pseudounits) in enumerate(zip(scaffolds, pseudounits)):
        scaf_outdir = os.path.join(outdir, f'scaffold_{i}')
        smart_makedirs(scaf_outdir)

        polished_scaffold = []
        for j, (s_st, s_en) in enumerate(scaf_pseudounits):
            pseudounit_outdir = os.path.join(scaf_outdir, f'pseudounit_{j}')
            smart_makedirs(pseudounit_outdir)

            # Collect the (strand-corrected) read segments for this unit.
            pseudounit_reads = {}
            for r_id, (r_st, r_en, strand) in read_pseudounits[i][j].items():
                read_segm_id = f's_{i}_t_{j}_{r_id[0]}_{r_st}_{r_en+1}'
                pseudounit_read = reads[r_id[0]][r_st:r_en + 1]
                if strand == '-':
                    pseudounit_read = RC(pseudounit_read)
                pseudounit_reads[read_segm_id] = pseudounit_read
            reads_fn = os.path.join(pseudounit_outdir, 'reads.fasta')
            write_bio_seqs(reads_fn, pseudounit_reads)

            # Use the segment of (high-)median length as the template;
            # median_high returns an actual element length, so the scan
            # always succeeds.
            template_fn = os.path.join(pseudounit_outdir, 'template.fasta')
            template_id, template_read = "", None
            r_units_lens = [len(read) for read in pseudounit_reads.values()]
            med_len = statistics.median_high(r_units_lens)
            for r_id in sorted(pseudounit_reads.keys()):
                read = pseudounit_reads[r_id]
                if len(read) == med_len:
                    template_id = r_id
                    template_read = read
                    break
            assert len(pseudounit_reads[template_id]) == med_len
            assert len(template_read) == med_len
            write_bio_seqs(template_fn, {template_id: template_read})

            cmd = [
                flye_bin, '--nano-raw', reads_fn, '--polish-target',
                template_fn, '-i', n_iter, '-t', n_threads, '-o',
                pseudounit_outdir
            ]
            cmd = [str(x) for x in cmd]
            print(' '.join(cmd))
            subprocess.check_call(cmd)

            try:
                polished_pseudounit_fn = \
                    os.path.join(pseudounit_outdir,
                                 f'polished_{n_iter}.fasta')
                polished_pseudounit = read_bio_seq(polished_pseudounit_fn)
                polished_scaffold.append(polished_pseudounit)
            except FileNotFoundError:
                # BUG FIX: the original appended `template`, a name that
                # is never defined (its computation is commented out),
                # which raised NameError on this path.  Fall back to the
                # unpolished median-read template instead.
                polished_scaffold.append(template_read)

        polished_scaffold = ''.join(polished_scaffold)
        polished_scaffold_fn = os.path.join(scaf_outdir, f'scaffold_{i}.fasta')
        write_bio_seqs(polished_scaffold_fn,
                       {f'scaffold_{i}_niter_{n_iter}': polished_scaffold})