예제 #1
0
def main():
    """Command-line entry point: build a repeat unit, mutate it, write results.

    The unit is read with ``read_bio_seq`` when ``--unit`` is given; otherwise
    it is produced by ``gen_random_seq`` with ``--unit-len`` as its length.
    The unit, ``--multiplicity`` and ``--div-rate`` are passed to
    ``generate_mutations`` and its four results are forwarded to
    ``output_results`` together with ``--output``.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--unit",
        help="Unit of tandem repeat. Default: random")
    arg_parser.add_argument(
        "--unit-len",
        type=int,
        default=200,
        help="Unit length used in case no unit is provided")
    arg_parser.add_argument(
        "--multiplicity",
        type=int,
        required=True,
        help="Multiplicity of the repeat to generate")
    arg_parser.add_argument(
        "--div-rate",
        type=float,
        required=True,
        help="Average divergence rate between blocks")
    arg_parser.add_argument(
        "-o", "--output",
        required=True,
        help="Output directory")
    arg_parser.add_argument("--seed", type=int, help="Seed")
    args = arg_parser.parse_args()

    # Only touch NumPy's global RNG state when a seed was explicitly supplied.
    if args.seed is not None:
        np.random.seed(args.seed)

    if args.unit is not None:
        unit = read_bio_seq(args.unit)
    else:
        unit = gen_random_seq(length=args.unit_len)

    mutation_results = generate_mutations(unit,
                                          args.multiplicity,
                                          args.div_rate)
    tandem_repeat, left_flanked, flanked, mutations = mutation_results
    output_results(tandem_repeat, left_flanked, flanked, mutations,
                   args.output)
예제 #2
0
def get_units(input_dir):
    """Read ``polished_2.fasta`` from every immediate subdirectory.

    Args:
        input_dir: Directory whose immediate subdirectories are scanned.
            Entries that are not directories are ignored.

    Returns:
        dict mapping each subdirectory's name to whatever ``read_bio_seq``
        returns for ``<subdirectory>/polished_2.fasta``. Empty when
        ``input_dir`` contains no subdirectories.
    """
    units = {}
    # os.scandir holds an open directory handle; the context manager releases
    # it deterministically, including when read_bio_seq raises mid-loop.
    with os.scandir(input_dir) as entries:
        for entry in entries:
            if not entry.is_dir():
                continue
            polished_fn = os.path.join(entry.path, 'polished_2.fasta')
            units[entry.name] = read_bio_seq(polished_fn)
    return units
예제 #3
0
    def write_dot(self, outdir, reffn=None, refhpc=False,
                  compact=False, export_pdf=True,
                  exact_matcher_bin='/Poppy/abzikadze/DR/bin/exact_matcher'):
        """Write the graph as a ``.dot`` file and optionally render a PDF.

        Output files are named ``dbg_<init_k>-<init_k+niter>`` inside
        ``outdir``. When ``reffn`` is given, the reference is written to
        ``ref.fasta``, the exact matcher is run against the
        ``dbg_<...>.fasta`` edge file in ``outdir``, and each edge label gets
        a ``mult_real=[a, b]`` suffix, where ``a`` counts matches reported
        with strand ``'+'`` and ``b`` counts all other matches.

        Args:
            outdir: Directory that receives all output files.
            reffn: Optional path to a reference sequence file.
            refhpc: If true, the reference is passed through
                ``compress_homopolymer`` before matching.
            compact: If true, edges are written with ``seq=None`` instead of
                their sequence.
            export_pdf: If true and ``self.nx_graph.size() < 500``, ``dot`` is
                invoked to render ``<outfile>.pdf``.
            exact_matcher_bin: Path to the exact matcher executable. Defaults
                to the location that used to be hard-coded here.
        """
        basename = f'dbg_{self.init_k}-{self.init_k+self.niter}'

        if reffn is not None:
            ref = read_bio_seq(reffn)
            if refhpc:
                ref = compress_homopolymer(ref)
            reffn_outfn = os.path.join(outdir, 'ref.fasta')
            write_bio_seqs(reffn_outfn, {'ref': ref})
            exact_matcher_outfn = os.path.join(outdir, 'edge_matching.tsv')
            edges_fn = os.path.join(outdir, f'{basename}.fasta')
            # Build argv as a list so paths containing spaces stay intact
            # (splitting a formatted string on ' ' would break them).
            exact_matcher_cmd = [
                exact_matcher_bin,
                '--output', exact_matcher_outfn,
                '--reference', reffn_outfn,
                '--query', edges_fn,
            ]
            cmd_str = ' '.join(exact_matcher_cmd)
            logger.info(f'Running exact matcher. Cmd: {cmd_str}')

            returncode = subprocess.call(exact_matcher_cmd)
            if returncode != 0:
                # Best effort: report the failure but keep going, as before.
                logger.info(
                    f'Warning: exact matcher exited with code {returncode}')

            # mult[index] == [matches with strand '+', all other matches]
            mult = defaultdict(lambda: [0, 0])
            with open(exact_matcher_outfn) as f:
                f.readline()  # discard the first line (presumably a header)
                for line in f:
                    _, index, _, strand = line.strip().split('\t')
                    mult[int(index)][int(strand != '+')] += 1

        outfile = os.path.join(outdir, basename)
        graph = nx.MultiDiGraph()
        for node in self.nx_graph.nodes():
            graph.add_node(node, label=f'{node} len={self.node2len[node]}')
        for edge in self.nx_graph.edges(keys=True):
            index = self.edge2index[edge]
            edge_seq = self.edge2seq[index]
            label = f'index={index}\nlen={len(edge_seq)}'
            if reffn is not None:
                if mult[index] == [0, 0]:
                    logger.info(f'Warning: edge {index} has [0, 0] coverage')
                label += f'\nmult_real={mult[index]}'
            graph.add_edge(*edge,
                           label=label,
                           seq=None if compact else edge_seq)

        dotfile = f'{outfile}.dot'
        nx.drawing.nx_pydot.write_dot(graph, dotfile)
        if export_pdf and self.nx_graph.size() < 500:
            pdffile = f'{outfile}.pdf'
            # https://stackoverflow.com/a/3516106
            subprocess.call(['dot', '-Tpdf', dotfile, '-o', pdffile])
예제 #4
0
 def read_polishing(self, read_unit_filenames):
     """Assemble the full sequence for every polishing iteration.

     Args:
         read_unit_filenames: dict mapping a position to a
             ``(units_fn, longest_read_unit_fn)`` pair. Only the directory
             of ``units_fn`` is used; ``polished_<i>.fasta`` is read from it.

     Returns:
         dict mapping iteration number (1..``self.params.num_iters``) to the
         concatenation of the sequences read for positions ``min..max`` of
         the keys, in increasing position order.
     """
     first_pos = min(read_unit_filenames)
     last_pos = max(read_unit_filenames)
     positions = range(first_pos, last_pos + 1)

     seq_by_pos = {}
     assembled = {}
     for iteration in range(1, self.params.num_iters + 1):
         fasta_name = f'polished_{iteration}.fasta'
         for pos, (units_fn, _) in read_unit_filenames.items():
             fasta_path = os.path.join(os.path.dirname(units_fn), fasta_name)
             seq_by_pos[pos] = read_bio_seq(fasta_path)
         assembled[iteration] = ''.join(seq_by_pos[p] for p in positions)
     return assembled
예제 #5
0
 def __init__(self, params):
     """Load the unit, the NCRF report and the read placement from *params*.

     Reads ``params.unit``, ``params.ncrf``, ``params.outdir``,
     ``params.read_placement``, ``params.max_pos`` and ``params.min_pos``.
     When ``params.max_pos`` is ``math.inf`` the upper bound is derived
     instead: the largest ``pos + len(motif_alignment)`` over all reads with
     a non-``None`` position (0 if there are none).

     Raises:
         FileNotFoundError: if ``params.unit`` is not an existing file.
     """
     self.params = params
     unit_fn = params.unit
     if not os.path.isfile(unit_fn):
         raise FileNotFoundError(f"File {unit_fn} is not found")
     self.unit = read_bio_seq(unit_fn)

     self.ncrf_report = NCRF_Report(params.ncrf)
     self.motif_alignments = self.ncrf_report.get_motif_alignments()
     smart_makedirs(params.outdir)
     self.read_placement = read_reported_positions(params.read_placement)

     self.max_pos = self.params.max_pos
     self.min_pos = self.params.min_pos
     if self.max_pos != math.inf:
         return

     # No explicit upper bound: use the furthest end of any placed read.
     furthest_end = 0
     for read_id, placed_at in self.read_placement.items():
         if placed_at is not None:
             alignment = self.motif_alignments[read_id]
             furthest_end = max(furthest_end, placed_at + len(alignment))
     self.max_pos = furthest_end
def main():
    """Command-line entry point: derive a new unit and write it out.

    Loads the NCRF report (``params.reads_ncrf``) and the unit sequence
    (``params.unit``), feeds them to ``get_most_frequent_kmers`` and then
    ``get_polished_unit``, and writes the result to ``params.output`` under
    the record id ``DXZ1*``. The parent directory of the output path is
    created first via ``smart_makedirs``.
    """
    args = parse_args()
    smart_makedirs(os.path.dirname(args.output))

    ncrf_report = NCRF_Report(args.reads_ncrf)
    unit = read_bio_seq(args.unit)

    kmer_stats = get_most_frequent_kmers(ncrf_report,
                                         k=args.k,
                                         unit_seq=unit)
    read_kmer_counts, frequent_kmers = kmer_stats

    polished_unit = get_polished_unit(k=args.k,
                                      most_frequent_kmers=frequent_kmers,
                                      kmer_counts_reads=read_kmer_counts,
                                      unit_seq=unit)

    write_bio_seqs(args.output, {'DXZ1*': polished_unit})
예제 #7
0
def main():
    """Cut every motif alignment out of the input sequence, with flanks.

    For each sequence id in the NCRF report, walks its motif alignments in
    order, strips gap characters (``-``) from the aligned read segment, and
    pads the unit with up to ``--buf`` characters of ``--seq`` on each side.
    The left flank is clipped at the start of the input sequence. The units
    of one sequence id are written to ``<outdir>/<seq_id>.fasta`` with record
    ids ``<seq_id>|st_<start>|en_<last index>``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--ncrf", help="Input NCRF", required=True)
    parser.add_argument("--seq", help="Input sequence", required=True)
    parser.add_argument("--buf",
                        help="Buffer on the sides to include",
                        type=int,
                        default=20)
    parser.add_argument("--outdir", help="Output dir", required=True)
    params = parser.parse_args()

    smart_makedirs(params.outdir)
    ncrf_report = NCRF_Report(params.ncrf)
    input_seq = read_bio_seq(params.seq)
    all_mas = ncrf_report.get_motif_alignments()
    for seq_id, mas in all_mas.items():
        record = ncrf_report.records[seq_id]
        units = {}
        al_start = record.r_st
        start = 0  # offset of the current unit within the ungapped alignment
        for ma in mas:
            seq = record.r_al[ma.start:ma.end].replace('-', '')
            end = start + len(seq)
            unit_st = al_start + start
            unit_en = al_start + end

            # Clamp at 0: a negative slice start would wrap around to the end
            # of the sequence and silently drop the whole left flank.
            flank_st = max(0, unit_st - params.buf)
            seq_st = input_seq[flank_st:unit_st]
            seq_en = input_seq[unit_en:unit_en + params.buf]
            seq = seq_st + seq + seq_en

            ma_id = f'{seq_id}|st_{unit_st}|en_{unit_en - 1}'
            units[ma_id] = seq
            # Sanity check: the ungapped alignment must be a verbatim
            # substring of the input sequence at the reported coordinates.
            assert input_seq[unit_st - len(seq_st):unit_en + len(seq_en)] \
                == seq, f'{ma_id} does not match the input sequence'
            start = end
        outfile = os.path.join(params.outdir, f'{seq_id}.fasta')
        write_bio_seqs(outfile, units)
예제 #8
0
def main():
    """Run NCRF on the reads in parallel chunks and merge the reports.

    Steps:
      1. Split the reads with ``chunks2(read_ids, --threads)`` and write each
         chunk to ``<outdir>/split_reads/split_reads_<i>.fasta``.
      2. Start one NCRF process per chunk (all at once), each writing to
         ``<outdir>/ncrf_report/report_<i>.ncrf``, and wait for all of them.
      3. Concatenate the per-chunk reports into ``<outdir>/report.ncrf``,
         dropping lines that match ``end-of-file``.
      4. Replace every ``unit`` in the merged report with the repeat
         sequence using ``sed -i``.

    Exit codes of the external tools are not checked.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--reads",
                        help="Path to centromeric reads in fasta format",
                        required=True)
    parser.add_argument("--repeat",
                        help="Path to the unit sequence",
                        required=True)
    parser.add_argument("-t",
                        "--threads",
                        help="Number of threads",
                        type=int,
                        default=30)
    parser.add_argument("-o",
                        "--outdir",
                        help="Output directory",
                        required=True)
    parser.add_argument("--ncrf-bin",
                        help="Path to binary of NCRF",
                        default='NCRF')
    params = parser.parse_args()
    smart_makedirs(params.outdir)

    repeat = read_bio_seq(params.repeat)

    reads = read_bio_seqs(params.reads)
    reads_split = chunks2(list(reads), params.threads)

    split_dir = os.path.join(params.outdir, 'split_reads')
    smart_makedirs(split_dir)
    reads_chunks_fn = {}
    for i, chunk_ids in enumerate(reads_split):
        reads_fn = os.path.join(split_dir, f'split_reads_{i}.fasta')
        write_bio_seqs(reads_fn, {r_id: reads[r_id] for r_id in chunk_ids})
        reads_chunks_fn[i] = reads_fn

    report_dir = os.path.join(params.outdir, 'ncrf_report')
    smart_makedirs(report_dir)
    ncrf_procs = []
    ncrf_reports_fn = []
    for i, fn in reads_chunks_fn.items():
        ncrf_report_fn = os.path.join(report_dir, f'report_{i}.ncrf')
        # Feed the chunk straight to NCRF's stdin instead of piping it through
        # a `cat` child that would never be waited on. The child inherits its
        # own copies of both descriptors, so closing ours right away is safe.
        with open(fn, 'rb') as reads_in, \
                open(ncrf_report_fn, 'w') as report_out:
            ncrf_procs.append(Popen([params.ncrf_bin, f'unit:{repeat}'],
                                    stdin=reads_in,
                                    stdout=report_out))
        ncrf_reports_fn.append(ncrf_report_fn)
    for proc in ncrf_procs:
        proc.wait()

    final_report_fn = os.path.join(params.outdir, 'report.ncrf')
    with open(final_report_fn, 'w') as f:
        # With no reports `cat` would block reading our stdin; leave the
        # merged report empty instead.
        if ncrf_reports_fn:
            cat_proc = Popen(['cat'] + ncrf_reports_fn, stdout=PIPE)
            grep_proc = Popen(['grep', '-v', '-E', 'end-of-file'],
                              stdin=cat_proc.stdout,
                              stdout=f)
            # Drop our copy of the pipe so `cat` gets SIGPIPE if grep exits
            # early, then reap both children.
            cat_proc.stdout.close()
            grep_proc.wait()
            cat_proc.wait()

    # argv is built as a list so an output path with spaces is not split.
    call(['sed', '-i', f's/unit/{repeat}/g', final_report_fn])
예제 #9
0
def polish(scaffolds,
           pseudounits,
           read_pseudounits,
           reads,
           monomers,
           outdir,
           n_iter,
           n_threads,
           flye_bin='flye'):
    """Polish every pseudounit of every scaffold with Flye and join them.

    For scaffold ``i`` and pseudounit ``j`` the working directory is
    ``<outdir>/scaffold_<i>/pseudounit_<j>``. The read segments covering the
    pseudounit are written to ``reads.fasta`` (reverse-complemented with
    ``RC`` when their strand is ``'-'``). The template written to
    ``template.fasta`` is the segment whose length equals the high median of
    all segment lengths, taking the first such segment in sorted id order.
    Flye is then run in ``--polish-target`` mode and ``polished_<n_iter>.fasta``
    is read back. The polished pseudounits of a scaffold are concatenated and
    written to ``<outdir>/scaffold_<i>/scaffold_<i>.fasta``.

    Args:
        scaffolds: Sequence of scaffolds; paired positionally with
            ``pseudounits`` (only the pairing is used here).
        pseudounits: Per scaffold, an iterable of ``(start, end)`` pairs.
        read_pseudounits: ``read_pseudounits[i][j]`` maps a read key to
            ``(r_st, r_en, strand)``; ``r_en`` is inclusive. The first element
            of the read key indexes ``reads``.
        reads: Mapping from read id to read sequence.
        monomers: Not used by the current implementation; kept so existing
            callers keep working.
        outdir: Root output directory.
        n_iter: Number of Flye polishing iterations (passed as ``-i``).
        n_threads: Number of Flye threads (passed as ``-t``).
        flye_bin: Flye executable.

    Raises:
        statistics.StatisticsError: if a pseudounit has no read segments.
        subprocess.CalledProcessError: if Flye exits with a non-zero status.
    """
    smart_makedirs(outdir)
    for i, (_scaffold,
            scaf_pseudounits) in enumerate(zip(scaffolds, pseudounits)):
        scaf_outdir = os.path.join(outdir, f'scaffold_{i}')
        smart_makedirs(scaf_outdir)

        polished_scaffold = []
        for j, _bounds in enumerate(scaf_pseudounits):
            pseudounit_outdir = os.path.join(scaf_outdir, f'pseudounit_{j}')
            smart_makedirs(pseudounit_outdir)

            pseudounit_reads = {}
            for r_id, (r_st, r_en, strand) in read_pseudounits[i][j].items():
                read_segm_id = f's_{i}_t_{j}_{r_id[0]}_{r_st}_{r_en+1}'
                pseudounit_read = reads[r_id[0]][r_st:r_en + 1]
                if strand == '-':
                    pseudounit_read = RC(pseudounit_read)
                pseudounit_reads[read_segm_id] = pseudounit_read
            reads_fn = os.path.join(pseudounit_outdir, 'reads.fasta')
            write_bio_seqs(reads_fn, pseudounit_reads)

            # median_high always returns one of the observed lengths, so a
            # segment of exactly that length is guaranteed to exist.
            med_len = statistics.median_high(
                [len(read) for read in pseudounit_reads.values()])
            template_id = next(
                segm_id for segm_id in sorted(pseudounit_reads)
                if len(pseudounit_reads[segm_id]) == med_len)
            template_read = pseudounit_reads[template_id]
            template_fn = os.path.join(pseudounit_outdir, 'template.fasta')
            write_bio_seqs(template_fn, {template_id: template_read})

            cmd = [
                flye_bin, '--nano-raw', reads_fn, '--polish-target',
                template_fn, '-i', n_iter, '-t', n_threads, '-o',
                pseudounit_outdir
            ]
            cmd = [str(x) for x in cmd]
            print(' '.join(cmd))
            subprocess.check_call(cmd)

            polished_pseudounit_fn = os.path.join(
                pseudounit_outdir, f'polished_{n_iter}.fasta')
            try:
                polished_pseudounit = read_bio_seq(polished_pseudounit_fn)
            except FileNotFoundError:
                # The old fallback referenced an undefined name `template`
                # (NameError). Use the unpolished template instead.
                # NOTE(review): confirm a raw read segment is the desired
                # fallback rather than a monomer-derived template.
                polished_pseudounit = template_read
            polished_scaffold.append(polished_pseudounit)

        polished_scaffold = ''.join(polished_scaffold)
        polished_scaffold_fn = os.path.join(scaf_outdir, f'scaffold_{i}.fasta')
        write_bio_seqs(polished_scaffold_fn,
                       {f'scaffold_{i}_niter_{n_iter}': polished_scaffold})