Пример #1
0
def main():
    """Merge per-read sequence variant statistics from multiple Megalodon
    result directories into a single, newly created variants database.
    """
    args = get_parser().parse_args()

    megalodon.mkdir(args.output_megalodon_results_dir, False)
    # destination database opened for writing; location-index placement
    # (memory vs. disk) is user-controlled for very large runs
    merged_db = variants.VarsDb(
        mh.get_megalodon_fn(args.output_megalodon_results_dir, mh.PR_VAR_NAME),
        read_only=False,
        loc_index_in_memory=not args.var_locations_on_disk)

    for in_dir in args.megalodon_results_dirs:
        # full read only mode with no indices read into memory
        in_db = variants.VarsDb(
            mh.get_megalodon_fn(in_dir, mh.PR_VAR_NAME),
            read_only=True,
            chrm_index_in_memory=False,
            alt_index_in_memory=False,
            uuid_index_in_memory=False)
        for data_rec in in_db.iter_data():
            (score, uuid, strand, alt_seq, ref_seq, pos, var_name,
             test_end, test_start, chrm, chrm_len) = data_rec
            chrm_dbid = merged_db.get_chrm_id_or_insert(chrm, chrm_len)
            loc_dbid = merged_db.get_loc_id_or_insert(
                chrm_dbid, test_start, test_end, pos, ref_seq, var_name)
            alt_dbid = merged_db.get_alt_id_or_insert(alt_seq)
            read_dbid = merged_db.get_read_id_or_insert(uuid)
            merged_db.insert_data(score, loc_dbid, alt_dbid, read_dbid)

    # indices held in memory must be materialized before closing
    if merged_db.chrm_idx_in_mem:
        merged_db.create_chrm_index()
    if merged_db.loc_idx_in_mem:
        merged_db.create_loc_index()
    if merged_db.alt_idx_in_mem:
        merged_db.create_alt_index()
    merged_db.create_data_covering_index()
    merged_db.close()
Пример #2
0
def main():
    """Export per-read sequence variant statistics to a tab-separated file.

    Reads the per-read variant database from the given results directory
    and writes one line per (read, alternate allele) score, deriving the
    reference log probability as the complement of the alternate scores.
    """
    args = get_parser().parse_args()

    vars_db = variants.VarsDb(mh.get_megalodon_fn(args.megalodon_results_dir,
                                                  mh.PR_VAR_NAME),
                              uuid_strand_index_in_memory=True)
    out_fn = (mh.get_megalodon_fn(args.megalodon_results_dir,
                                  mh.PR_VAR_TXT_NAME)
              if args.out_filename is None else args.out_filename)
    # fix: use a context manager so the output file is flushed and closed
    # (the original leaked the file handle)
    with open(out_fn, 'w') as vars_txt_fp:
        vars_txt_fp.write('\t'.join(vars_db.text_field_names) + '\n')
        for (loc_id, loc_chrm, pos, ref_seq, var_name,
             has_context_base) in tqdm(vars_db.iter_locs(),
                                       total=vars_db.get_num_uniq_var_loc(),
                                       smoothing=0):
            pr_var_stats = vars_db.get_loc_stats(
                (loc_id, loc_chrm, pos, ref_seq, var_name))
            # group scores by read, then by alternate sequence
            alt_type_stats = defaultdict(dict)
            for r_stats in pr_var_stats:
                alt_type_stats[r_stats.read_id][r_stats.alt_seq] = (
                    r_stats.score, r_stats.chrm)

            var_out_text = ''
            for read_id, r_var_stats in alt_type_stats.items():
                uuid, strand = vars_db.get_uuid_strand(read_id)
                alt_lps = np.array(list(zip(*r_var_stats.values()))[0])
                # reference log prob is the complement of all alternate
                # probabilities; suppress log(0) warnings
                with np.errstate(divide='ignore'):
                    ref_lp = np.log1p(-np.exp(alt_lps).sum())
                var_out_text += '\n'.join(
                    (('\t'.join(
                        '{}' for _ in vars_db.text_field_names)).format(
                        uuid, chrm, strand, pos, ref_lp, alt_lp, ref_seq,
                        alt_seq, var_name)
                     for alt_seq, (alt_lp, chrm) in
                     r_var_stats.items())) + '\n'
            vars_txt_fp.write(var_out_text)
Пример #3
0
def _main(args):
    """Export per-read modified base statistics to a tab-separated file.

    Writes one line per (read, modified base) call, deriving the canonical
    base log probability as the complement of the modified probabilities.
    """
    mods_db = mods.ModsDb(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME))
    mods_txt_fp = open(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_TXT_NAME)
        if args.out_filename is None else args.out_filename, 'w')
    mods_txt_fp.write('\t'.join(mods_db.text_field_names) + '\n')
    for pos_id, pos_chrm, strand, pos in tqdm(
            mods_db.iter_pos(), total=mods_db.get_num_uniq_mod_pos(),
            smoothing=0):
        pr_mod_stats = mods_db.get_pos_stats(
            (pos_id, pos_chrm, strand, pos), return_uuids=True)
        # group scores by read, then by modified base type
        mod_type_stats = defaultdict(dict)
        for r_stats in pr_mod_stats:
            mod_type_stats[r_stats.read_id][r_stats.mod_base] = (
                r_stats.score, r_stats.raw_motif, r_stats.motif_pos,
                r_stats.chrm)

        mod_out_text = ''
        for read_id, r_mod_stats in mod_type_stats.items():
            mod_lps = np.array(list(zip(*r_mod_stats.values()))[0])
            # canonical log prob is the complement of all mod probabilities;
            # suppress log(0) warnings
            with np.errstate(divide='ignore'):
                can_lp = np.log1p(-np.exp(mod_lps).sum())
            mod_out_text += '\n'.join((
                ('\t'.join('{}' for _ in mods_db.text_field_names)).format(
                    read_id, chrm, strand, pos, mod_lp,
                    can_lp, mod_base, '{}:{}'.format(raw_motif, motif_pos))
                for mod_base, (mod_lp, raw_motif, motif_pos, chrm) in
                r_mod_stats.items())) + '\n'
        mods_txt_fp.write(mod_out_text)
    # fix: close the output file (handle was previously leaked)
    mods_txt_fp.close()
def main():
    """Merge per-read modified base statistics from multiple Megalodon
    result directories into a single, newly created mods database.
    """
    args = get_parser().parse_args()

    megalodon.mkdir(args.output_megalodon_results_dir, False)
    merged_db = mods.ModsDb(
        mh.get_megalodon_fn(args.output_megalodon_results_dir, mh.PR_MOD_NAME),
        read_only=False,
        pos_index_in_memory=not args.mod_positions_on_disk)

    for in_dir in args.megalodon_results_dirs:
        # full read only mode with no indices read into memory
        in_db = mods.ModsDb(mh.get_megalodon_fn(in_dir, mh.PR_MOD_NAME),
                            read_only=True,
                            chrm_index_in_memory=False,
                            mod_index_in_memory=False,
                            uuid_index_in_memory=False)
        for data_rec in in_db.iter_data():
            (score, uuid, mod_base, motif, motif_pos, raw_motif, strand,
             pos, chrm, chrm_len) = data_rec
            chrm_dbid = merged_db.get_chrm_id_or_insert(chrm, chrm_len)
            pos_dbid = merged_db.get_pos_id_or_insert(chrm_dbid, strand, pos)
            mod_dbid = merged_db.get_mod_base_id_or_insert(
                mod_base, motif, motif_pos, raw_motif)
            read_dbid = merged_db.get_read_id_or_insert(uuid)
            merged_db.insert_data(score, pos_dbid, mod_dbid, read_dbid)

    # indices held in memory must be materialized before closing
    if merged_db.chrm_idx_in_mem:
        merged_db.create_chrm_index()
    if merged_db.pos_idx_in_mem:
        merged_db.create_pos_index()
    if merged_db.mod_idx_in_mem:
        merged_db.create_mod_index()
    merged_db.create_data_covering_index()
    merged_db.close()
def _main(args):
    """Extract modified vs. canonical log-likelihood statistics and summarize.

    Requires either --ground-truth-data or --control-megalodon-results-dir
    to define the canonical (control) sample; exits with non-zero status
    when neither is supplied.
    """
    logging.init_logger(quiet=args.quiet)

    if (args.ground_truth_data is None
            and args.control_megalodon_results_dir is None):
        LOGGER.error(
            "Must provide either --control-megalodon-results-dir or " +
            "--ground-truth-data")
        # fix: exit with a failure status; bare sys.exit() reports success (0)
        sys.exit(1)

    db_fn = mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME)
    if args.ground_truth_data is not None:
        LOGGER.info("Parsing ground truth data")
        gt_mod_pos, gt_can_pos = mh.parse_ground_truth_file(
            args.ground_truth_data, include_strand=args.strand_specific_sites)
        LOGGER.info(
            ("Loaded ground truth data with {} modified sites and {} " +
             "canonical sites.").format(len(gt_mod_pos), len(gt_can_pos)))
        LOGGER.info("Reading ground truth modified base statistics from " +
                    "database.")
        all_mod_llrs, all_can_llrs = mods.extract_stats_at_valid_sites(
            db_fn,
            [gt_mod_pos, gt_can_pos],
            include_strand=args.strand_specific_sites,
        )
    else:
        LOGGER.info("Reading ground truth modified base statistics from " +
                    "database")
        all_mod_llrs = mods.extract_all_stats(db_fn)
        LOGGER.info("Reading ground truth modified base statistics from " +
                    "canonical sample database")
        all_can_llrs = mods.extract_all_stats(
            mh.get_megalodon_fn(args.control_megalodon_results_dir,
                                mh.PR_MOD_NAME))

    # per-mod site counts in each sample for the summary table
    mod_summary = [(
        mod,
        len(all_mod_llrs[mod]) if mod in all_mod_llrs else 0,
        len(all_can_llrs[mod]) if mod in all_can_llrs else 0,
    ) for mod in set(all_mod_llrs).union(all_can_llrs)]
    LOGGER.info("Data summary:\n\tmod\tmod_N\tcan_N\n" +
                "\n".join("\t" + "\t".join(map(str, x)) for x in mod_summary))
    output_mods_data(
        all_mod_llrs,
        all_can_llrs,
        args.modified_bases_set,
        args.exclude_modified_bases,
        args.out_filename,
    )
Пример #6
0
def _get_bc_queue(bc_q, bc_conn, out_dir, bc_fmt, do_output_mods,
                  mod_long_names):
    """Writer process: drain basecalls (and optional per-read modified base
    scores) from a queue into fasta/HDF5 outputs until signaled to stop.
    """
    def write_read(read_id, r_seq, mods_scores):
        # one fasta record per read; flush so partial output survives crashes
        bc_fp.write('>{}\n{}\n'.format(read_id, r_seq))
        bc_fp.flush()
        if do_output_mods:
            try:
                mods_fp.create_dataset('Reads/' + read_id,
                                       data=mods_scores,
                                       compression="gzip")
            except RuntimeError:
                # same read_id encountered previously
                pass

    bc_fp = open(mh.get_megalodon_fn(out_dir, mh.BC_NAME) + '.' + bc_fmt, 'w')
    if do_output_mods:
        # fix: pass an explicit mode; h5py.File without a mode is deprecated
        # and raises in h5py >= 3
        mods_fp = h5py.File(mh.get_megalodon_fn(out_dir, mh.BC_MODS_NAME),
                            'w')
        mods_fp.create_group('Reads')
        mods_fp.create_dataset('mod_long_names',
                               data=np.array(mod_long_names, dtype='S'),
                               dtype=h5py.special_dtype(vlen=str))

    while True:
        try:
            # TODO add quality output to add fastq option
            read_id, r_seq, mods_scores = bc_q.get(block=False)
            write_read(read_id, r_seq, mods_scores)
        except queue.Empty:
            # no data: stop when the main process signals completion
            if bc_conn.poll():
                break
            sleep(0.1)
            continue

    # drain any reads queued after the completion signal
    while not bc_q.empty():
        read_id, r_seq, mods_scores = bc_q.get(block=False)
        write_read(read_id, r_seq, mods_scores)

    bc_fp.close()
    if do_output_mods:
        mods_fp.close()
Пример #7
0
def _main(args):
    """Export per-read modified base statistics to a tab-separated file.

    Iterates reference positions in the mods database, groups the per-read
    log probabilities at each position by read, derives a canonical-base
    log probability per read, and writes one line per (read, mod base).
    """
    mods_db = mods.ModsDb(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME),
        in_mem_dbid_to_uuid=True,
    )
    mods_txt_fp = open(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_TXT_NAME)
        if args.out_filename is None else args.out_filename,
        "w",
    )
    mods_txt_fp.write("\t".join(mods_db.text_field_names) + "\n")
    # one "{}" format slot per output column
    rec_tmplt = "\t".join("{}" for _ in mods_db.text_field_names) + "\n"
    bar = tqdm(
        desc="Processing Per-read Data",
        unit="per-read calls",
        total=mods_db.get_num_uniq_stats(),
        smoothing=0,
        dynamic_ncols=True,
    )
    for (chrm, strand,
         pos), pos_lps in mods_db.iter_pos_scores(convert_pos=True):
        bar.update(len(pos_lps))
        str_strand = mh.int_strand_to_str(strand)
        mod_out_text = ""
        # sorting pos_lps makes all records for one read adjacent; a read's
        # accumulated records are flushed when the read db-id changes
        prev_dbid = None
        mod_bs, r_lps = [], []
        for read_dbid, mod_dbid, lp in sorted(pos_lps):
            if prev_dbid != read_dbid and prev_dbid is not None:
                uuid = mods_db.get_uuid(prev_dbid)
                # compute and store log likelihood ratios
                with np.errstate(divide="ignore"):
                    can_lp = np.log1p(-np.exp(r_lps).sum())
                for mod_b, r_lp in zip(mod_bs, r_lps):
                    mod_out_text += rec_tmplt.format(uuid, chrm, str_strand,
                                                     pos, r_lp, can_lp, mod_b)
                mod_bs, r_lps = [], []
            prev_dbid = read_dbid
            mod_bs.append(mods_db.get_mod_base(mod_dbid))
            r_lps.append(lp)
        # flush the final read's accumulated records for this position
        uuid = mods_db.get_uuid(prev_dbid)
        # compute and store log likelihood ratios
        with np.errstate(divide="ignore"):
            can_lp = np.log1p(-np.exp(r_lps).sum())
        for mod_b, r_lp in zip(mod_bs, r_lps):
            mod_out_text += rec_tmplt.format(uuid, chrm, str_strand, pos, r_lp,
                                             can_lp, mod_b)
        mods_txt_fp.write(mod_out_text)
    mods_txt_fp.close()
Пример #8
0
def _get_snp_stats_queue(snp_stats_q, snp_conn, out_dir, ref_names_and_lens,
                         out_suffix, write_vcf_lp):
    """Writer process: drain aggregated SNP variants from a queue into a
    VCF file until the main process signals completion.
    """
    agg_snp_fn = mh.get_megalodon_fn(out_dir, mh.SNP_NAME)
    if out_suffix is not None:
        root, ext = os.path.splitext(agg_snp_fn)
        agg_snp_fn = root + '.' + out_suffix + ext
    agg_snp_fp = snps.VcfWriter(agg_snp_fn, 'w',
                                ref_names_and_lens=ref_names_and_lens,
                                write_vcf_lp=write_vcf_lp)

    # poll the queue until the connection signals completion
    while True:
        try:
            snp_var = snp_stats_q.get(block=False)
        except queue.Empty:
            if snp_conn.poll():
                break
            sleep(0.001)
            continue
        if snp_var is None:
            continue
        agg_snp_fp.write_variant(snp_var)

    # drain any variants still queued after the completion signal
    while not snp_stats_q.empty():
        agg_snp_fp.write_variant(snp_stats_q.get(block=False))

    agg_snp_fp.close()
Пример #9
0
def report_acc_metrics(res_dir, out_fp):
    """Write mapping accuracy metrics for one results directory.

    Returns per-read percent identity and parsimonious accuracy values, or
    (None, None) when the mapping summary file is missing.
    """
    try:
        bc_dat = pd.read_csv(
            mh.get_megalodon_fn(res_dir, mh.MAP_SUMM_NAME), sep='\t')
    except FileNotFoundError:
        if VERBOSE:
            sys.stderr.write(
                'WARNING: Mappings not found for {}\n'.format(res_dir))
        return None, None

    bc_acc = bc_dat['pct_identity']
    parsim_acc = (100 * (bc_dat['num_match'] - bc_dat['num_ins'])
                  / (bc_dat['num_align'] - bc_dat['num_ins']))
    mean_bc_acc = np.mean(bc_acc)
    med_bc_acc = np.median(bc_acc)
    # crude mode by rounding to 1 decimal
    uniq_acc, acc_counts = np.unique(np.around(bc_acc, 1),
                                     return_counts=True)
    mode_bc_acc = uniq_acc[np.argmax(acc_counts)]
    out_fp.write(
        ('Mapping metrics for {} ({} mapped reads):\n\t' +
         'Mean Pct. Identity:    {:.4f}\n\t' +
         'Median Pct. Identity:  {:.4f}\n\t' +
         'Mode Pct. Identity:    {:.1f}\n').format(
            res_dir, bc_dat.shape[0], mean_bc_acc, med_bc_acc, mode_bc_acc))
    return bc_acc, parsim_acc
Пример #10
0
def _main(args):
    """Aggregate per-read statistics and post-process variant output.

    Raises mh.MegaError when an unrecognized modified base aggregation
    method is supplied.
    """
    log_suffix = ('aggregation' if args.output_suffix is None else
                  'aggregation.' + args.output_suffix)
    logging.init_logger(args.megalodon_directory, out_suffix=log_suffix)
    LOGGER.debug('Command: """' + ' '.join(sys.argv) + '"""')

    if args.mod_aggregate_method == mh.MOD_EM_NAME:
        mod_agg_info = mods.AGG_INFO(mh.MOD_EM_NAME, None)
    elif args.mod_aggregate_method == mh.MOD_BIN_THRESH_NAME:
        mod_agg_info = mods.AGG_INFO(
            mh.MOD_BIN_THRESH_NAME, args.mod_binary_threshold)
    else:
        # fix: an unrecognized method previously fell through and caused a
        # NameError when mod_agg_info was used below
        raise mh.MegaError(
            'Invalid modified base aggregate method: {}'.format(
                args.mod_aggregate_method))
    valid_read_ids = mh.parse_read_ids(args.read_ids_filename)
    aggregate.aggregate_stats(
        args.outputs, args.megalodon_directory, args.processes,
        args.write_vcf_log_probs, args.heterozygous_factors,
        variants.HAPLIOD_MODE if args.haploid else variants.DIPLOID_MODE,
        mod_agg_info, args.write_mod_log_probs, args.mod_output_formats,
        args.suppress_progress, valid_read_ids, args.output_suffix,
        args.aggregate_batch_size)

    if mh.VAR_NAME in args.outputs:
        # sort and index the aggregated variant output for downstream tools
        LOGGER.info('Sorting output variant file')
        variant_fn = mh.add_fn_suffix(
            mh.get_megalodon_fn(args.megalodon_directory, mh.VAR_NAME),
            args.output_suffix)
        sort_variant_fn = mh.add_fn_suffix(variant_fn, 'sorted')
        variants.sort_variants(variant_fn, sort_variant_fn)
        LOGGER.info('Indexing output variant file')
        variants.index_variants(sort_variant_fn)
Пример #11
0
def report_acc_metrics(res_dir, out_fp, samp_lab):
    """Write mapping accuracy metrics for one labeled sample.

    Returns (pct_identity, parsimonious_accuracy, aligned_lengths) arrays,
    or (None, None, None) when the mapping summary file is missing.
    """
    try:
        bc_data = mapping.parse_map_summary_file(
            mh.get_megalodon_fn(res_dir, mh.MAP_SUMM_NAME))
    except FileNotFoundError:
        LOGGER.info("Mappings not found for {}".format(res_dir))
        return None, None, None

    bc_acc = np.array([r_data.pct_identity for r_data in bc_data])
    parsim_acc = np.array([
        100 * (r_data.num_match - r_data.num_ins) /
        (r_data.num_align - r_data.num_ins) for r_data in bc_data
    ])
    aligned_lens = np.array(
        [r_data.num_align - r_data.num_ins for r_data in bc_data])
    # crude mode by rounding to 1 decimal
    uniq_acc, acc_counts = np.unique(np.around(bc_acc, 1),
                                     return_counts=True)
    mode_bc_acc = uniq_acc[np.argmax(acc_counts)]
    out_fp.write(
        ACC_METRICS_TMPLT.format(
            np.median(bc_acc), np.mean(bc_acc), mode_bc_acc, len(bc_data),
            np.max(aligned_lens), np.median(aligned_lens),
            np.mean(aligned_lens), samp_lab))
    return bc_acc, parsim_acc, aligned_lens
def _main(args):
    """Validate aggregated modified base calls against a ground truth BED."""
    logging.init_logger(log_fn=args.log_filename)
    # clamp both coverage thresholds to a minimum of 1
    if args.ground_truth_cov_min < 1:
        LOGGER.warning(
            "--ground-truth-cov-min must be 1 or greater. Setting to 1.")
        args.ground_truth_cov_min = 1
    if args.nanopore_cov_min < 1:
        LOGGER.warning(
            "--nanopore-cov-min must be 1 or greater. Setting to 1.")
        args.nanopore_cov_min = 1
    LOGGER.info("Checking for consistent contig names")
    mod_db_fn = mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME)
    check_matching_attrs(args.ground_truth_bed, args.strand_offset,
                         mod_db_fn, args.mod_bases)

    LOGGER.info("Processing batches")
    process_all_batches(
        args.processes, args.batch_size, args.ground_truth_bed,
        args.out_low_coverage_sites, args.out_per_site_mod_thresholds,
        mod_db_fn, args.strand_offset, args.ground_truth_cov_min,
        args.nanopore_cov_min, args.mod_bases, args.valid_sites)
Пример #13
0
def _get_var_stats_queue(var_stats_q, var_conn, out_dir, ref_names_and_lens,
                         out_suffix, write_vcf_lp):
    """Writer process: stream aggregated variant batches from a queue into
    a VCF file until the main process signals completion.
    """
    agg_var_fn = mh.get_megalodon_fn(out_dir, mh.VAR_NAME)
    if out_suffix is not None:
        root, ext = os.path.splitext(agg_var_fn)
        agg_var_fn = root + '.' + out_suffix + ext
    agg_var_fp = variants.VcfWriter(agg_var_fn, 'w',
                                    ref_names_and_lens=ref_names_and_lens,
                                    write_vcf_lp=write_vcf_lp)

    while True:
        try:
            batch = var_stats_q.get(block=True, timeout=0.01)
        except queue.Empty:
            # no data; stop once the main process signals completion
            if var_conn.poll():
                break
            continue
        if batch is None:
            continue
        for var in batch:
            agg_var_fp.write_variant(var)

    # drain any batches still queued after the completion signal
    while not var_stats_q.empty():
        for var in var_stats_q.get(block=False):
            agg_var_fp.write_variant(var)
    agg_var_fp.close()
Пример #14
0
def parse_control_mods(args, out_fp, all_valid_sites):
    """Load the modified base control data, whichever source was provided.

    Sources are consulted in priority order: a control results directory,
    a ground truth CSV, or a modified-chromosome name prefix. A warning is
    written when none is given. Returns (ctrl_acc, ctrl_parsim_acc,
    ctrl_dat, gt_dat, mod_chrm_sw) with unused slots left as None.
    """
    ctrl_acc = ctrl_parsim_acc = ctrl_dat = gt_dat = mod_chrm_sw = None
    if args.control_megalodon_results_dir is not None:
        if VERBOSE:
            sys.stderr.write('Reading control mods data\n')
        ctrl_acc, ctrl_parsim_acc = report_acc_metrics(
            args.control_megalodon_results_dir, out_fp)
        try:
            ctrl_dat = pd.read_csv(
                mh.get_megalodon_fn(args.control_megalodon_results_dir,
                                    mh.PR_MOD_TXT_NAME), sep='\t')
        except FileNotFoundError:
            ctrl_dat = None
        else:
            if all_valid_sites is not None:
                # restrict control data to the requested valid sites
                ctrl_idx = ctrl_dat.set_index(['chrm', 'pos']).index
                ctrl_dat = ctrl_dat[ctrl_idx.isin(all_valid_sites)]
    elif args.ground_truth_data is not None:
        if VERBOSE:
            sys.stderr.write('Reading ground truth data\n')
        gt_dat = pd.read_csv(args.ground_truth_data, header=None,
                             names=['chrm', 'pos', 'is_mod'])
    elif args.mod_chrms_startswith is not None:
        mod_chrm_sw = args.mod_chrms_startswith
    else:
        sys.stderr.write(
            '*' * 20 + '  WARNING: No modified base ground truth provided.\n')

    return ctrl_acc, ctrl_parsim_acc, ctrl_dat, gt_dat, mod_chrm_sw
Пример #15
0
def _main(args):
    """Merge per-read variant databases from several result directories,
    with per-directory progress reporting, into a new output database.
    """
    mh.mkdir(args.output_megalodon_results_dir, args.overwrite)
    logging.init_logger(args.output_megalodon_results_dir)
    logger = logging.get_logger()

    logger.info('Opening new sequence variant statistics database')
    merged_db = variants.VarsDb(
        mh.get_megalodon_fn(args.output_megalodon_results_dir, mh.PR_VAR_NAME),
        read_only=False,
        loc_index_in_memory=not args.var_locations_on_disk,
        uuid_index_in_memory=True)

    for in_dir in args.megalodon_results_dirs:
        logger.info(
            'Adding sequence variant statistics from {}'.format(in_dir))
        # full read only mode with no indices read into memory
        in_db = variants.VarsDb(
            mh.get_megalodon_fn(in_dir, mh.PR_VAR_NAME),
            read_only=True,
            chrm_index_in_memory=False,
            alt_index_in_memory=False,
            uuid_index_in_memory=False)
        bar = tqdm(desc=in_dir, total=in_db.get_num_uniq_stats(),
                   smoothing=0, dynamic_ncols=True)
        for (score, uuid, strand, alt_seq, ref_seq, pos, var_name, test_end,
             test_start, chrm, chrm_len) in in_db.iter_data():
            chrm_dbid = merged_db.get_chrm_id_or_insert(chrm, chrm_len)
            loc_dbid = merged_db.get_loc_id_or_insert(
                chrm_dbid, test_start, test_end, pos, ref_seq, var_name)
            alt_dbid = merged_db.get_alt_id_or_insert(alt_seq)
            read_dbid = merged_db.get_read_id_or_insert(uuid)
            merged_db.insert_data(score, loc_dbid, alt_dbid, read_dbid)
            bar.update()
        bar.close()

    logger.info('Creating indices and closing database')
    if merged_db.chrm_idx_in_mem:
        merged_db.create_chrm_index()
    if merged_db.loc_idx_in_mem:
        merged_db.create_loc_index()
    if merged_db.alt_idx_in_mem:
        merged_db.create_alt_index()
    merged_db.create_data_covering_index()
    merged_db.close()
Пример #16
0
def _main(args):
    """Merge modified base databases from several result directories.

    Merging proceeds in five stages (chromosomes, mod definitions, read
    identifiers, reference positions, statistics); reads, positions and
    statistics may each run single- or multi-process.
    """
    mh.mkdir(args.output_megalodon_results_dir, args.overwrite)
    logging.init_logger(args.output_megalodon_results_dir)

    LOGGER.info('Opening new modified base statistics database')
    out_mods_db_fn = mh.get_megalodon_fn(args.output_megalodon_results_dir,
                                         mh.PR_MOD_NAME)
    out_mods_db = mods.ModsDb(
        out_mods_db_fn, read_only=False, init_db_tables=True,
        in_mem_chrm_to_dbid=True, in_mem_mod_to_dbid=True,
        in_mem_uuid_to_dbid=True, in_mem_pos_to_dbid=True,
        force_uint32_pos_to_dbid=args.force_uint32_pos_index,
        db_safety=args.database_safety)

    in_mod_db_fns = [mh.get_megalodon_fn(mega_dir, mh.PR_MOD_NAME)
                     for mega_dir in args.megalodon_results_dirs]

    # fix: corrected "chrmosomes" typo in the user-facing log message
    LOGGER.info(
        'Merging will proceed in five stages:\n\t1) chromosomes\n\t2) ' +
        'modified base definitions\n\t3) read identifiers\n\t4) reference ' +
        'positions\n\t5) modified base statistics')
    insert_chrms(in_mod_db_fns, out_mods_db)
    insert_mods(in_mod_db_fns, out_mods_db)
    if args.single_process:
        insert_reads(in_mod_db_fns, out_mods_db)
    else:
        insert_reads_mp(in_mod_db_fns, out_mods_db, args.data_batch_size)
    if args.single_process:
        insert_pos(in_mod_db_fns, out_mods_db, args.data_batch_size)
    else:
        insert_pos_mp(in_mod_db_fns, out_mods_db, args.data_batch_size)
    # commit id tables before the (potentially multi-process) data stage
    out_mods_db.db.commit()
    if args.single_process:
        insert_data(in_mod_db_fns, out_mods_db, args.data_batch_size)
    else:
        insert_data_mp(
            in_mod_db_fns, out_mods_db, out_mods_db_fn, args.data_batch_size,
            args.max_processes, args.force_uint32_pos_index,
            db_safety=args.database_safety)
    out_mods_db.db.commit()

    LOGGER.info(
        'Creating data covering index for efficient searching by position')
    out_mods_db.create_data_covering_index()
    out_mods_db.db.commit()
    out_mods_db.close()
def _main(args):
    """Export per-read sequence variant statistics to a tab-separated file.

    Writes one line per (read, alternate allele) score, deriving the
    reference log probability as the complement of the alternate scores.
    """
    vars_db = variants.VarsDb(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_VAR_NAME),
        uuid_strand_index_in_memory=True,
    )
    out_fn = (mh.get_megalodon_fn(args.megalodon_results_dir,
                                  mh.PR_VAR_TXT_NAME)
              if args.out_filename is None else args.out_filename)
    # fix: use a context manager so the output file is flushed and closed
    # (the original leaked the file handle)
    with open(out_fn, "w") as vars_txt_fp:
        vars_txt_fp.write("\t".join(vars_db.text_field_names) + "\n")
        for (loc_id, loc_chrm, pos, ref_seq, var_name,
             test_start) in tqdm(vars_db.iter_locs(),
                                 total=vars_db.get_num_uniq_var_loc(),
                                 smoothing=0):
            pr_var_stats = vars_db.get_loc_stats(
                (loc_id, loc_chrm, pos, ref_seq, var_name, test_start))
            # group scores by read, then by alternate sequence
            alt_type_stats = defaultdict(dict)
            for r_stats in pr_var_stats:
                alt_type_stats[r_stats.read_id][r_stats.alt_seq] = (
                    r_stats.score,
                    r_stats.chrm,
                )

            var_out_text = ""
            for read_id, r_var_stats in alt_type_stats.items():
                uuid, strand = vars_db.get_uuid_strand(read_id)
                alt_lps = np.array(list(zip(*r_var_stats.values()))[0])
                # reference log prob is the complement of all alternate
                # probabilities; suppress log(0) warnings
                with np.errstate(divide="ignore"):
                    ref_lp = np.log1p(-np.exp(alt_lps).sum())
                var_out_text += ("\n".join(
                    (("\t".join(
                        "{}" for _ in vars_db.text_field_names)).format(
                        uuid,
                        chrm,
                        strand,
                        pos,
                        ref_lp,
                        alt_lp,
                        ref_seq,
                        alt_seq,
                        var_name,
                    ) for alt_seq, (alt_lp, chrm) in
                     r_var_stats.items())) + "\n")
            vars_txt_fp.write(var_out_text)
Пример #18
0
def post_process_mapping(out_dir, map_fmt, ref_fn):
    """Launch a daemon process that sorts and indexes the mapping output.

    Returns the started multiprocessing.Process handle.
    """
    base_fn = mh.get_megalodon_fn(out_dir, mh.MAP_NAME)
    map_p = mp.Process(
        target=mapping.sort_and_index_mapping,
        args=(base_fn + '.' + map_fmt, base_fn + '.sorted.bam', ref_fn),
        daemon=True)
    map_p.start()
    # brief pause to let the child process start up
    sleep(0.01)
    return map_p
Пример #19
0
def post_process_whatshap(out_dir, map_fmt, ref_fn):
    """Launch a daemon process sorting/indexing the whatshap mapping output.

    Returns (sorted_bam_filename, process_handle).
    """
    base_fn = mh.get_megalodon_fn(out_dir, mh.WHATSHAP_MAP_NAME)
    whatshap_map_fn = base_fn + '.' + map_fmt
    whatshap_sort_fn = base_fn + '.sorted.bam'
    whatshap_p = mp.Process(
        target=mapping.sort_and_index_mapping,
        args=(whatshap_map_fn, whatshap_sort_fn, ref_fn),
        daemon=True)
    whatshap_p.start()
    # brief pause to let the child process start up
    sleep(0.01)
    return whatshap_sort_fn, whatshap_p
def parse_mod_data(args):
    """Load the per-read modified base text output into a DataFrame.

    Exits with a non-zero status when the results directory does not
    contain the per-read modified base text file.
    """
    if VERBOSE:
        sys.stderr.write('Reading megalodon data\n')
    try:
        mod_dat = pd.read_csv(
            mh.get_megalodon_fn(args.megalodon_results_dir,
                                mh.PR_MOD_TXT_NAME), sep='\t')
    except FileNotFoundError:
        # fix: sys has no write(); the original raised AttributeError here
        # instead of printing the intended error message
        sys.stderr.write(
            'ERROR: Must provide a valid Megalodon result directory.\n')
        sys.exit(1)

    return mod_dat
Пример #21
0
def write_signal_mappings(sig_map_q, sig_map_conn, ref_out_info, aux_failed_q):
    """Getter process: drain signal mappings from a queue, apply the
    configured signal-mapping offset, and write them out via
    prepare_mapping_funcs until workers signal completion.

    Failures are reported through ``aux_failed_q`` rather than raised.
    """
    def apply_sig_map_offset(read_mapping):
        """Apply signal mapping shift to center coarse mappings to a registered
        signal based mapping.
        """
        if (
            ref_out_info.sig_map_offset is not None
            and ref_out_info.sig_map_offset != 0
        ):
            if ref_out_info.sig_map_offset > 0:
                # clip beginning of signal mapping and end of reference to
                # shift signal assignments to the left
                read_mapping[0]["Ref_to_signal"] = read_mapping[0][
                    "Ref_to_signal"
                ][ref_out_info.sig_map_offset :]
                read_mapping[0]["Reference"] = read_mapping[0]["Reference"][
                    : -ref_out_info.sig_map_offset
                ]
            else:
                # clip end of signal mapping and beginning of reference to
                # shift signal assignments to the right
                read_mapping[0]["Ref_to_signal"] = read_mapping[0][
                    "Ref_to_signal"
                ][: ref_out_info.sig_map_offset]
                read_mapping[0]["Reference"] = read_mapping[0]["Reference"][
                    -ref_out_info.sig_map_offset :
                ]
        return read_mapping

    def iter_mappings():
        """Yield offset-corrected mappings until workers finish AND the
        queue has been fully drained."""
        workers_active = True
        LOGGER.debug("GetterInitComplete")
        while workers_active or not sig_map_q.empty():
            try:
                read_mapping = sig_map_q.get(timeout=0.1)
                yield apply_sig_map_offset(read_mapping)
            except queue.Empty:
                # no data; check whether workers signaled completion
                if sig_map_conn.poll():
                    workers_active = False

    try:
        LOGGER.debug("GetterStarting")
        prepare_mapping_funcs.generate_output_from_results(
            iter_mappings(),
            mh.get_megalodon_fn(ref_out_info.out_dir, mh.SIG_MAP_NAME),
            ref_out_info.alphabet_info,
            verbose=False,
        )
        LOGGER.debug("GetterClosing")
    except Exception as e:
        # report failure to the main process instead of crashing this one
        aux_failed_q.put(
            ("SigMapProcessingError", str(e), traceback.format_exc())
        )
Пример #22
0
def open_alignment_out_file(out_dir, map_fmt, ref_names_and_lens, ref_fn):
    """Open a pysam AlignmentFile for writing in the requested format.

    Raises mh.MegaError for an unrecognized mapping format.
    """
    map_fn = mh.get_megalodon_fn(out_dir, mh.MAP_NAME) + '.' + map_fmt
    # map output format to the corresponding pysam write mode
    w_mode = {'bam': 'wb', 'cram': 'wc', 'sam': 'w'}.get(map_fmt)
    if w_mode is None:
        raise mh.MegaError('Invalid mapping output format')
    return pysam.AlignmentFile(map_fn,
                               w_mode,
                               reference_names=ref_names_and_lens[0],
                               reference_lengths=ref_names_and_lens[1],
                               reference_filename=ref_fn)
Пример #23
0
def parse_mod_data(args, out_fp):
    """Load per-read modified base data plus mapping accuracy metrics.

    Returns (mod_dat, mod_acc, parsim_acc); mod_dat is None when the
    per-read text file is absent.
    """
    if VERBOSE:
        sys.stderr.write('Reading megalodon data\n')
    mod_acc, parsim_acc = report_acc_metrics(args.megalodon_results_dir,
                                             out_fp)
    try:
        mod_dat = pd.read_csv(
            mh.get_megalodon_fn(args.megalodon_results_dir,
                                mh.PR_MOD_TXT_NAME), sep='\t')
    except FileNotFoundError:
        mod_dat = None
    return mod_dat, mod_acc, parsim_acc
Пример #24
0
def _main(args):
    """Ensure the per-read modified bases database has its covering index.

    Opens the database read-write, checks whether the data covering index
    already exists and creates it only when missing.
    """
    logging.init_logger(args.megalodon_directory, out_suffix=args.output_suffix)
    LOGGER.debug('Command: """{}"""'.format(" ".join(sys.argv)))

    db_fn = mh.get_megalodon_fn(args.megalodon_directory, mh.PR_MOD_NAME)
    db = mods.ModsDb(db_fn, read_only=False)
    try:
        db.check_data_covering_index_exists()
    except mh.MegaError:
        # index missing; build it now
        LOGGER.info("Creating modified bases database index")
        db.create_data_covering_index()
    else:
        LOGGER.info("Modified bases database index already exists")
    LOGGER.debug("Closing database")
    db.close()
Пример #25
0
def _get_mod_stats_queue(
    mod_stats_q,
    mod_conn,
    out_dir,
    mod_names,
    ref_names_and_lens,
    out_suffix,
    write_mod_lp,
    mod_output_fmts,
):
    """Consume batches of aggregated modified base sites from a queue and
    write them to the requested output formats (bedMethyl, VCF, wiggle).

    Args:
        mod_stats_q: Queue supplying batches (iterables) of mod sites.
        mod_conn: Connection polled to detect that producers have finished.
        out_dir: Output directory; base filename derived via mh helpers.
        mod_names: Modified base names passed to each writer.
        ref_names_and_lens: Reference names/lengths for the VCF header.
        out_suffix: Optional suffix appended to the output basename.
        write_mod_lp: Whether the VCF writer should emit log probabilities.
        mod_output_fmts: Collection of requested output format names.
    """
    def get_mod_site():
        # function for profiling purposes
        return mod_stats_q.get(block=True, timeout=0.01)

    def write_batch(mod_sites_batch):
        # fan one batch of sites out to every open writer
        for mod_site in mod_sites_batch:
            for agg_mod_fp in agg_mod_fps:
                agg_mod_fp.write_mod_site(mod_site)

    agg_mod_bn = mh.get_megalodon_fn(out_dir, mh.MOD_NAME)
    if out_suffix is not None:
        agg_mod_bn += "." + out_suffix
    agg_mod_fps = []
    if mh.MOD_BEDMETHYL_NAME in mod_output_fmts:
        agg_mod_fps.append(mods.ModBedMethylWriter(agg_mod_bn, mod_names, "w"))
    if mh.MOD_VCF_NAME in mod_output_fmts:
        agg_mod_fps.append(
            mods.ModVcfWriter(
                agg_mod_bn,
                mod_names,
                "w",
                ref_names_and_lens=ref_names_and_lens,
                write_mod_lp=write_mod_lp,
            ))
    if mh.MOD_WIG_NAME in mod_output_fmts:
        agg_mod_fps.append(mods.ModWigWriter(agg_mod_bn, mod_names, "w"))

    while True:
        try:
            write_batch(get_mod_site())
        except queue.Empty:
            if mod_conn.poll():
                break
            continue

    # Drain any batches left after the producers signalled completion.
    # Queue.empty() is unreliable for multiprocessing queues, so the
    # non-blocking get is also guarded against queue.Empty instead of
    # letting a race crash the getter process.
    while not mod_stats_q.empty():
        try:
            write_batch(mod_stats_q.get(block=False))
        except queue.Empty:
            break
    for agg_mod_fp in agg_mod_fps:
        agg_mod_fp.close()
Пример #26
0
 def open_alignment_out_file(self):
     """Open the mapping output file for writing.

     Returns an open ``pysam.AlignmentFile``; raises ``mh.MegaError`` when
     pysam cannot open the file (e.g. invalid CRAM reference).
     """
     map_fn = '.'.join((mh.get_megalodon_fn(self.out_dir, mh.MAP_NAME),
                        self.map_fmt))
     w_mode = get_mapping_mode(self.map_fmt)
     try:
         return pysam.AlignmentFile(
             map_fn, w_mode, reference_names=self.ref_names_and_lens[0],
             reference_lengths=self.ref_names_and_lens[1],
             reference_filename=self.cram_ref_fn)
     except ValueError:
         # pysam raises ValueError for unusable references; surface a
         # megalodon error with remediation hints instead
         LOGGER.error(
             'Failed to open alignment file for writing.\n\t\tFor CRAM ' +
             'output, if FASTA is compressed ensure it is with bgzip or ' +
             'if --reference is a minimap2 index see --cram-reference.')
         raise mh.MegaError('Reference loading error.')
Пример #27
0
def _get_mod_stats_queue(mod_stats_q, mod_conn, out_dir, mod_names,
                         ref_names_and_lens, out_suffix, write_mod_lp,
                         mod_output_fmts):
    """Consume aggregated modified base sites from a queue one at a time and
    write each to the requested output formats (bedMethyl, VCF, wiggle).

    Args:
        mod_stats_q: Queue supplying individual mod sites.
        mod_conn: Connection polled to detect that producers have finished.
        out_dir: Output directory; base filename derived via mh helpers.
        mod_names: Modified base names passed to each writer.
        ref_names_and_lens: Reference names/lengths for the VCF header.
        out_suffix: Optional suffix appended to the output basename.
        write_mod_lp: Whether the VCF writer should emit log probabilities.
        mod_output_fmts: Collection of requested output format names.
    """
    def get_mod_site():
        # function for profiling purposes
        return mod_stats_q.get(block=False)

    def do_sleep():
        # function for profiling purposes
        sleep(0.001)
        return

    def write_site(mod_site):
        # fan one site out to every open writer
        for agg_mod_fp in agg_mod_fps:
            agg_mod_fp.write_mod_site(mod_site)

    agg_mod_bn = mh.get_megalodon_fn(out_dir, mh.MOD_NAME)
    if out_suffix is not None:
        agg_mod_bn += '.' + out_suffix
    agg_mod_fps = []
    if mh.MOD_BEDMETHYL_NAME in mod_output_fmts:
        agg_mod_fps.append(mods.ModBedMethylWriter(agg_mod_bn, mod_names, 'w'))
    if mh.MOD_VCF_NAME in mod_output_fmts:
        agg_mod_fps.append(
            mods.ModVcfWriter(agg_mod_bn,
                              mod_names,
                              'w',
                              ref_names_and_lens=ref_names_and_lens,
                              write_mod_lp=write_mod_lp))
    if mh.MOD_WIG_NAME in mod_output_fmts:
        agg_mod_fps.append(mods.ModWigWriter(agg_mod_bn, mod_names, 'w'))

    while True:
        try:
            write_site(get_mod_site())
        except queue.Empty:
            if mod_conn.poll():
                break
            do_sleep()
            continue

    # Drain any sites left after the producers signalled completion.
    # Queue.empty() is unreliable for multiprocessing queues, so rely on
    # queue.Empty from the non-blocking get to terminate the drain rather
    # than letting a race between empty() and get() crash the getter.
    while True:
        try:
            write_site(mod_stats_q.get(block=False))
        except queue.Empty:
            break
    for agg_mod_fp in agg_mod_fps:
        agg_mod_fp.close()
Пример #28
0
def _main(args):
    """Merge several per-read modified base databases into one.

    Extracts the union alphabet and chromosome set from the inputs, creates
    a fresh output database, then copies read UUIDs followed by per-read
    calls, finally building the data covering index.
    """
    mh.mkdir(args.output_megalodon_results_dir, args.overwrite)
    logging.init_logger(args.output_megalodon_results_dir)

    LOGGER.info('Extracting mods and chrms from input databases')
    input_db_fns = [
        mh.get_megalodon_fn(mega_dir, mh.PR_MOD_NAME)
        for mega_dir in args.megalodon_results_dirs
    ]
    alphabet, mod_long_names = extract_mods(input_db_fns)
    ref_names_and_lens = extract_chrms(input_db_fns)

    LOGGER.info('Opening new per-read modified base statistics database')
    model_info = backends.DetachedModelInfo(
        alphabet=alphabet, mod_long_names=mod_long_names)
    mods_info = mods.ModInfo(
        model_info, out_dir=args.output_megalodon_results_dir)
    mods.init_mods_db(mods_info, ref_names_and_lens)

    # load uuids in memory in main out db only in single process mode.
    # else worker threads only have to load uuid lookup tables
    merged_db = mods.ModsDb(
        mods_info.mods_db_fn, read_only=False,
        in_mem_uuid_to_dbid=args.single_process)

    LOGGER.info('Inserting read UUIDs from input databases')
    if args.single_process:
        insert_reads(input_db_fns, merged_db)
    else:
        insert_reads_mp(input_db_fns, merged_db)
    # commit so read uuids are available to worker processes
    merged_db.commit()

    LOGGER.info('Inserting per-read calls from input databases')
    if args.single_process:
        insert_data(input_db_fns, merged_db, args.data_batch_size)
    else:
        insert_data_mp(input_db_fns, merged_db, mods_info.mods_db_fn,
                       args.data_batch_size, args.max_processes)
    merged_db.commit()

    LOGGER.info(
        'Creating data covering index for efficient iteration by position')
    merged_db.create_data_covering_index()
    merged_db.commit()
    merged_db.close()
Пример #29
0
def _main(args):
    """Create a data covering index on a per-read variants database.

    NOTE(review): currently disabled — the unconditional raise below makes
    everything after it unreachable until this feature is implemented.
    """
    raise NotImplementedError(
        'Variant index creation not currently implemented.')

    # Unreachable: retained as the intended implementation once enabled.
    logging.init_logger(args.megalodon_directory,
                        out_suffix=args.output_suffix)
    LOGGER.debug('Command: """' + ' '.join(sys.argv) + '"""')

    vars_db_fn = mh.get_megalodon_fn(args.megalodon_directory, mh.PR_VAR_NAME)
    vars_db = variants.VarsDb(vars_db_fn, read_only=False)
    try:
        # only build the index when it does not already exist
        vars_db.check_data_covering_index_exists()
        LOGGER.info('Variants database index already exists')
    except mh.MegaError:
        LOGGER.info('Creating variants database index')
        vars_db.create_data_covering_index()
    LOGGER.debug('Closing database')
    vars_db.close()
Пример #30
0
def _main(args):
    """Split a per-read modified base database into per-motif databases.

    For each motif given on the command line, creates a new results
    directory and database, copies over the read UUID table, distributes
    per-read calls by motif, then builds the data covering indices.
    """
    logging.init_logger(args.megalodon_directory,
                        out_suffix=args.output_suffix)

    # parse motifs
    motifs = parse_motifs(args.motif)
    # open indexed FASTA reference
    ref = pysam.FastaFile(args.reference)

    LOGGER.info('Extracting mods and chrms from input database')
    in_mods_db = mods.ModsDb(
        mh.get_megalodon_fn(args.megalodon_directory, mh.PR_MOD_NAME))
    alphabet, _, mod_long_names = in_mods_db.get_alphabet_info()
    # drop the leading chrm db id field; keep (names, lengths)
    ref_names_and_lens = list(zip(*in_mods_db.iter_chrms()))[1:]
    LOGGER.info('Extracting read uuid table')
    in_uuids = [uuid for _, uuid in in_mods_db.iter_uuids()]

    LOGGER.info('Opening new per-read modified base statistics databases')
    model_info = backends.DetachedModelInfo(alphabet=alphabet,
                                            mod_long_names=mod_long_names)
    out_mods_dbs = []
    for motif_info in motifs:
        # one output directory per motif, named after the raw motif and
        # the number of bases before the modified position
        out_dir = '{}.{}_{}'.format(args.output_prefix, motif_info.raw_motif,
                                    motif_info.bases_before)
        mh.mkdir(out_dir, overwrite=False)
        mods_info = mods.ModInfo(model_info, out_dir=out_dir)
        mods.init_mods_db(mods_info, ref_names_and_lens)
        out_mods_dbs.append((mods.ModsDb(mods_info.mods_db_fn,
                                         read_only=False), motif_info))
        out_mods_dbs[-1][0].insert_uuids(in_uuids)
        out_mods_dbs[-1][0].commit()

    # commit so read uuids are available to worker processes
    LOGGER.info('Inserting per-read calls from input databases')
    split_data(in_mods_db, out_mods_dbs, ref)

    # TOOD do this in separate processes
    LOGGER.info(
        'Creating data covering indices for efficient iteration by position')
    for out_mods_db, _ in out_mods_dbs:
        out_mods_db.create_data_covering_index()
        out_mods_db.commit()
        out_mods_db.close()
        LOGGER.info('Finished indexing {}'.format(out_mods_db.fn))