Пример #1
0
def extract_data_worker(
        in_db_fns_q, data_q, out_mods_db_fn, batch_size, force_uint32,
        db_safety):
    """Worker process: stream per-read stats from input databases to a queue.

    Pulls input database filenames from ``in_db_fns_q`` until a ``None``
    sentinel arrives, converts each record's identifiers via the output
    database's in-memory lookup tables, and puts record batches of up to
    ``batch_size`` entries on ``data_q``.
    """
    # output database opened with every lookup table held in memory so
    # identifier conversion does not hit disk
    out_mods_db = mods.ModsDb(
        out_mods_db_fn, read_only=True,
        in_mem_chrm_to_dbid=True, in_mem_mod_to_dbid=True,
        in_mem_uuid_to_dbid=True, in_mem_pos_to_dbid=True,
        force_uint32_pos_to_dbid=force_uint32, db_safety=db_safety)
    while True:
        try:
            next_fn = in_db_fns_q.get(block=True, timeout=1)
        except queue.Empty:
            sleep(0.001)
            continue
        if next_fn is None:
            # sentinel: no more input databases for this worker
            break

        in_db = mods.ModsDb(next_fn)
        curr_batch = []
        for (score, uuid, mod_base, motif, motif_pos, raw_motif, strand,
             pos, chrm, chrm_len) in in_db.iter_data():
            rec_dbids = get_data_dbids(
                out_mods_db, chrm, strand, pos,
                (mod_base, motif, motif_pos, raw_motif), uuid)
            curr_batch.append((score, *rec_dbids))
            if len(curr_batch) >= batch_size:
                data_q.put(curr_batch)
                curr_batch = []
        if curr_batch:
            data_q.put(curr_batch)
        in_db.close()
        out_mods_db.db.commit()
    out_mods_db.close()
def main():
    """Merge per-read modified base databases into one output database.

    Copies every record from each input results directory into the
    output database, converting all foreign keys to the output
    database's identifier space, then builds the required indices.
    """
    args = get_parser().parse_args()

    megalodon.mkdir(args.output_megalodon_results_dir, False)
    out_mods_db = mods.ModsDb(
        mh.get_megalodon_fn(args.output_megalodon_results_dir, mh.PR_MOD_NAME),
        read_only=False,
        pos_index_in_memory=not args.mod_positions_on_disk)

    for mega_dir in args.megalodon_results_dirs:
        # full read only mode with no indices read into memory
        mods_db = mods.ModsDb(mh.get_megalodon_fn(mega_dir, mh.PR_MOD_NAME),
                              read_only=True,
                              chrm_index_in_memory=False,
                              mod_index_in_memory=False,
                              uuid_index_in_memory=False)
        for (score, uuid, mod_base, motif, motif_pos, raw_motif, strand, pos,
             chrm, chrm_len) in mods_db.iter_data():
            # convert every foreign key to the output database's id space
            chrm_id = out_mods_db.get_chrm_id_or_insert(chrm, chrm_len)
            pos_id = out_mods_db.get_pos_id_or_insert(chrm_id, strand, pos)
            mod_base_id = out_mods_db.get_mod_base_id_or_insert(
                mod_base, motif, motif_pos, raw_motif)
            read_id = out_mods_db.get_read_id_or_insert(uuid)
            out_mods_db.insert_data(score, pos_id, mod_base_id, read_id)
        # fix: input database handles were previously never closed
        mods_db.close()

    # create on-disk indices only for tables whose index was held in memory
    if out_mods_db.chrm_idx_in_mem:
        out_mods_db.create_chrm_index()
    if out_mods_db.pos_idx_in_mem:
        out_mods_db.create_pos_index()
    if out_mods_db.mod_idx_in_mem:
        out_mods_db.create_mod_index()
    out_mods_db.create_data_covering_index()
    out_mods_db.close()
Пример #3
0
def insert_data(in_mod_db_fns, out_mods_db, batch_size):
    """Copy per-read stats from each input database into the output.

    Records are converted to the output database's identifier space and
    written in batches of ``batch_size``, with a progress bar tracking
    batch insertions.
    """
    LOGGER.info('Inserting modified base data')
    # first pass over the inputs just counts batches for the bar total
    total_batches = 0
    for db_fn in in_mod_db_fns:
        db = mods.ModsDb(db_fn)
        total_batches += (db.get_num_uniq_stats() // batch_size) + 1
        db.close()
    bar = tqdm(desc='Data Batches', unit='Batches', total=total_batches,
               smoothing=0, dynamic_ncols=True)
    for db_fn in in_mod_db_fns:
        db = mods.ModsDb(db_fn)
        batch = []
        for score, uuid, mod_base, in_pos_dbid in db.iter_data():
            out_pos_dbid = out_mods_db.get_pos_dbid(*db.get_pos(in_pos_dbid))
            batch.append(
                (score, out_pos_dbid,
                 out_mods_db.get_mod_base_dbid(mod_base),
                 out_mods_db.get_read_dbid(uuid)))
            if len(batch) >= batch_size:
                out_mods_db.insert_batch_data(batch)
                batch = []
                bar.update()
        if batch:
            out_mods_db.insert_batch_data(batch)
            bar.update()
        db.close()
    bar.close()
Пример #4
0
def extract_data_worker(in_db_fns_q, data_conn, out_mods_db_fn, batch_size):
    """Worker process: convert and queue per-read stats from input databases.

    Consumes filenames from ``in_db_fns_q`` until a ``None`` sentinel,
    converts each record to the output database's identifier space and
    sends batches of up to ``batch_size`` records through ``data_conn``.
    """
    # load output database with uuid in-memory indices
    out_mods_db = mods.ModsDb(out_mods_db_fn, in_mem_uuid_to_dbid=True)
    while True:
        try:
            db_fn = in_db_fns_q.get(block=True, timeout=0.1)
        except queue.Empty:
            sleep(0.001)
            continue
        if db_fn is None:
            # sentinel: no more input databases
            break

        in_db = mods.ModsDb(db_fn)
        batch = []
        for score, uuid, mod_base, in_pos_dbid in in_db.iter_data():
            out_pos_dbid = out_mods_db.get_pos_dbid(
                *in_db.get_pos(in_pos_dbid))
            batch.append(
                (score, out_pos_dbid,
                 out_mods_db.get_mod_base_dbid(mod_base),
                 out_mods_db.get_read_dbid(uuid)))
            if len(batch) >= batch_size:
                data_conn.put(batch)
                batch = []
        if batch:
            data_conn.put(batch)
            batch = []
        in_db.close()
    out_mods_db.close()
Пример #5
0
def main():
    """Convert an old-schema mods database into the new schema."""
    args = get_parser().parse_args()

    src_db = sqlite3.connect(args.old_db)
    src_cur = src_db.cursor()
    dest_db = mods.ModsDb(args.new_db, read_only=False,
                          pos_index_in_memory=True)

    sys.stderr.write('Reading/loading reference record names.\n')
    fill_refs(src_cur, dest_db)

    sys.stderr.write('Reading/loading modified base scores.\n')
    fill_mods(src_cur, dest_db)

    if not DEBUG:
        # index creation is skipped entirely in debug mode
        dest_db.create_mod_index()
        start = time()
        sys.stderr.write('Creating positions index.\n')
        dest_db.create_pos_index()
        mid = time()
        sys.stderr.write('Took {} seconds.\n'.format(mid - start))
        sys.stderr.write('Creating scores position index.\n')
        dest_db.create_data_covering_index()
        sys.stderr.write('Took {} seconds.\n'.format(time() - mid))
    dest_db.close()
def _main(args):
    """Convert an old-schema mods database into the new schema."""
    logging.init_logger()
    src_db = sqlite3.connect(args.old_db)
    src_cur = src_db.cursor()
    dest_db = mods.ModsDb(args.new_db, read_only=False,
                          pos_index_in_memory=True)

    LOGGER.info('Reading/loading reference record names.')
    fill_refs(src_cur, dest_db)

    LOGGER.info('Reading/loading modified base scores.')
    fill_mods(src_cur, dest_db)

    if not DEBUG:
        # index creation is skipped entirely in debug mode
        dest_db.create_mod_index()
        start = time()
        LOGGER.info('Creating positions index.')
        dest_db.create_pos_index()
        mid = time()
        LOGGER.info('Took {} seconds.'.format(mid - start))
        LOGGER.info('Creating scores position index.')
        dest_db.create_data_covering_index()
        LOGGER.info('Took {} seconds.'.format(time() - mid))
    dest_db.close()
Пример #7
0
def insert_pos_mp(in_mod_db_fns, out_mods_db, batch_size):
    """Merge position tables from input databases using worker processes.

    One worker per input database extracts position batches onto a
    shared queue; this (main) process drains the queue and inserts the
    batches into ``out_mods_db``.
    """
    LOGGER.info('Merging pos tables using multiprocessing')
    total_batches = 0
    pos_q = mp.Queue(maxsize=QUEUE_SIZE_LIMIT)
    pos_ps = []
    for in_mod_db_fn in in_mod_db_fns:
        # count batches up front so the progress bar has a total
        mods_db = mods.ModsDb(in_mod_db_fn)
        total_batches += (mods_db.get_num_uniq_mod_pos() // batch_size) + 1
        mods_db.close()
        p = mp.Process(
            target=extract_pos_worker,
            args=(in_mod_db_fn, batch_size, pos_q), daemon=True)
        p.start()
        pos_ps.append(p)

    bar = tqdm(desc='Position Batches', total=total_batches,
               smoothing=0, dynamic_ncols=True)
    # drain the queue while any worker is still alive ...
    while any(p.is_alive() for p in pos_ps):
        try:
            pos_batch = pos_q.get(block=True, timeout=1)
        except queue.Empty:
            sleep(0.001)
            continue
        insert_pos_data(pos_batch, out_mods_db)
        bar.update()
    # ... then drain whatever the workers left behind after exiting
    while not pos_q.empty():
        pos_batch = pos_q.get(block=False)
        insert_pos_data(pos_batch, out_mods_db)
        bar.update()
    bar.close()
Пример #8
0
def insert_reads_mp(in_mod_db_fns, out_mods_db, batch_size):
    """Merge read uuid tables using one worker process per input database.

    Workers put uuid batches on a shared queue; this (main) process
    drains the queue and inserts the uuids into ``out_mods_db``.
    """
    LOGGER.info('Merging read uuid tables using multiprocessing')
    total_batches = 0
    uuids_q = mp.Queue()
    uuids_ps = []
    for in_mod_db_fn in in_mod_db_fns:
        # count batches up front so the progress bar has a total
        mods_db = mods.ModsDb(in_mod_db_fn)
        total_batches += (mods_db.get_num_uniq_reads() // batch_size) + 1
        mods_db.close()
        p = mp.Process(
            target=extract_reads_worker,
            args=(in_mod_db_fn, batch_size, uuids_q), daemon=True)
        p.start()
        uuids_ps.append(p)

    bar = tqdm(desc='UUID Batches', total=total_batches,
               smoothing=0, dynamic_ncols=True)
    # drain the queue while any worker is still alive ...
    while any(p.is_alive() for p in uuids_ps):
        try:
            uuids_batch = uuids_q.get(block=True, timeout=1)
        except queue.Empty:
            sleep(0.001)
            continue
        out_mods_db.get_read_dbids_or_insert(uuids_batch)
        bar.update()
    # ... then drain whatever the workers left behind after exiting
    while not uuids_q.empty():
        uuids_batch = uuids_q.get(block=False)
        out_mods_db.get_read_dbids_or_insert(uuids_batch)
        bar.update()
    bar.close()
Пример #9
0
def _main(args):
    """Write per-read modified base statistics to a tab-separated text file.

    Iterates unique positions, groups each position's statistics by read
    and modified base, computes the canonical log probability per read,
    and writes one line per (read, modified base) pair.
    """
    mods_db = mods.ModsDb(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME))
    mods_txt_fp = open(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_TXT_NAME)
        if args.out_filename is None else args.out_filename, 'w')
    mods_txt_fp.write('\t'.join(mods_db.text_field_names) + '\n')
    for pos_id, pos_chrm, strand, pos in tqdm(
            mods_db.iter_pos(), total=mods_db.get_num_uniq_mod_pos(),
            smoothing=0):
        pr_mod_stats = mods_db.get_pos_stats(
            (pos_id, pos_chrm, strand, pos), return_uuids=True)
        # group statistics by read, then by modified base within each read
        mod_type_stats = defaultdict(dict)
        for r_stats in pr_mod_stats:
            mod_type_stats[r_stats.read_id][r_stats.mod_base] = (
                r_stats.score, r_stats.raw_motif, r_stats.motif_pos,
                r_stats.chrm)

        mod_out_text = ''
        for read_id, r_mod_stats in mod_type_stats.items():
            mod_lps = np.array(list(zip(*r_mod_stats.values()))[0])
            with np.errstate(divide='ignore'):
                # canonical prob is one minus the summed mod probabilities
                can_lp = np.log1p(-np.exp(mod_lps).sum())
            mod_out_text += '\n'.join((
                ('\t'.join('{}' for _ in mods_db.text_field_names)).format(
                    read_id, chrm, strand, pos, mod_lp,
                    can_lp, mod_base, '{}:{}'.format(raw_motif, motif_pos))
                for mod_base, (mod_lp, raw_motif, motif_pos, chrm) in
                r_mod_stats.items())) + '\n'
        mods_txt_fp.write(mod_out_text)
    # fix: output file and database were never closed, risking unflushed
    # output being lost at interpreter teardown
    mods_txt_fp.close()
    mods_db.close()
Пример #10
0
def check_matching_attrs(ground_truth_bed, strand_offset, mod_db_fn,
                         target_mod_bases, limit=10000):
    """Validate that a ground truth BED methyl file matches a mods database.

    Checks contig/chromosome name overlap and that every target modified
    base occurs in the database, and requires the data covering index to
    exist.

    Raises:
        mh.MegaError: on zero contig overlap, an unknown target mod base,
            or a missing data covering index.
    """
    mods_db = mods.ModsDb(mod_db_fn)
    # stranded databases use +1/-1; strand-combined use a single None
    db_strands = (1, -1) if strand_offset is None else (None, )
    db_chrms = set()
    for _, chrm, _ in mods_db.iter_chrms():
        for strand in db_strands:
            db_chrms.add((chrm, strand))
    cov, mod_cov = mh.parse_bed_methyls(
        [ground_truth_bed, ], strand_offset, show_prog_bar=False, limit=limit)
    if not db_chrms.intersection(cov.keys()):
        LOGGER.error(('Using first {} sites from {}, found zero overlapping ' +
                      'contig/chromosome names with the mod database.').format(
                          limit, ground_truth_bed))
        LOGGER.info('Database contigs/chromosomes: {}'.format(', '.join(
            map(str, db_chrms))))
        LOGGER.info('BED methyl contigs/chromosomes: {}'.format(', '.join(
            map(str, list(cov.keys())))))
        raise mh.MegaError('No overlapping contigs found.')
    db_mods = {mod_base for mod_base, _ in mods_db.get_mod_long_names()}
    for tmb in target_mod_bases:
        if tmb not in db_mods:
            raise mh.MegaError(
                ('Target modified base, {}, not found in mods database ' +
                 '({}).').format(tmb, ', '.join(db_mods)))
    mods_db.check_data_covering_index_exists()
    mods_db.close()
def _main(args):
    raise NotImplementedError(
        'The previous version of this script updated version 0 to ' +
        'version 1. Updgreade to version 2 not yet implemented.')
    logging.init_logger()
    old_db = sqlite3.connect(args.old_db)
    old_cur = old_db.cursor()
    new_db = mods.ModsDb(args.new_db, read_only=False)

    LOGGER.info('Reading/loading reference record names.')
    fill_refs(old_cur, new_db)

    LOGGER.info('Reading/loading modified base scores.')
    fill_mods(old_cur, new_db)

    if not DEBUG:
        new_db.create_mod_index()
        t0 = time()
        LOGGER.info('Creating positions index.')
        new_db.create_pos_index()
        t1 = time()
        LOGGER.info('Took {} seconds.'.format(t1 - t0))
        LOGGER.info('Creating scores position index.')
        new_db.create_data_covering_index()
        LOGGER.info('Took {} seconds.'.format(time() - t1))
    new_db.close()
Пример #12
0
def _main(args):
    logging.init_logger(args.megalodon_directory,
                        out_suffix=args.output_suffix)

    # parse motifs
    motifs = parse_motifs(args.motif)
    # open indexed FASTA reference
    ref = pysam.FastaFile(args.reference)

    LOGGER.info('Extracting mods and chrms from input database')
    in_mods_db = mods.ModsDb(
        mh.get_megalodon_fn(args.megalodon_directory, mh.PR_MOD_NAME))
    alphabet, _, mod_long_names = in_mods_db.get_alphabet_info()
    ref_names_and_lens = list(zip(*in_mods_db.iter_chrms()))[1:]
    LOGGER.info('Extracting read uuid table')
    in_uuids = [uuid for _, uuid in in_mods_db.iter_uuids()]

    LOGGER.info('Opening new per-read modified base statistics databases')
    model_info = backends.DetachedModelInfo(alphabet=alphabet,
                                            mod_long_names=mod_long_names)
    out_mods_dbs = []
    for motif_info in motifs:
        out_dir = '{}.{}_{}'.format(args.output_prefix, motif_info.raw_motif,
                                    motif_info.bases_before)
        mh.mkdir(out_dir, overwrite=False)
        mods_info = mods.ModInfo(model_info, out_dir=out_dir)
        mods.init_mods_db(mods_info, ref_names_and_lens)
        out_mods_dbs.append((mods.ModsDb(mods_info.mods_db_fn,
                                         read_only=False), motif_info))
        out_mods_dbs[-1][0].insert_uuids(in_uuids)
        out_mods_dbs[-1][0].commit()

    # commit so read uuids are available to worker processes
    LOGGER.info('Inserting per-read calls from input databases')
    split_data(in_mods_db, out_mods_dbs, ref)

    # TOOD do this in separate processes
    LOGGER.info(
        'Creating data covering indices for efficient iteration by position')
    for out_mods_db, _ in out_mods_dbs:
        out_mods_db.create_data_covering_index()
        out_mods_db.commit()
        out_mods_db.close()
        LOGGER.info('Finished indexing {}'.format(out_mods_db.fn))
Пример #13
0
def insert_reads(in_mod_db_fns, out_mods_db):
    """Merge read uuid tables from each input database into the output."""
    LOGGER.info('Merging read uuid tables')
    for db_fn in in_mod_db_fns:
        in_db = mods.ModsDb(db_fn)
        bar = tqdm(desc=db_fn, total=in_db.get_num_uniq_reads(),
                   smoothing=0, dynamic_ncols=True)
        for _, uuid in in_db.iter_uuids():
            out_mods_db.get_read_dbid_or_insert(uuid)
            bar.update()
        in_db.close()
        bar.close()
Пример #14
0
def extract_reads_worker(in_mod_db_fn, batch_size, uuids_q):
    """Worker process: stream read uuids from one database onto a queue.

    UUIDs are queued in batches of ``batch_size``; any non-empty
    remainder is flushed at the end.
    """
    mods_db = mods.ModsDb(in_mod_db_fn)
    uuids_batch = []
    for read_dbid, uuid in mods_db.iter_uuids():
        uuids_batch.append(uuid)
        if len(uuids_batch) >= batch_size:
            uuids_q.put(uuids_batch)
            uuids_batch = []
    # fix: the original `>= 0` test was always true and queued an empty
    # batch whenever the uuid count was an exact multiple of batch_size;
    # sibling workers flush with `> 0`
    if len(uuids_batch) > 0:
        uuids_q.put(uuids_batch)
    mods_db.close()
Пример #15
0
def insert_reads(in_mod_db_fns, out_mods_db):
    """Merge read uuid tables from all input databases into the output.

    Collects the union of uuids across every input, then performs a
    single bulk insert into ``out_mods_db``.
    """
    LOGGER.info('Merging read uuid tables')
    all_uuids = set()
    prog = tqdm(in_mod_db_fns, desc='Databases', unit='DBs', smoothing=0,
                dynamic_ncols=True)
    for db_fn in prog:
        db = mods.ModsDb(db_fn)
        for _, uuid in db.iter_uuids():
            all_uuids.add(uuid)
        db.close()
    out_mods_db.insert_uuids(all_uuids)
Пример #16
0
def extract_mods(in_mod_db_fns):
    """Collect the merged alphabet and mod long names from input databases.

    Verifies that each modified base maps to a single (canonical base,
    long name) pair across all databases and that long names are unique,
    then assembles the merged alphabet ordered by canonical base with
    each canonical base followed by its modified bases.

    Returns:
        Tuple of (alphabet string, list of mod long names in alphabet
        order).

    Raises:
        mh.MegaError: on conflicting definitions, duplicated long names,
            or canonical bases incompatible with every valid alphabet.
    """
    LOGGER.info('Merging mod tables')
    # collect modified base definitions from input databases
    mod_base_to_can = dict()
    for in_mod_db_fn in tqdm(in_mod_db_fns, desc='Databases', unit='DBs',
                             smoothing=0, dynamic_ncols=True):
        mods_db = mods.ModsDb(in_mod_db_fn)
        for mod_base, can_base, mln in mods_db.get_full_mod_data():
            if mod_base in mod_base_to_can and \
               (can_base, mln) != mod_base_to_can[mod_base]:
                # fix: error message typo ("mutliple" -> "multiple")
                raise mh.MegaError(
                    'Modified base associated with multiple canonical bases ' +
                    'or long names in different databases. {} != {}'.format(
                        str((can_base, mln)), str(mod_base_to_can[mod_base])))
            mod_base_to_can[mod_base] = (can_base, mln)
        # fix: input databases were previously never closed
        mods_db.close()
    # check that mod long names are unique
    mlns = [mln for _, mln in mod_base_to_can.values()]
    if len(mlns) != len(set(mlns)):
        raise mh.MegaError(
            'Modified base long name assigned to more than one modified ' +
            'base single letter code.')

    # extract canonical bases associated with modified base
    can_bases = set(can_base for can_base, _ in mod_base_to_can.values())
    # determine first valid canonical alphabet compatible with database
    can_alphabet = None
    for v_alphabet in mh.VALID_ALPHABETS:
        if len(can_bases.difference(v_alphabet)) == 0:
            can_alphabet = v_alphabet
            break
    if can_alphabet is None:
        LOGGER.error(
            'Mods database does not contain valid canonical bases ({})'.format(
                ''.join(sorted(can_bases))))
        raise mh.MegaError('Invalid alphabet.')

    # compute full output alphabet and ordered modified base long names
    can_base_to_mods = dict(
        (can_base, [(mod_base, mln)
                    for mod_base, (mcan_base, mln) in mod_base_to_can.items()
                    if mcan_base == can_base]) for can_base in can_alphabet)
    alphabet = ''
    mod_long_names = []
    for can_base in can_alphabet:
        alphabet += can_base
        for mod_base, mln in can_base_to_mods[can_base]:
            alphabet += mod_base
            mod_long_names.append(mln)

    return alphabet, mod_long_names
Пример #17
0
def extract_pos_worker(in_mod_db_fn, batch_size, pos_q):
    """Worker process: stream (chrm, strand) -> position batches to a queue.

    Positions are grouped per (chromosome name, strand) and queued in
    batches of ``batch_size``; any non-empty remainder is flushed.
    """
    mods_db = mods.ModsDb(in_mod_db_fn)
    pos_batch = init_pos_dict(mods_db)
    num_pos = 0
    for _, chrm_dbid, strand, pos in mods_db.iter_pos():
        pos_batch[(mods_db.get_chrm(chrm_dbid)[0], strand)].append(pos)
        num_pos += 1
        if num_pos >= batch_size:
            pos_q.put(pos_batch)
            pos_batch = init_pos_dict(mods_db)
            num_pos = 0
    # fix: the original `>= 0` test was always true and queued an empty
    # batch whenever the position count was an exact multiple of
    # batch_size; the single-process variant flushes with `> 0`
    if num_pos > 0:
        pos_q.put(pos_batch)
    mods_db.close()
Пример #18
0
def _main(args):
    """Ensure the mods database data covering index exists, creating it
    when absent."""
    logging.init_logger(args.megalodon_directory, out_suffix=args.output_suffix)
    LOGGER.debug('Command: """' + " ".join(sys.argv) + '"""')

    db_path = mh.get_megalodon_fn(args.megalodon_directory, mh.PR_MOD_NAME)
    db = mods.ModsDb(db_path, read_only=False)
    try:
        db.check_data_covering_index_exists()
    except mh.MegaError:
        # index check failed, so build it now
        LOGGER.info("Creating modified bases database index")
        db.create_data_covering_index()
    else:
        LOGGER.info("Modified bases database index already exists")
    LOGGER.debug("Closing database")
    db.close()
Пример #19
0
def _main(args):
    """Export per-read modified base statistics to a tab-separated file.

    Iterates database records grouped by reference position; within each
    position, scores are grouped by read so the canonical log
    probability can be computed from all mod scores of that read.
    """
    mods_db = mods.ModsDb(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME),
        in_mem_dbid_to_uuid=True,
    )
    mods_txt_fp = open(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_TXT_NAME)
        if args.out_filename is None else args.out_filename,
        "w",
    )
    mods_txt_fp.write("\t".join(mods_db.text_field_names) + "\n")
    # one "{}" slot per output column
    rec_tmplt = "\t".join("{}" for _ in mods_db.text_field_names) + "\n"
    bar = tqdm(
        desc="Processing Per-read Data",
        unit="per-read calls",
        total=mods_db.get_num_uniq_stats(),
        smoothing=0,
        dynamic_ncols=True,
    )
    for (chrm, strand,
         pos), pos_lps in mods_db.iter_pos_scores(convert_pos=True):
        bar.update(len(pos_lps))
        str_strand = mh.int_strand_to_str(strand)
        mod_out_text = ""
        prev_dbid = None
        mod_bs, r_lps = [], []
        # sorting groups entries of the same read together so per-read
        # scores accumulate and are flushed when the read id changes
        for read_dbid, mod_dbid, lp in sorted(pos_lps):
            if prev_dbid != read_dbid and prev_dbid is not None:
                uuid = mods_db.get_uuid(prev_dbid)
                # compute and store log likelihood ratios
                with np.errstate(divide="ignore"):
                    can_lp = np.log1p(-np.exp(r_lps).sum())
                for mod_b, r_lp in zip(mod_bs, r_lps):
                    mod_out_text += rec_tmplt.format(uuid, chrm, str_strand,
                                                     pos, r_lp, can_lp, mod_b)
                mod_bs, r_lps = [], []
            prev_dbid = read_dbid
            mod_bs.append(mods_db.get_mod_base(mod_dbid))
            r_lps.append(lp)
        # flush the final read at this position
        # NOTE(review): assumes pos_lps is non-empty — prev_dbid would be
        # None here for an empty position; confirm iter_pos_scores never
        # yields empty score lists
        uuid = mods_db.get_uuid(prev_dbid)
        # compute and store log likelihood ratios
        with np.errstate(divide="ignore"):
            can_lp = np.log1p(-np.exp(r_lps).sum())
        for mod_b, r_lp in zip(mod_bs, r_lps):
            mod_out_text += rec_tmplt.format(uuid, chrm, str_strand, pos, r_lp,
                                             can_lp, mod_b)
        mods_txt_fp.write(mod_out_text)
    mods_txt_fp.close()
Пример #20
0
def insert_mods(in_mod_db_fns, out_mods_db):
    """Merge modified base definition tables into the output database.

    Also collects the union of mod long names across inputs and bulk
    inserts them at the end.
    """
    LOGGER.info('Merging mod tables')
    all_mod_long_names = set()
    for db_fn in in_mod_db_fns:
        db = mods.ModsDb(db_fn)
        all_mod_long_names.update(db.get_mod_long_names())
        bar = tqdm(desc=db_fn, total=db.get_num_uniq_mods(),
                   smoothing=0, dynamic_ncols=True)
        for mod_rec in db.iter_mod_bases():
            _, mod_base, motif, motif_pos, raw_motif = mod_rec
            out_mods_db.get_mod_base_dbid_or_insert(
                mod_base, motif, motif_pos, raw_motif)
            bar.update()
        db.close()
        bar.close()
    out_mods_db.insert_mod_long_names(list(all_mod_long_names))
Пример #21
0
def _main(args):
    """Merge per-read mod databases into a new output database.

    Merging runs in five ordered stages (chromosomes, modified base
    definitions, read identifiers, positions, statistics); commits
    between stages make earlier tables visible to worker processes.
    """
    mh.mkdir(args.output_megalodon_results_dir, args.overwrite)
    logging.init_logger(args.output_megalodon_results_dir)

    LOGGER.info('Opening new modified base statistics database')
    out_mods_db_fn = mh.get_megalodon_fn(args.output_megalodon_results_dir,
                                         mh.PR_MOD_NAME)
    # all lookup tables held in memory for fast identifier conversion
    out_mods_db = mods.ModsDb(
        out_mods_db_fn, read_only=False, init_db_tables=True,
        in_mem_chrm_to_dbid=True, in_mem_mod_to_dbid=True,
        in_mem_uuid_to_dbid=True, in_mem_pos_to_dbid=True,
        force_uint32_pos_to_dbid=args.force_uint32_pos_index,
        db_safety=args.database_safety)

    in_mod_db_fns = [mh.get_megalodon_fn(mega_dir, mh.PR_MOD_NAME)
                     for mega_dir in args.megalodon_results_dirs]

    LOGGER.info(
        'Merging will proceed in five stages:\n\t1) chrmosomes\n\t2) ' +
        'modified base definitions\n\t3) read identifiers\n\t4) reference ' +
        'positions\n\t5) modified base statistics')
    insert_chrms(in_mod_db_fns, out_mods_db)
    insert_mods(in_mod_db_fns, out_mods_db)
    # reads and positions each have single- and multi-process variants
    if args.single_process:
        insert_reads(in_mod_db_fns, out_mods_db)
    else:
        insert_reads_mp(in_mod_db_fns, out_mods_db, args.data_batch_size)
    if args.single_process:
        insert_pos(in_mod_db_fns, out_mods_db, args.data_batch_size)
    else:
        insert_pos_mp(in_mod_db_fns, out_mods_db, args.data_batch_size)
    # commit so positions are visible before the statistics stage
    out_mods_db.db.commit()
    if args.single_process:
        insert_data(in_mod_db_fns, out_mods_db, args.data_batch_size)
    else:
        insert_data_mp(
            in_mod_db_fns, out_mods_db, out_mods_db_fn, args.data_batch_size,
            args.max_processes, args.force_uint32_pos_index,
            db_safety=args.database_safety)
    out_mods_db.db.commit()

    LOGGER.info(
        'Creating data covering index for efficient searching by position')
    out_mods_db.create_data_covering_index()
    out_mods_db.db.commit()
    out_mods_db.close()
Пример #22
0
def insert_chrms(in_mod_db_fns, out_mods_db):
    """Merge chromosome tables from input databases into the output.

    Collects unique (name, length) records across all inputs, then bulk
    inserts them and builds the chromosome index.
    """
    LOGGER.info('Merging chrm tables')
    ref_names_and_lens = [[], []]
    # fix: membership was previously tested against the names list,
    # an O(n) scan per record (accidental O(n^2)); use a set instead
    seen_chrms = set()
    for in_mod_db_fn in in_mod_db_fns:
        mods_db = mods.ModsDb(in_mod_db_fn)
        bar = tqdm(desc=in_mod_db_fn, total=mods_db.get_num_uniq_chrms(),
                   smoothing=0, dynamic_ncols=True)
        for _, chrm, chrm_len in mods_db.iter_chrms():
            if chrm not in seen_chrms:
                seen_chrms.add(chrm)
                ref_names_and_lens[0].append(chrm)
                ref_names_and_lens[1].append(chrm_len)
            bar.update()
        mods_db.close()
        bar.close()
    # insert chrms at the end to avoid errors for in memory position datasets
    out_mods_db.insert_chrms(ref_names_and_lens)
    out_mods_db.create_chrm_index()
Пример #23
0
def _main(args):
    """Merge per-read mod databases into a new results directory.

    Extracts the union of mod definitions and chromosomes from the
    inputs, initializes a fresh output database, merges read uuids and
    per-read calls (single- or multi-process), then builds the data
    covering index.
    """
    mh.mkdir(args.output_megalodon_results_dir, args.overwrite)
    logging.init_logger(args.output_megalodon_results_dir)

    LOGGER.info('Extracting mods and chrms from input databases')
    in_mod_db_fns = [
        mh.get_megalodon_fn(mega_dir, mh.PR_MOD_NAME)
        for mega_dir in args.megalodon_results_dirs
    ]
    alphabet, mod_long_names = extract_mods(in_mod_db_fns)
    ref_names_and_lens = extract_chrms(in_mod_db_fns)

    LOGGER.info('Opening new per-read modified base statistics database')
    model_info = backends.DetachedModelInfo(alphabet=alphabet,
                                            mod_long_names=mod_long_names)
    mods_info = mods.ModInfo(model_info,
                             out_dir=args.output_megalodon_results_dir)
    mods.init_mods_db(mods_info, ref_names_and_lens)

    # load uuids in memory in main out db only in single process mode.
    # else worker threads only have to load uuid lookup tables
    out_mods_db = mods.ModsDb(mods_info.mods_db_fn,
                              read_only=False,
                              in_mem_uuid_to_dbid=args.single_process)

    LOGGER.info('Inserting read UUIDs from input databases')
    if args.single_process:
        insert_reads(in_mod_db_fns, out_mods_db)
    else:
        insert_reads_mp(in_mod_db_fns, out_mods_db)
    # commit so read uuids are available to worker processes
    out_mods_db.commit()
    LOGGER.info('Inserting per-read calls from input databases')
    if args.single_process:
        insert_data(in_mod_db_fns, out_mods_db, args.data_batch_size)
    else:
        insert_data_mp(in_mod_db_fns, out_mods_db, mods_info.mods_db_fn,
                       args.data_batch_size, args.max_processes)
    out_mods_db.commit()

    LOGGER.info(
        'Creating data covering index for efficient iteration by position')
    out_mods_db.create_data_covering_index()
    out_mods_db.commit()
    out_mods_db.close()
Пример #24
0
def insert_pos(in_mod_db_fns, out_mods_db, batch_size):
    """Merge position tables into the output database in batches."""
    LOGGER.info('Merging pos tables')
    for db_fn in in_mod_db_fns:
        db = mods.ModsDb(db_fn)
        batch_count = 0
        pos_batch = init_pos_dict(db)
        bar = tqdm(desc=db_fn, total=db.get_num_uniq_mod_pos(),
                   smoothing=0, dynamic_ncols=True)
        for _, chrm_dbid, strand, pos in db.iter_pos():
            chrm_name = db.get_chrm(chrm_dbid)[0]
            pos_batch[(chrm_name, strand)].append(pos)
            batch_count += 1
            if batch_count >= batch_size:
                insert_pos_data(pos_batch, out_mods_db)
                batch_count = 0
                pos_batch = init_pos_dict(db)
            bar.update()
        if batch_count > 0:
            insert_pos_data(pos_batch, out_mods_db)
        db.close()
        bar.close()
Пример #25
0
def main():
    """Estimate a per-read LLR threshold for calling a modified base.

    Collects per-read log likelihood ratios (canonical minus modified)
    for ``args.mod_base`` over up to ``args.num_positions`` positions,
    then reports the score threshold at the requested fraction modified
    (estimating that fraction from score percentiles when not given).
    """
    args = get_parser().parse_args()

    mods_db = mods.ModsDb(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME))
    scores = []
    num_pos = (mods_db.get_num_uniq_mod_pos()
               if args.num_positions is None else args.num_positions)
    for n_pos, (pos_id, pos_chrm, strand,
                pos) in tqdm(enumerate(mods_db.iter_pos()),
                             total=num_pos,
                             smoothing=0):
        pr_mod_stats = mods_db.get_pos_stats((pos_id, pos_chrm, strand, pos),
                                             return_uuids=True)
        # group scores by read, then by modified base within each read
        mod_type_stats = defaultdict(dict)
        for r_stats in pr_mod_stats:
            mod_type_stats[r_stats.read_id][r_stats.mod_base] = r_stats.score
        for r_mod_stats in mod_type_stats.values():
            mod_lps = np.array(list(r_mod_stats.values()))
            with np.errstate(divide='ignore'):
                # canonical prob is one minus the summed mod probabilities
                can_lp = np.log1p(-np.exp(mod_lps).sum())
            for mod_base, mod_lp in r_mod_stats.items():
                if mod_base != args.mod_base:
                    continue
                scores.append(can_lp - mod_lp)

        # fix: n_pos is 0-based, so the original `n_pos >= num_positions`
        # test processed one extra position before breaking
        if args.num_positions is not None and n_pos + 1 >= args.num_positions:
            break

    scores = np.array(scores)
    frac_mod = args.fraction_modified
    if frac_mod is None:
        # estimate fraction modified from symmetric score percentiles
        thresh_vals = np.percentile(
            scores, (args.mod_percentile, 100 - args.mod_percentile))
        thresh_val = np.abs(thresh_vals).min()
        n_can = np.greater_equal(scores, thresh_val).sum()
        n_mod = np.less_equal(scores, -thresh_val).sum()
        frac_mod = n_mod / (n_mod + n_can)
        print('Fraction mod: {}'.format(frac_mod))
    llr_thresh = np.percentile(scores, frac_mod * 100)
    print('Threshold: {}'.format(llr_thresh))
Пример #26
0
def insert_data(in_mod_db_fns, out_mods_db, batch_size):
    """Copy modified base statistics from each input database into the
    output database in batches of ``batch_size``."""
    LOGGER.info('Inserting modified base data')
    for db_fn in in_mod_db_fns:
        db = mods.ModsDb(db_fn)
        bar = tqdm(desc=db_fn, total=db.get_num_uniq_stats(), smoothing=0,
                   dynamic_ncols=True)
        batch = []
        for (score, uuid, mod_base, motif, motif_pos, raw_motif, strand,
             pos, chrm, chrm_len) in db.iter_data():
            rec_dbids = get_data_dbids(
                out_mods_db, chrm, strand, pos,
                (mod_base, motif, motif_pos, raw_motif), uuid)
            batch.append((score, *rec_dbids))
            if len(batch) >= batch_size:
                out_mods_db.insert_read_data(batch)
                batch = []
            bar.update()
        if batch:
            out_mods_db.insert_read_data(batch)
        db.close()
        bar.close()
Пример #27
0
def insert_data_mp(
        in_mod_db_fns, out_mods_db, out_mods_db_fn, batch_size, max_proc,
        force_uint32, db_safety):
    """Merge per-read modified base statistics using worker processes.

    Worker processes (``extract_data_worker``) pull input database
    filenames from a shared queue, convert each record to output
    database ids, and put batches on ``data_q``. This (parent) process
    drains ``data_q`` and performs all inserts, so only one handle ever
    writes to the output database.

    Args:
        in_mod_db_fns: iterable of input per-read database filenames
        out_mods_db: open writable mods database (all inserts go here)
        out_mods_db_fn: output database filename (workers open their own
            read-only handles on it to resolve ids)
        batch_size: number of statistics per insert batch
        max_proc: maximum number of worker processes
        force_uint32: forwarded to the workers' database handles
        db_safety: forwarded to the workers' database handles
    """
    LOGGER.info('Merging modified base data using multiprocessing')
    num_proc = min(max_proc, len(in_mod_db_fns))
    in_db_fns_q = mp.Queue()
    for in_mod_db_fn in in_mod_db_fns:
        in_db_fns_q.put(in_mod_db_fn)
    # one None sentinel per worker signals that no filenames remain
    for _ in range(num_proc):
        in_db_fns_q.put(None)
    data_q = mp.Queue(maxsize=QUEUE_SIZE_LIMIT)
    data_ps = []
    for _ in range(num_proc):
        p = mp.Process(
            target=extract_data_worker,
            args=(in_db_fns_q, data_q, out_mods_db_fn, batch_size,
                  force_uint32, db_safety), daemon=True)
        p.start()
        data_ps.append(p)

    # pre-compute the expected number of batches so the progress bar has
    # a meaningful total (one extra batch per DB for the partial batch)
    total_batches = 0
    for in_mod_db_fn in in_mod_db_fns:
        mods_db = mods.ModsDb(in_mod_db_fn)
        total_batches += (mods_db.get_num_uniq_stats() // batch_size) + 1
        mods_db.close()
    bar = tqdm(desc='Statistics Batches', total=total_batches,
               smoothing=0, dynamic_ncols=True)
    # drain batches while any worker is still alive
    while any(p.is_alive() for p in data_ps):
        try:
            batch_data = data_q.get(block=True, timeout=1)
        except queue.Empty:
            sleep(0.001)
            continue
        out_mods_db.insert_read_data(batch_data)
        bar.update()
    # final drain after all workers have exited.
    # NOTE(review): multiprocessing.Queue.empty() is documented as not
    # reliable; presumably safe here because all producers have exited
    # by this point — confirm no batch can still be in flight.
    while not data_q.empty():
        batch_data = data_q.get(block=False)
        out_mods_db.insert_read_data(batch_data)
        bar.update()
    bar.close()
Пример #28
0
def insert_data_mp(in_mod_db_fns, out_mods_db, out_mods_db_fn, batch_size,
                   max_proc):
    """Merge per-read modified base statistics using worker processes.

    Workers pull input database filenames from a shared queue and send
    converted batches back over a many-to-one connection; this process
    receives the batches and performs all output database inserts.
    """
    LOGGER.info('Merging modified base data using multiprocessing')
    num_proc = min(max_proc, len(in_mod_db_fns))
    in_db_fns_q = mp.Queue()
    for db_fn in in_mod_db_fns:
        in_db_fns_q.put(db_fn)
    # one None sentinel per worker marks the end of the filename queue
    for _ in range(num_proc):
        in_db_fns_q.put(None)
    data_q = mega_mp.SimplexManyToOneQueue(max_size=QUEUE_SIZE_LIMIT)
    data_ps = []
    for _ in range(num_proc):
        worker_conn = data_q.get_conn()
        worker_proc = mp.Process(
            target=extract_data_worker,
            args=(in_db_fns_q, worker_conn, out_mods_db_fn, batch_size),
            daemon=True)
        worker_proc.start()
        # drop the parent's copy of the connection so the queue sees
        # end-of-stream once the worker closes its end
        worker_conn.close()
        del worker_conn
        data_ps.append(worker_proc)

    # pre-compute batch total for the progress bar (one extra batch per
    # database to account for the final partial batch)
    total_batches = 0
    for db_fn in in_mod_db_fns:
        tmp_db = mods.ModsDb(db_fn)
        total_batches += (tmp_db.get_num_uniq_stats() // batch_size) + 1
        tmp_db.close()
    bar = tqdm(desc='Data Batches', unit='Batches', total=total_batches,
               smoothing=0, dynamic_ncols=True)

    # receive until every worker connection has been closed
    while data_q.has_valid_conns:
        for batch_data in data_q.wait_recv():
            out_mods_db.insert_batch_data(batch_data)
            bar.update()
    bar.close()
Пример #29
0
def extract_chrms(in_mod_db_fns):
    """Merge chromosome tables from all input databases.

    Args:
        in_mod_db_fns: iterable of per-read modified base database
            filenames

    Returns:
        List of two parallel lists: [chromosome names, chromosome
        lengths], in first-seen order across the input databases.

    Raises:
        mh.MegaError: if the same chromosome appears with different
            lengths in different databases.
    """
    LOGGER.info('Merging chrm tables')
    # insertion-ordered dict gives O(1) duplicate lookup; the original
    # list membership test plus .index() scan was O(n) per chromosome
    chrm_lens = {}
    for in_mod_db_fn in tqdm(in_mod_db_fns,
                             desc='Databases',
                             unit='DBs',
                             smoothing=0,
                             dynamic_ncols=True):
        mods_db = mods.ModsDb(in_mod_db_fn)
        for _, chrm, chrm_len in mods_db.iter_chrms():
            if chrm in chrm_lens:
                prev_chrm_len = chrm_lens[chrm]
                if prev_chrm_len != chrm_len:
                    raise mh.MegaError(
                        ('Chromosome lengths from databases do not agree ' +
                         'for {}: {} != {}').format(chrm, prev_chrm_len,
                                                    chrm_len))
            else:
                chrm_lens[chrm] = chrm_len
        mods_db.close()
    # preserve the original return structure: list of two parallel lists
    return [list(chrm_lens.keys()), list(chrm_lens.values())]
Пример #30
0
def _main(args):
    """Estimate an LLR threshold separating modified and canonical calls.

    Collects per-read log likelihood ratios for ``args.mod_base`` from
    the per-read modified base database, estimates the fraction of
    modified observations (unless given via ``args.fraction_modified``),
    and prints the LLR score at that percentile as the threshold.

    Raises:
        mh.MegaError: if ``args.mod_base`` is not present in the
            database.
    """
    logging.init_logger()

    LOGGER.info('Loading database position statistics')
    mods_db = mods.ModsDb(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME))
    db_mods = set(mod_base for mod_base, _ in mods_db.get_mod_long_names())
    if args.mod_base not in db_mods:
        raise mh.MegaError('Target modified base not found in mods database.')

    scores = []
    bar = tqdm(total=args.num_statistics, smoothing=0)
    for (chrm, strand,
         pos), mod_llrs in mods_db.iter_pos_scores(convert_pos=True,
                                                   compute_llrs=True):
        for mod_base, reads_llrs in mod_llrs.items():
            if mod_base != args.mod_base:
                continue
            bar.update(len(reads_llrs))
            scores.extend(reads_llrs)
        # stop once enough statistics have been collected (None = all)
        if args.num_statistics is not None and bar.n >= args.num_statistics:
            break
    # bug fix: close the progress bar before printing results
    bar.close()

    # bug fix: corrected "Esitmating" typo in the log message
    LOGGER.info('Estimating fraction of modified bases')
    scores = np.array(scores)
    frac_mod = args.fraction_modified
    if frac_mod is None:
        # classify scores outside symmetric percentile cutoffs as
        # confidently canonical (high LLR) or modified (low LLR)
        thresh_vals = np.percentile(
            scores, (args.mod_percentile, 100 - args.mod_percentile))
        thresh_val = np.abs(thresh_vals).min()
        n_can = np.greater_equal(scores, thresh_val).sum()
        n_mod = np.less_equal(scores, -thresh_val).sum()
        frac_mod = n_mod / (n_mod + n_can)
        print('Fraction mod: {}'.format(frac_mod))
    llr_thresh = np.percentile(scores, frac_mod * 100)
    print('Threshold: {}'.format(llr_thresh))