예제 #1
0
def write_all_wiggles(f5_dirs1, f5_dirs2, corrected_group, basecall_subgroups,
                      stats_fn, wig_base, wig_types):
    """Write every requested wiggle track for one or two groups of reads.

    Args:
        f5_dirs1: FAST5 base directories for the first group of reads, or
            None when only statistics tracks are requested.
        f5_dirs2: FAST5 base directories for an optional second group of
            reads, or None.
        corrected_group: forwarded unchanged to ``th.parse_fast5s``.
        basecall_subgroups: forwarded unchanged to ``th.parse_fast5s``.
        stats_fn: statistics file; only parsed when 'stat', 'mt_stat' or
            'fraction' is requested.
        wig_base: forwarded unchanged to every wiggle writer.
        wig_types: container of requested track names. The names checked
            here are 'coverage', 'signal', 'signal_sd', 'length',
            'difference', 'stat', 'mt_stat' and 'fraction'.

    Exits the process with status 1 (after printing an error to stderr) when
    FAST5-derived output is requested without ``f5_dirs1``, or when no reads
    are found for the first group.
    """
    wants_stats = any(stat_name in wig_types
                      for stat_name in ('stat', 'mt_stat', 'fraction'))
    wants_fast5 = any(
        wig_type in wig_types
        for wig_type in ('coverage', 'signal', 'signal_sd', 'length',
                         'difference'))

    # Everything except the statistics tracks is computed from the first
    # group of reads, so fail with a readable message up front instead of
    # hitting an unbound variable (or silently writing nothing) later on.
    if f5_dirs1 is None and (f5_dirs2 is not None or wants_fast5):
        sys.stderr.write(
            '*' * 60 + '\nERROR: FAST5 directories for the first group of ' +
            'reads are required to write coverage, signal, signal_sd, ' +
            'length or difference wiggles, and whenever a second group ' +
            'of reads is given.\n' + '*' * 60 + '\n')
        sys.exit(1)

    if wants_stats:
        if VERBOSE:
            sys.stderr.write('Loading statistics from file.\n')
        all_stats, stat_type = ts.parse_stats(stats_fn)

    if f5_dirs1 is not None:
        raw_read_coverage1 = th.parse_fast5s(f5_dirs1, corrected_group,
                                             basecall_subgroups)
        if len(raw_read_coverage1) == 0:
            sys.stderr.write(
                '*' * 60 + '\nERROR: No reads present in --fast5-basedirs.\n' +
                '*' * 60 + '\n')
            # non-zero status so calling scripts can detect the failure
            sys.exit(1)

        # the first group is only labelled when there is a second group to
        # distinguish it from
        group1_name = '' if f5_dirs2 is None else GROUP1_NAME

        if f5_dirs2 is None:
            chrm_sizes = th.get_chrm_sizes(raw_read_coverage1)
            if VERBOSE:
                sys.stderr.write('Writing wiggles.\n')
            if 'signal' in wig_types:
                write_signal_and_diff_wigs(
                    raw_read_coverage1, None, chrm_sizes, wig_base,
                    group1_name, True, False)
        else:
            raw_read_coverage2 = th.parse_fast5s(f5_dirs2, corrected_group,
                                                 basecall_subgroups)
            chrm_sizes = th.get_chrm_sizes(
                raw_read_coverage1, raw_read_coverage2)

            if VERBOSE:
                sys.stderr.write('Writing wiggles.\n')
            if 'coverage' in wig_types:
                write_cov_wig(raw_read_coverage2, wig_base, GROUP2_NAME)
            if 'signal_sd' in wig_types:
                write_signal_sd_wig(raw_read_coverage2, chrm_sizes, wig_base,
                                    GROUP2_NAME)
            if 'length' in wig_types:
                write_length_wig(raw_read_coverage2, chrm_sizes, wig_base,
                                 GROUP2_NAME)

            # signal and difference are written by a single call that takes
            # both groups (unlike coverage, signal_sd and length, which are
            # written once per group)
            if 'signal' in wig_types or 'difference' in wig_types:
                write_signal_and_diff_wigs(
                    raw_read_coverage1, raw_read_coverage2, chrm_sizes,
                    wig_base, group1_name, 'signal' in wig_types,
                    'difference' in wig_types)

        # per-group tracks for the first group
        if 'coverage' in wig_types:
            write_cov_wig(raw_read_coverage1, wig_base, group1_name)
        if 'signal_sd' in wig_types:
            write_signal_sd_wig(raw_read_coverage1, chrm_sizes, wig_base,
                                group1_name)
        if 'length' in wig_types:
            write_length_wig(raw_read_coverage1, chrm_sizes, wig_base,
                             group1_name)

    if wants_stats:
        write_stat_wigs(all_stats, wig_base, 'stat' in wig_types,
                        'mt_stat' in wig_types, 'fraction' in wig_types,
                        stat_type)
예제 #2
0
def model_resquiggle(
        f5_dirs1, corr_group, bc_subgrps,
        tb_model_fn, z_trans_lag, p_value_thresh, reg_context, base_reg_context,
        max_base_shift, b_max_base_shift, min_obs_per_base, base_space_iters,
        compute_sd, new_corr_grp, num_processes, overwrite, in_place=True):
    """Run ``model_resquiggle_worker`` over all parsed reads in parallel.

    Reads are parsed from ``f5_dirs1``, grouped by the file they come from
    and queued one file at a time; ``num_processes`` worker processes then
    consume the queue. Failures reported by the workers are collected while
    the workers run.

    Args:
        f5_dirs1, corr_group, bc_subgrps, new_corr_grp: forwarded to
            ``th.parse_fast5s`` to load the reads.
        tb_model_fn: model file handed to the workers; when None a default
            is looked up via ``ts.get_default_standard_ref_from_files``.
        p_value_thresh: converted to a z-score threshold with
            ``ts.p_value_to_z_score`` before being handed to the workers.
        num_processes: number of worker processes to start.
        z_trans_lag, reg_context, base_reg_context, max_base_shift,
        b_max_base_shift, min_obs_per_base, base_space_iters, compute_sd,
        overwrite, in_place: forwarded unchanged to the workers.

    Returns:
        dict mapping each error type reported on the failure queue to the
        list of values reported with it (presumably file names -- confirm
        against the worker).
    """
    z_thresh = ts.p_value_to_z_score(p_value_thresh)
    raw_read_coverage = th.parse_fast5s(
        f5_dirs1, corr_group, bc_subgrps, new_corr_grp)

    # group reads by filename so slot is not deleted in 2D reads
    fn_grouped_reads = defaultdict(list)
    for cs_reads in raw_read_coverage.values():
        for r_data in cs_reads:
            fn_grouped_reads[r_data.fn].append(r_data)

    if tb_model_fn is None:
        # Derive the file list from the parsed reads; no other list of
        # files is available in this function.
        # NOTE(review): assumes the helper accepts a list of FAST5 file
        # names -- confirm against ts.get_default_standard_ref_from_files.
        fast5_fns = list(fn_grouped_reads)
        tb_model_fn = ts.get_default_standard_ref_from_files(fast5_fns)

    # load reads into Queue, one queue item per file
    manager = mp.Manager()
    reads_q = manager.Queue()
    failed_reads_q = manager.Queue()
    for fn_reads in fn_grouped_reads.values():
        reads_q.put(fn_reads)
    num_reads = len(fn_grouped_reads)

    mod_rsqgl_args = (
        reads_q, failed_reads_q, tb_model_fn, z_trans_lag, z_thresh,
        reg_context, base_reg_context, max_base_shift, b_max_base_shift,
        min_obs_per_base, base_space_iters, new_corr_grp, compute_sd,
        overwrite, in_place, corr_group)
    mod_rsqgl_ps = []
    for _ in range(num_processes):
        p = mp.Process(target=model_resquiggle_worker, args=mod_rsqgl_args)
        p.start()
        mod_rsqgl_ps.append(p)

    if VERBOSE:
        sys.stderr.write(
            'Correcting ' + str(num_reads) + ' files with ' +
            str(len(bc_subgrps)) + ' subgroup(s)/read(s) ' +
            'each (Will print a dot for each ' + str(PROGRESS_INTERVAL) +
            ' reads completed).\n')

    failed_reads = defaultdict(list)
    # poll the failure queue while any worker is still running; only sleep
    # when there was nothing to collect
    while any(p.is_alive() for p in mod_rsqgl_ps):
        try:
            errorType, fn = failed_reads_q.get(block=False)
            failed_reads[errorType].append(fn)
        except Queue.Empty:
            sleep(1)
    # collect anything reported between the last poll and worker exit
    while not failed_reads_q.empty():
        errorType, fn = failed_reads_q.get(block=False)
        failed_reads[errorType].append(fn)

    # print newline after read progress dots
    if VERBOSE:
        sys.stderr.write('\n')

    return dict(failed_reads)
예제 #3
0
def write_most_signif(f5_dirs, fasta_fn, num_regions, qval_thresh,
                      corrected_group, basecall_subgroups, seqs_fn, num_bases,
                      stat_order, stats_fn):
    """Write the sequences of the most significant regions to ``seqs_fn``.

    Regions are selected with ``ts.get_most_signif_regions`` from the
    statistics in ``stats_fn``. Each region's sequence is taken from the
    FASTA file when ``fasta_fn`` is given, otherwise from the reads parsed
    out of ``f5_dirs``.

    Each record is written as a header line
    ``>chrm:position:strand reg_text`` followed by the sequence, where
    ``position`` is the region start plus half of ``num_bases`` (truncated
    to an integer) and minus-strand sequences are reverse complemented.

    Args:
        f5_dirs, corrected_group, basecall_subgroups: forwarded to
            ``th.parse_fast5s``; only used when ``fasta_fn`` is None.
        fasta_fn: FASTA file to take region sequences from, or None.
        num_regions, qval_thresh, num_bases: forwarded to
            ``ts.get_most_signif_regions``.
        stat_order: regions are selected with ``fraction_order`` set to the
            negation of this flag.
        seqs_fn: path of the output file (overwritten).
        stats_fn: statistics file to load.
    """
    if VERBOSE:
        sys.stderr.write('Loading statistics from file.\n')
    all_stats, _ = ts.parse_stats(stats_fn)
    plot_intervals = ts.get_most_signif_regions(all_stats,
                                                num_bases,
                                                num_regions,
                                                qval_thresh,
                                                fraction_order=not stat_order)

    # get each regions sequence either from reads or fasta index
    if fasta_fn is None:
        raw_read_coverage = th.parse_fast5s(f5_dirs, corrected_group,
                                            basecall_subgroups)
        all_reg_data = th.get_region_sequences(plot_intervals,
                                               raw_read_coverage)
    else:
        fasta_records = th.parse_fasta(fasta_fn)
        all_reg_data = []
        num_skipped = 0
        for int_i in plot_intervals:
            if int_i.chrm not in fasta_records:
                # no sequence available for this region; it is left out of
                # the output and reported below
                num_skipped += 1
                continue
            all_reg_data.append(th.intervalData(
                int_i.reg_id, int_i.chrm, int_i.start, int_i.end,
                int_i.strand, int_i.reg_text, int_i.reads,
                fasta_records[int_i.chrm][int_i.start:int_i.end]))
        if num_skipped > 0:
            sys.stderr.write(
                'WARNING: Skipped ' + str(num_skipped) + ' region(s) whose ' +
                'chromosome is not present in the FASTA file.\n')

    if VERBOSE:
        sys.stderr.write('Outputting region sequences.\n')
    with open(seqs_fn, 'w') as seqs_fp:
        for int_i in all_reg_data:
            reg_seq = int_i.seq
            if int_i.strand == '-':
                reg_seq = th.rev_comp(reg_seq)
            seqs_fp.write('>{0}:{1:d}:{2} {3}\n{4}\n'.format(
                int_i.chrm, int(int_i.start + (num_bases / 2)), int_i.strand,
                int_i.reg_text, ''.join(reg_seq)))