def write_all_wiggles(
        f5_dirs1, f5_dirs2, corrected_group, basecall_subgroups,
        stats_fn, wig_base, wig_types):
    if any(stat_name in wig_types
           for stat_name in ['stat', 'mt_stat', 'fraction']):
        if VERBOSE: sys.stderr.write('Loading statistics from file.\n')
        all_stats, stat_type = ts.parse_stats(stats_fn)

    if f5_dirs1 is not None:
        raw_read_coverage1 = th.parse_fast5s(
            f5_dirs1, corrected_group, basecall_subgroups)
        if len(raw_read_coverage1) == 0:
            sys.stderr.write(
                '*' * 60 + '\nERROR: No reads present in --fast5-basedirs.\n' +
                '*' * 60 + '\n')
            sys.exit()

    group1_name = '' if f5_dirs2 is None else GROUP1_NAME
    if f5_dirs2 is not None:
        raw_read_coverage2 = th.parse_fast5s(
            f5_dirs2, corrected_group, basecall_subgroups)
        chrm_sizes = th.get_chrm_sizes(
            raw_read_coverage1, raw_read_coverage2)

        if VERBOSE: sys.stderr.write('Writing wiggles.\n')
        if 'coverage' in wig_types:
            write_cov_wig(raw_read_coverage2, wig_base, GROUP2_NAME)
        if 'signal_sd' in wig_types:
            write_signal_sd_wig(
                raw_read_coverage2, chrm_sizes, wig_base, GROUP2_NAME)
        if 'length' in wig_types:
            write_length_wig(
                raw_read_coverage2, chrm_sizes, wig_base, GROUP2_NAME)

        # need to do signal and difference call once either with or
        # w/o second set of files (unlike coverage, sds and length)
        if 'signal' in wig_types or 'difference' in wig_types:
            write_signal_and_diff_wigs(
                raw_read_coverage1, raw_read_coverage2, chrm_sizes,
                wig_base, group1_name, 'signal' in wig_types,
                'difference' in wig_types)
    elif f5_dirs1 is not None:
        chrm_sizes = th.get_chrm_sizes(raw_read_coverage1)
        if VERBOSE: sys.stderr.write('Writing wiggles.\n')
        if 'signal' in wig_types:
            write_signal_and_diff_wigs(
                raw_read_coverage1, None, chrm_sizes, wig_base,
                group1_name, 'signal' in wig_types, False)
        if 'coverage' in wig_types:
            write_cov_wig(raw_read_coverage1, wig_base, group1_name)
        if 'signal_sd' in wig_types:
            write_signal_sd_wig(
                raw_read_coverage1, chrm_sizes, wig_base, group1_name)
        if 'length' in wig_types:
            write_length_wig(
                raw_read_coverage1, chrm_sizes, wig_base, group1_name)

    if any(stat_name in wig_types
           for stat_name in ['stat', 'mt_stat', 'fraction']):
        write_stat_wigs(
            all_stats, wig_base, 'stat' in wig_types,
            'mt_stat' in wig_types, 'fraction' in wig_types, stat_type)

    return
def model_resquiggle(
        f5_dirs1, corr_group, bc_subgrps, tb_model_fn, z_trans_lag,
        p_value_thresh, reg_context, base_reg_context, max_base_shift,
        b_max_base_shift, min_obs_per_base, base_space_iters, compute_sd,
        new_corr_grp, num_processes, overwrite, in_place=True):
    z_thresh = ts.p_value_to_z_score(p_value_thresh)
    raw_read_coverage = th.parse_fast5s(
        f5_dirs1, corr_group, bc_subgrps, new_corr_grp)

    if tb_model_fn is None:
        # no model provided, so pick the default standard model based on the
        # FAST5 files backing the parsed reads
        fast5_fns = [r_data.fn for cs_reads in raw_read_coverage.itervalues()
                     for r_data in cs_reads]
        tb_model_fn = ts.get_default_standard_ref_from_files(fast5_fns)

    # load reads into Queue
    manager = mp.Manager()
    reads_q = manager.Queue()
    failed_reads_q = manager.Queue()

    # group reads by filename so slot is not deleted in 2D reads
    fn_grouped_reads = defaultdict(list)
    for cs_reads in raw_read_coverage.itervalues():
        for r_data in cs_reads:
            fn_grouped_reads[r_data.fn].append(r_data)
    num_reads = 0
    for fn_reads in fn_grouped_reads.itervalues():
        reads_q.put(fn_reads)
        num_reads += 1

    mod_rsqgl_args = (
        reads_q, failed_reads_q, tb_model_fn, z_trans_lag, z_thresh,
        reg_context, base_reg_context, max_base_shift, b_max_base_shift,
        min_obs_per_base, base_space_iters, new_corr_grp, compute_sd,
        overwrite, in_place, corr_group)
    mod_rsqgl_ps = []
    for p_id in xrange(num_processes):
        p = mp.Process(target=model_resquiggle_worker, args=mod_rsqgl_args)
        p.start()
        mod_rsqgl_ps.append(p)

    if VERBOSE: sys.stderr.write(
            'Correcting ' + str(num_reads) + ' files with ' +
            str(len(bc_subgrps)) + ' subgroup(s)/read(s) ' +
            'each (Will print a dot for each ' + str(PROGRESS_INTERVAL) +
            ' reads completed).\n')
    failed_reads = defaultdict(list)
    # collect failed reads while any worker process is still running
    while any(p.is_alive() for p in mod_rsqgl_ps):
        try:
            errorType, fn = failed_reads_q.get(block=False)
            failed_reads[errorType].append(fn)
        except Queue.Empty:
            sleep(1)
            continue

    # drain failures reported after the worker processes have exited
    while not failed_reads_q.empty():
        errorType, fn = failed_reads_q.get(block=False)
        failed_reads[errorType].append(fn)

    # print newline after read progress dots
    if VERBOSE: sys.stderr.write('\n')

    return dict(failed_reads)
def write_most_signif(
        f5_dirs, fasta_fn, num_regions, qval_thresh, corrected_group,
        basecall_subgroups, seqs_fn, num_bases, stat_order, stats_fn):
    if VERBOSE: sys.stderr.write('Loading statistics from file.\n')
    all_stats, stat_type = ts.parse_stats(stats_fn)
    plot_intervals = ts.get_most_signif_regions(
        all_stats, num_bases, num_regions, qval_thresh,
        fraction_order=not stat_order)

    # get each region's sequence either from reads or fasta index
    if fasta_fn is None:
        raw_read_coverage = th.parse_fast5s(
            f5_dirs, corrected_group, basecall_subgroups)
        all_reg_data = th.get_region_sequences(
            plot_intervals, raw_read_coverage)
    else:
        fasta_records = th.parse_fasta(fasta_fn)
        all_reg_data = [
            th.intervalData(
                int_i.reg_id, int_i.chrm, int_i.start, int_i.end,
                int_i.strand, int_i.reg_text, int_i.reads,
                fasta_records[int_i.chrm][int_i.start:int_i.end])
            for int_i in plot_intervals if int_i.chrm in fasta_records]

    if VERBOSE: sys.stderr.write('Outputting region sequences.\n')
    with open(seqs_fn, 'w') as seqs_fp:
        for int_i in all_reg_data:
            reg_seq = int_i.seq
            if int_i.strand == '-':
                reg_seq = th.rev_comp(reg_seq)
            seqs_fp.write('>{0}:{1:d}:{2} {3}\n{4}\n'.format(
                int_i.chrm, int(int_i.start + (num_bases / 2)),
                int_i.strand, int_i.reg_text, ''.join(reg_seq)))

    return