def write_all_wiggles(
        files1, files2, corrected_group, basecall_subgroups, obs_filter,
        test_type, min_test_vals, stats_fn, fishers_method_offset,
        wig_base, wig_types):
    """Write every wiggle track named in ``wig_types``.

    Recognised entries of ``wig_types`` are 'coverage', 'signal_sd',
    'length', 'signal', 'difference', 'pvals' and 'qvals'.

    Reads from ``files1`` are always parsed and filtered.  When ``files2``
    is given, its reads are parsed and filtered too, the per-group tracks
    (coverage, signal_sd, length) are written for it under ``GROUP2_NAME``,
    and the first group is labelled 'group1'; otherwise the first group is
    labelled with an empty string.

    Statistics for the 'pvals'/'qvals' tracks are loaded from ``stats_fn``
    when that file exists.  Otherwise they are computed with
    ``ns.get_all_significance`` from both read groups, which is only
    possible when ``files2`` is given.

    Raises:
        ValueError: if 'pvals' or 'qvals' is requested, ``stats_fn`` is not
            an existing file and ``files2`` is None.  Previously this
            combination failed with an UnboundLocalError only after all
            other tracks had been written.
    """
    stats_file_exists = stats_fn is not None and os.path.isfile(stats_fn)
    write_pvals = 'pvals' in wig_types
    write_qvals = 'qvals' in wig_types
    include_stats = write_pvals or write_qvals
    two_groups = files2 is not None

    # Fail before the expensive FAST5 parsing: without a stats file the
    # statistics can only be derived from a two-group comparison.
    if include_stats and not stats_file_exists and not two_groups:
        raise ValueError(
            'Cannot write pvals/qvals wiggles: no existing statistics file '
            'was provided and statistics can only be computed when a second '
            'set of files is given.')

    all_stats = None
    if include_stats and stats_file_exists:
        if VERBOSE:
            sys.stderr.write('Loading statistics from file.\n')
        all_stats = ns.parse_stats(stats_fn)

    if VERBOSE:
        sys.stderr.write('Parsing FAST5 files.\n')
    raw_read_coverage1 = nh.parse_fast5s(
        files1, corrected_group, basecall_subgroups)
    raw_read_coverage1 = nh.filter_reads(raw_read_coverage1, obs_filter)
    group1_name = 'group1' if two_groups else ''

    # Stays None in the single-group case, which is what the signal and
    # difference writer is handed when there is no second group.
    raw_read_coverage2 = None
    if two_groups:
        raw_read_coverage2 = nh.parse_fast5s(
            files2, corrected_group, basecall_subgroups)
        raw_read_coverage2 = nh.filter_reads(raw_read_coverage2, obs_filter)

        if include_stats and not stats_file_exists:
            if VERBOSE:
                sys.stderr.write('Calculating statistics.\n')
            all_stats = ns.get_all_significance(
                raw_read_coverage1, raw_read_coverage2, test_type,
                min_test_vals, stats_fn, fishers_method_offset)

    if VERBOSE:
        sys.stderr.write('Writing wiggles.\n')

    # Tracks that are written once per read group.
    per_group_writers = (
        ('coverage', write_cov_wig),
        ('signal_sd', write_signal_sd_wig),
        ('length', write_length_wig),
    )

    if two_groups:
        for wig_type, write_group_wig in per_group_writers:
            if wig_type in wig_types:
                write_group_wig(raw_read_coverage2, wig_base, GROUP2_NAME)

    # Signal and difference are produced by a single call whether or not a
    # second set of files exists (unlike coverage, signal_sd and length).
    write_signal = 'signal' in wig_types
    write_diff = 'difference' in wig_types
    if write_signal or write_diff:
        write_signal_and_diff_wigs(
            raw_read_coverage1, raw_read_coverage2, wig_base,
            group1_name, write_signal, write_diff)

    for wig_type, write_group_wig in per_group_writers:
        if wig_type in wig_types:
            write_group_wig(raw_read_coverage1, wig_base, group1_name)

    if include_stats:
        write_pvals_and_qvals_wig(
            all_stats, wig_base, write_pvals, write_qvals)
def write_most_signif(
        files1, files2, num_regions, qval_thresh, corrected_group,
        basecall_subgroups, seqs_fn, num_bases, test_type, obs_filter,
        min_test_vals, stats_fn, fasta_fn, fishers_method_offset):
    """Write the sequences of the most significant regions to ``seqs_fn``.

    Statistics are loaded from ``stats_fn`` when it names an existing file
    and are otherwise computed from the two read groups.  The reads are
    parsed only when they are needed: to compute statistics, or to recover
    region sequences when no ``fasta_fn`` is supplied.

    Each region is written as a FASTA-style record whose header is
    ``>chrm::start::strand stat``; minus-strand sequences are passed
    through ``nh.rev_comp`` first.  When ``fasta_fn`` is given, regions on
    a chromosome missing from the FASTA records are left out.
    """
    have_stats_file = stats_fn is not None and os.path.isfile(stats_fn)
    use_fasta = fasta_fn is not None

    if have_stats_file:
        if VERBOSE:
            sys.stderr.write('Loading statistics from file.\n')
        all_stats = ns.parse_stats(stats_fn)

    if not (have_stats_file and use_fasta):
        if VERBOSE:
            sys.stderr.write('Parsing files.\n')
        raw_read_coverage1 = nh.parse_fast5s(
            files1, corrected_group, basecall_subgroups)
        raw_read_coverage2 = nh.parse_fast5s(
            files2, corrected_group, basecall_subgroups)
        raw_read_coverage1 = nh.filter_reads(raw_read_coverage1, obs_filter)
        raw_read_coverage2 = nh.filter_reads(raw_read_coverage2, obs_filter)

    if not have_stats_file:
        if VERBOSE:
            sys.stderr.write('Calculating statistics.\n')
        all_stats = ns.get_all_significance(
            raw_read_coverage1, raw_read_coverage2, test_type,
            min_test_vals, stats_fn, fishers_method_offset)

    plot_intervals = ns.get_most_signif_regions(
        all_stats, num_bases, num_regions, qval_thresh)

    # Pair each region id with its sequence, from the FASTA when one was
    # given and from the reads otherwise.
    if use_fasta:
        fasta_records = nh.parse_fasta(fasta_fn)
        reg_seqs = []
        for region_id, (chrm, start, _strand, _reg_name) in plot_intervals:
            if chrm not in fasta_records:
                continue
            region_seq = fasta_records[chrm][start:start + num_bases]
            reg_seqs.append((region_id, region_seq))
    else:
        reg_seqs = get_region_sequences(
            plot_intervals, raw_read_coverage1, raw_read_coverage2,
            num_bases, corrected_group)

    if VERBOSE:
        sys.stderr.write('Outputting region seqeuences.\n')
    record_template = '>{0}::{1:d}::{2} {3}\n{4}\n'
    with open(seqs_fn, 'w') as seqs_fp:
        for region_id, region_seq in reg_seqs:
            # First interval whose id matches this sequence's region id.
            matching_intervals = (
                interval for interval_id, interval in plot_intervals
                if interval_id == region_id)
            chrm, start, strand, stat = next(matching_intervals)
            if strand == '-':
                region_seq = nh.rev_comp(region_seq)
            seqs_fp.write(record_template.format(
                chrm, start, strand, stat, ''.join(region_seq)))
    return
def write_wiggle(files, corrected_group, wiggle_fn, basecall_subgroups,
                 obs_filter):
    """Write per-base read coverage for ``files`` as one wiggle track.

    The reads are parsed with ``parse_fast5s`` and filtered with
    ``filter_reads``.  For each key of the resulting mapping, coverage is
    the number of reads whose ``[start, end)`` interval spans a position.
    Only positions with coverage above zero are written, as 1-based
    ``variableStep`` entries, to ``wiggle_fn``.

    Keys whose read collection is empty are skipped rather than raising
    ``ValueError`` from ``max()`` on an empty sequence.
    """
    if VERBOSE:
        sys.stderr.write('Parsing files.\n')
    raw_read_coverage = parse_fast5s(
        files, corrected_group, basecall_subgroups)
    raw_read_coverage = filter_reads(raw_read_coverage, obs_filter)

    if VERBOSE:
        sys.stderr.write('Calculating read coverage.\n')
    wiggle_cov = []
    for chrom, reads_data in raw_read_coverage.items():
        read_ends = [r_data.end for r_data in reads_data]
        if not read_ends:
            # No reads for this key: nothing to size the array from and
            # nothing to report.
            continue
        chrom_coverage = np.zeros(max(read_ends), dtype=np.int_)
        for r_data in reads_data:
            chrom_coverage[r_data.start:r_data.end] += 1
        wiggle_cov.append((chrom, chrom_coverage))

    if VERBOSE:
        sys.stderr.write('Writing wiggle.\n')
    with open(wiggle_fn, 'w') as wig_fp:
        wig_fp.write(
            'track type=wiggle_0 name={0} description={0}\n'.format(
                wiggle_fn))
        for chrm, chrm_cov in wiggle_cov:
            wig_fp.write("variableStep chrom={} span=1\n".format(chrm))
            # Pick the covered positions in NumPy instead of walking every
            # base of the chromosome in Python.
            covered_pos = np.flatnonzero(chrm_cov > 0)
            covered_depth = chrm_cov[covered_pos]
            # Array indices are 0-based; the written positions are 1-based.
            wig_fp.write('\n'.join(
                '{} {}'.format(pos + 1, depth)
                for pos, depth in zip(
                    covered_pos.tolist(), covered_depth.tolist())) + '\n')