def test_004_med_mad(self):
    """Check med_mad on a 2D array when no axis argument is supplied.

    Taken over all twelve entries the median is 0.5 and, with factor=1,
    the median absolute deviation is 0.
    """
    data = np.array([
        [0.5, 0.5, 0.5, 0.5],
        [0.5, 0.5, 1.0, 1.0],
        [0.0, 0.5, 0.5, 1.0],
    ])
    location, spread = maths.med_mad(data, factor=1)
    self.assertTrue(np.allclose(location, 0.5))
    self.assertTrue(np.allclose(spread, 0))
def sample_filter_parameters(read_data, number_to_sample, chunk_len,
                             filter_mean_dwell, filter_max_dwell,
                             chunk_len_means_sequence_len=False):
    """Estimate the median and MAD of mean dwell from a sample of chunks.

    Chunks are drawn with sample_chunks() using filter parameters whose
    median and MAD are both None, the mean dwell of every chunk is
    collected, and med_mad() summarises them. Note the MAD has an
    adjustment factor so that it would give the same result as the std for
    a normal distribution.

    See docstring for sample_chunks() for the parameters.

    Returns:
        FILTER_PARAMETERS: the supplied dwell thresholds together with the
        estimated median and MAD of mean dwell.
    """
    thresholds = dict(filter_mean_dwell=filter_mean_dwell,
                      filter_max_dwell=filter_max_dwell)

    # Median / MAD are not known yet, so sample with them unset.
    sampling_params = FILTER_PARAMETERS(
        median_meandwell=None, mad_meandwell=None, **thresholds)
    sampled_chunks, _ = sample_chunks(
        read_data, number_to_sample, chunk_len, sampling_params,
        chunk_len_means_sequence_len=chunk_len_means_sequence_len)

    dwell_median, dwell_mad = med_mad(
        list(map(get_mean_dwell, sampled_chunks)))
    return FILTER_PARAMETERS(
        median_meandwell=dwell_median, mad_meandwell=dwell_mad, **thresholds)
def sample_filter_parameters(read_data, number_to_sample, chunk_len,
                             filter_mean_dwell, filter_max_dwell,
                             filter_min_pass_fraction, model_stride,
                             path_buffer, chunk_len_means_sequence_len=False):
    """Estimate the median and MAD of mean dwell from a sample of chunks.

    Chunks are drawn with sample_chunks() using filter parameters in which
    the median, MAD, model stride and path buffer are all None; the
    `mean_dwell` of each chunk is then summarised with med_mad(). Note the
    MAD has an adjustment factor so that it would give the same result as
    the std for a normal distribution.

    See FILTER_PARAMETERS docstring for details.

    Returns:
        FILTER_PARAMETERS: the supplied thresholds, `model_stride` and
        `path_buffer`, plus the estimated median and MAD of mean dwell.
    """
    thresholds = dict(
        filter_mean_dwell=filter_mean_dwell,
        filter_max_dwell=filter_max_dwell,
        filter_min_pass_fraction=filter_min_pass_fraction)

    # The sampling pass runs without median / MAD, stride or path buffer.
    sampling_params = FILTER_PARAMETERS(
        median_meandwell=None,
        mad_meandwell=None,
        model_stride=None,
        path_buffer=None,
        **thresholds)
    sampled_chunks, _ = sample_chunks(
        read_data, number_to_sample, chunk_len, sampling_params,
        chunk_len_means_sequence_len=chunk_len_means_sequence_len)

    dwell_median, dwell_mad = med_mad(
        [sampled.mean_dwell for sampled in sampled_chunks])
    return FILTER_PARAMETERS(
        median_meandwell=dwell_median,
        mad_meandwell=dwell_mad,
        model_stride=model_stride,
        path_buffer=path_buffer,
        **thresholds)
def one_read_shift_scale(read_tuple):
    """Read a signal from a fast5 file and compute med/MAD scaling values.

    Args:
        read_tuple (tuple of str and str): A filename and the read_id to
            read from it.

    Returns:
        tuple: `(read_id, shift, scale)` where shift and scale are the two
        values returned by `med_mad` for the read's current. If the signal
        cannot be obtained for any reason, a message is written to stderr
        and `(None, None, None)` is returned. If the signal has zero
        length, shift and scale are both `np.nan`.
    """
    read_filename, read_id = read_tuple

    try:
        with fast5_interface.get_fast5_file(read_filename, 'r') as f5file:
            read = f5file.get_read(read_id)
            sig = Signal(read)
    except Exception as e:
        # Deliberately best-effort: one unreadable read is reported and
        # skipped rather than aborting the whole run.
        sys.stderr.write(
            'Unable to obtain signal for {} from {}.\n{}\n'.format(
                read_id, read_filename, repr(e)))
        return (None, None, None)

    signal = sig.current
    if len(signal) > 0:
        shift, scale = med_mad(signal)
    else:
        # Note - if signal trimmed by ub, it could be of length zero by
        # this point for short reads.
        # These are taken out later in the existing code, in the new code
        # we'll take out ub trimming.
        # `np.nan` is used rather than the `np.NaN` alias, which was
        # removed in NumPy 2.0.
        shift, scale = np.nan, np.nan

    return (read_id, shift, scale)
def test_004_med_mad(self):
    """Check med_mad with no axis given, so the input is flattened.

    Over all twelve entries the median is 0.5 and, with factor=1, the
    median absolute deviation is 0.
    """
    data = np.array([
        [0.5, 0.5, 0.5, 0.5],
        [0.5, 0.5, 1.0, 1.0],
        [0.0, 0.5, 0.5, 1.0],
    ])
    location, spread = maths.med_mad(data, factor=1)
    self.assertTrue(np.allclose(location, 0.5))
    self.assertTrue(np.allclose(spread, 0))
def test_006_med_mad_over_axis1(self):
    """Check med_mad gives one median and MAD per row when axis=1."""
    data = np.array([
        [0.5, 0.5, 0.5, 0.5],
        [0.5, 0.5, 1.0, 1.0],
        [0.0, 0.5, 0.5, 1.0],
    ])
    row_medians = [0.5, 0.75, 0.5]
    row_mads = [0, 0.25, 0.25]

    location, spread = maths.med_mad(data, factor=1, axis=1)

    self.assertTrue(np.allclose(location, row_medians))
    self.assertTrue(np.allclose(spread, row_mads))
def test_005_med_mad_over_axis0(self):
    """Check med_mad gives one median and MAD per column when axis=0."""
    data = np.array([
        [0.5, 0.5, 0.5, 0.5],
        [0.5, 0.5, 1.0, 1.0],
        [0.5, 1.0, 0.5, 1.0],
    ])
    column_medians = [0.5, 0.5, 0.5, 1.0]
    column_mads = [0, 0, 0, 0]

    location, spread = maths.med_mad(data, factor=1, axis=0)

    self.assertTrue(np.allclose(location, column_medians))
    self.assertTrue(np.allclose(spread, column_mads))
def sample_filter_parameters(read_data, number_to_sample, chunk_len, args,
                             log=None, chunk_log=None,
                             chunk_len_means_sequence_len = False):
    """Estimate the median and MAD of mean dwell from a sample of chunks.

    sample_chunks() is called with get_mean_dwell as its per-chunk function
    and the values it returns are summarised with med_mad(). Note the MAD
    has an adjustment factor so that it would give the same result as the
    std for a normal distribution.

    See docstring for sample_chunks() for the parameters.

    Returns:
        Whatever med_mad() returns for the sampled mean dwells.
    """
    sampling_options = dict(
        log=log,
        chunk_log=chunk_log,
        log_accepted_chunks=True,
        chunk_len_means_sequence_len=chunk_len_means_sequence_len)
    # Only the first element of the result is needed here.
    dwell_samples, _ = sample_chunks(
        read_data, number_to_sample, chunk_len, args, get_mean_dwell,
        **sampling_options)
    return med_mad(dwell_samples)
def med_mad_norm(x, dtype='f4'):
    """ Centre and scale a numpy array by its median and MAD

    Args:
        x (:class:`ndarray`): 1D array containing values to be normalised.
        dtype (str or :class:`dtype`): dtype of returned array.

    Returns:
        :class:`ndarray`: Array of same shape as `x` and dtype `dtype`
            containing the normalised values, `(x - median) / MAD`.
    """
    centre, spread = med_mad(x)
    return ((x - centre) / spread).astype(dtype)
def one_read_shift_scale(read_tuple):
    """ Read signal from fast5 and perform medmad scaling

    Args:
        read_tuple (tuple of str and str): A filename and the read_id to
            read from it.

    Returns:
        tuple of str and float and float: read_id of the read and the
            calculated shift and scale parameters.

        If the signal is unable to be read from the file (the read_id is
        not present, for example), a message is written to stderr and
        (None, None, None) is returned.

        When a signal is read, but has zero length, the shift and scale
        returned are `np.nan`.
    """
    read_filename, read_id = read_tuple

    try:
        with fast5_interface.get_fast5_file(read_filename, 'r') as f5file:
            read = f5file.get_read(read_id)
            sig = Signal(read)
    except Exception as e:
        # Deliberately best-effort: one unreadable read is reported and
        # skipped rather than aborting the whole run.
        sys.stderr.write(
            'Unable to obtain signal for {} from {}.\n{}\n'.format(
                read_id, read_filename, repr(e)))
        return (None, None, None)

    signal = sig.current
    if len(signal) > 0:
        shift, scale = med_mad(signal)
    else:
        # Note - if signal trimmed by ub, it could be of length zero by
        # this point for short reads.
        # These are taken out later in the existing code, in the new code
        # we'll take out ub trimming.
        # `np.nan` is used rather than the `np.NaN` alias, which was
        # removed in NumPy 2.0.
        shift, scale = np.nan, np.nan

    return (read_id, shift, scale)
def med_mad_norm(x, dtype='f4'):
    """ Centre and scale a numpy array by its median and MAD

    The result is `(x - median) / MAD`, with both statistics taken from
    `med_mad(x)`, converted to `dtype`.
    """
    centre, spread = med_mad(x)
    return ((x - centre) / spread).astype(dtype)
def _matches_motif(seq, motif):
    """Return True if every base of `seq` is allowed at its motif position."""
    return all(base in allowed for base, allowed in zip(seq, motif))


def _write_metrics(args, out_fp):
    """Compute all requested metrics for `args.input`, writing to `out_fp`.

    Progress messages go to stderr; the report itself goes to `out_fp`.
    Sections are written in order: general metrics, sanity checks, motif
    metrics (only if `args.motif` is set), read metrics and chunk metrics
    (only if `args.num_chunks` is set).
    """
    sys.stderr.write('* Reading data from file\n')
    out_fp.write('*' * 10 + ' General Metrics ' + '*' * 10 + '\n')
    with MappedSignalReader(args.input) as msr:
        alphabet_info = msr.get_alphabet_information()
        out_fp.write('Alphabet: {}\n'.format(str(alphabet_info)))
        read_ids = msr.get_read_ids()
        out_fp.write('Total reads: {}\n'.format(len(read_ids)))
        if args.num_reads is not None:
            # Random subset: shuffle in place then keep the first few ids.
            np.random.shuffle(read_ids)
            read_ids = read_ids[:args.num_reads]
        # Reads are pulled into a list here, before the reader is closed.
        reads = list(msr.reads(read_ids))

    sys.stderr.write('* Computing sanity check metrics\n')
    out_fp.write('\n\n' + '*' * 10 + ' Sanity Checks ' + '*' * 10 + '\n')
    current_meds = np.array([np.median(read.get_current()) for read in reads])
    out_fp.write((
        'Median of medians of normalized signal: {:.6f} (should usually ' +
        'be close to 0)\n').format(np.median(current_meds)))
    current_mads = np.array([
        np.median(np.abs(read.get_current() - r_med)) * MAD_SD_FACTOR
        for read, r_med in zip(reads, current_meds)])
    out_fp.write((
        'Median of standardized MADs of normalized signal: {:.6f} ' +
        '(should usually be close to 1.0)\n').format(np.median(current_mads)))
    all_seqs = np.concatenate([read.Reference for read in reads])
    seq_counts = np.bincount(all_seqs)
    total_bases = np.sum(seq_counts)
    out_fp.write(
        'Global sequence composition:\n{: >11}{: >11}{: >11}\n'.format(
            'base', 'count', 'percent') +
        '\n'.join(
            '{: >11}{:11.0f}{:11.4f}'.format(*base_metrics)
            for base_metrics in zip(
                alphabet_info.alphabet, seq_counts,
                100 * seq_counts / total_bases)) + '\n')

    if args.motif is not None:
        sys.stderr.write('* Computing motif metrics (this may take a ' +
                         'while to complete)\n')
        out_fp.write('\n\n' + '*' * 10 + ' Motif Metrics ' + '*' * 10 + '\n')

        # For each motif, build one array per motif position holding the
        # indices in `collapse_alphabet` that equal any base the (possibly
        # ambiguous) single letter code expands to.
        motifs = [
            ([np.concatenate([
                np.where([m_base == a_base
                          for a_base in alphabet_info.collapse_alphabet])[0]
                for m_base in SINGLE_LETTER_CODE[m_raw_base]])
              for m_raw_base in motif], int(rel_pos))
            for motif, rel_pos in args.motif]
        motif_mod_counts = [[] for _ in range(len(motifs))]
        for read in reads:
            ref_len = read.Reference.shape[0]
            for motif_i, (motif, rel_pos) in enumerate(motifs):
                # "+ 1" so the final full-length window is also examined;
                # without it a motif ending on the last reference base was
                # never counted.
                for offset in range(ref_len - len(motif) + 1):
                    if _matches_motif(
                            read.Reference[offset:offset + len(motif)],
                            motif):
                        motif_mod_counts[motif_i].append(
                            read.Reference[offset + rel_pos])

        for (raw_motif, rel_pos), m_mod_counts in zip(
                args.motif, motif_mod_counts):
            rel_pos = int(rel_pos)
            # Distinct name: do not rebind the list being iterated over.
            mod_counts = np.bincount(m_mod_counts)
            total_mod_bases = np.sum(mod_counts)
            out_fp.write(
                ('{} Motif Modified Base Counts:\n{: >11}{: >11}' +
                 '{: >11}\n').format(raw_motif, 'base', 'count', 'percent') +
                '\n'.join(
                    '{: >11}{:11.0f}{:11.4f}'.format(
                        raw_motif[:rel_pos] + base + raw_motif[rel_pos + 1:],
                        count, pct)
                    for base, count, pct in zip(
                        alphabet_info.alphabet, mod_counts,
                        100 * mod_counts / total_mod_bases)
                    if count > 0) + '\n\n')

    sys.stderr.write('* Computing read metrics\n')
    out_fp.write('\n' + '*' * 10 + ' Read Metrics ' + '*' * 10 + '\n')
    read_lens = np.array([read.reflen for read in reads])
    out_fp.write(('Median read length: {}\n').format(np.median(read_lens)))
    sig_lens = np.array([read.siglen for read in reads])
    out_fp.write(('Median signal length: {}\n').format(
        np.median(sig_lens)))

    if args.num_chunks is None:
        return

    sys.stderr.write('* Computing chunk metrics\n')
    out_fp.write('\n\n' + '*' * 10 + ' Chunk Metrics ' + '*' * 10 + '\n')
    chunks, rej_res = [], []
    # NOTE(review): this loop only ends once enough chunks are accepted, so
    # it does not terminate if no sampled chunk is ever accepted.
    while len(chunks) < args.num_chunks:
        chunk = np.random.choice(reads, 1)[0].get_chunk_with_sample_length(
            args.chunk_len)
        if chunk.accepted:
            chunks.append(chunk)
        else:
            rej_res.append(chunk.reject_reason)
    if len(rej_res) > 0:
        out_fp.write(
            'Chunk rejection reasons:\n{: >16}{: >16}\n'.format(
                'Reject Reason', 'Num. Chunks') +
            '\n'.join('{: >16}{: >16}'.format(*x)
                      for x in Counter(rej_res).most_common()) + '\n\n')
    else:
        out_fp.write('All chunks passed filters\n\n')

    mean_dwells = np.array([chunk.mean_dwell for chunk in chunks])
    max_dwells = np.array([chunk.max_dwell for chunk in chunks])
    # report in MAD units for direct use in command line parameter usage
    median_meandwell, mad_meandwell = med_mad(mean_dwells)
    # Parenthesised so the deviation from the median is what gets scaled by
    # the MAD; unparenthesised, only the median was divided by the MAD.
    mean_std_units = (mean_dwells - median_meandwell) / mad_meandwell
    # Max dwell is reported as a multiple of the median mean dwell.
    max_std_units = max_dwells / median_meandwell
    out_fp.write(
        ('Chunk dwell distribution (standard units for direct use with ' +
         '--filter_max_dwell and --filter_mean_dwell):\n' +
         '{: >15}{: >15}{: >15}{: >15}{: >15}\n').format(
             'percentile', 'mean_dwell', 'mean_std_units',
             'max_dwell', 'max_std_units') +
        '\n'.join(
            '{:15.2f}{:15.2f}{:15.2f}{:15.2f}{:15.2f}'.format(*pctl_metrics)
            for pctl_metrics in zip(
                args.chunk_percentiles,
                np.percentile(mean_dwells, args.chunk_percentiles),
                np.percentile(mean_std_units, args.chunk_percentiles),
                np.percentile(max_dwells, args.chunk_percentiles),
                np.percentile(max_std_units, args.chunk_percentiles))) +
        '\n')


def main():
    """Parse the command line and write the metrics report.

    The report goes to stdout when `args.output` is None, otherwise to the
    file named by `args.output`. That file is closed on every exit path,
    including the early return when no chunk metrics are requested and any
    exception raised while computing metrics.
    """
    args = get_parser().parse_args()
    if args.output is None:
        # stdout is not ours to close.
        _write_metrics(args, sys.stdout)
        return
    with open(args.output, 'w') as out_fp:
        _write_metrics(args, out_fp)