def raw_chunkify(signal, mapping_table, chunk_len, kmer_len, normalisation,
                 downsample_factor, interpolation, mapping_attrs=None):
    """Generate labelled data chunks from raw signal and mapping table"""
    assert len(signal) >= chunk_len
    assert normalisation in AVAILABLE_NORMALISATIONS
    assert mapping_table_is_registered(signal, mapping_table)

    # Trim to a whole number of chunks
    ml = len(signal) // chunk_len
    ub = ml * chunk_len
    signal, mapping_table = trim_signal_and_mapping(signal, mapping_table, 0, ub)
    assert mapping_table_is_registered(signal, mapping_table)

    new_inMat = signal.reshape((ml, chunk_len, 1))

    if normalisation == "per-chunk":
        chunk_medians = np.median(new_inMat, axis=1, keepdims=True)
        chunk_mads = mad(new_inMat, axis=1, keepdims=True)
        new_inMat = (new_inMat - chunk_medians) / chunk_mads
    elif normalisation == "per-read":
        new_inMat = (new_inMat - np.median(new_inMat)) / mad(new_inMat)
    else:
        assert normalisation == "none"

    if interpolation:
        block_midpoints = np.arange(0, ub, downsample_factor)
        pos = interpolate_pos(mapping_table, mapping_attrs)(block_midpoints, kmer_len)
        sig_labels = interpolate_labels(mapping_table, mapping_attrs)(block_midpoints, kmer_len)
        # Zero out labels where the position did not advance (stays)
        sig_labels[np.ediff1d(pos, to_begin=1) == 0] = 0
        sig_labels = sig_labels.reshape((ml, -1))
    else:
        all_labels = labels_from_mapping_table(mapping_table['kmer'], kmer_len)
        labels = all_labels[mapping_table['move'] > 0]
        all_starts = mapping_table['start'][index_of_previous_non_zero(mapping_table['move'])]
        starts = all_starts[mapping_table['move'] > 0]
        # Build a per-sample index into the label array, then downsample
        idx = np.zeros(ub, dtype=np.int64)
        idx[starts] = np.arange(len(labels)) + 1
        idx = fill_zeros_with_prev(idx)
        idx = idx.reshape((ml, chunk_len))[:, ::downsample_factor]
        idx = np.apply_along_axis(replace_repeats_with_zero, 1, idx)
        sig_labels = np.concatenate([[0], labels])[idx].astype('i4')

    # Bad state isn't supported yet with raw models
    sig_bad = np.zeros((ml, chunk_len), dtype=bool)

    return new_inMat, sig_labels, sig_bad
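A minimal, self-contained sketch of the "per-chunk" normalisation branch above: each chunk is centred on its median and scaled by its MAD. The `_demo_mad` helper is hypothetical and only approximates sloika's `maths.mad` (assumed here to use the usual 1.4826 consistency factor).

import numpy as np

def _demo_mad(x, axis=None, keepdims=False):
    # Hypothetical stand-in for sloika's maths.mad; the 1.4826 factor makes
    # the MAD consistent with the standard deviation for normal data.
    med = np.median(x, axis=axis, keepdims=True)
    return 1.4826 * np.median(np.abs(x - med), axis=axis, keepdims=keepdims)

demo = np.random.default_rng(0).normal(loc=5.0, scale=2.0, size=4000)
chunks = demo.reshape((4, 1000, 1))
normed = (chunks - np.median(chunks, axis=1, keepdims=True)) \
    / _demo_mad(chunks, axis=1, keepdims=True)
# Each chunk now has median ~0 and MAD-based scale ~1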
def test_007_mad_keepdims(self):
    x = np.zeros((5, 6, 7))
    self.assertTrue(np.allclose(maths.mad(x, axis=0, keepdims=True),
                                np.zeros((1, 6, 7))))
    self.assertTrue(np.allclose(maths.mad(x, axis=1, keepdims=True),
                                np.zeros((5, 1, 7))))
    self.assertTrue(np.allclose(maths.mad(x, axis=2, keepdims=True),
                                np.zeros((5, 6, 1))))
def raw_worker(fast5_file_name, trim, open_pore_fraction, kmer_len, transducer,
               bad, min_prob, alphabet=DEFAULT_ALPHABET, skip=5.0, trans=None):
    """Worker function for basecall_network.py for basecalling from raw data

    This worker uses the global variable `calc_post`, which is set by
    init_worker. `calc_post` is an unpickled compiled sloika model that is
    used to calculate a posterior matrix over states.

    :param fast5_file_name: filename for single-read fast5 file with raw data
    :param trim: (int, int) events to remove from read beginning and end
    :param open_pore_fraction: maximum allowed fraction of signal length to
        trim due to classification as open pore signal
    :param kmer_len, min_prob, transducer, bad, trans, skip: see `decode_post`
    """
    from sloika import batch, config

    try:
        with fast5.Reader(fast5_file_name) as f5:
            signal = f5.get_read(raw=True)
            sn = f5.filename_short
    except Exception as e:
        sys.stderr.write("Error getting raw data for file {}\n{!r}\n".format(fast5_file_name, e))
        return None

    signal = batch.trim_open_pore(signal, open_pore_fraction)
    signal = util.trim_array(signal, *trim)
    if signal.size == 0:
        sys.stderr.write("Read too short in file {}\n".format(fast5_file_name))
        return None

    # Median/MAD normalise the whole read, then shape to (time, batch, feature)
    inMat = (signal - np.median(signal)) / mad(signal)
    inMat = inMat[:, None, None].astype(config.sloika_dtype)

    score, call = decode_post(calc_post(inMat), kmer_len, transducer, bad,
                              min_prob, skip, trans, nbase=len(alphabet))

    return sn, score, call, inMat.shape[0]
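A minimal sketch of the init_worker/global pattern the docstring above refers to (names here are illustrative, not sloika's actual code): a multiprocessing Pool initializer sets a module-level global once per worker process, so the expensive compiled model is not pickled and shipped with every task.

from multiprocessing import Pool

_model = None

def _demo_init(weights):
    # Runs once in each worker process; in sloika this is where the compiled
    # model behind `calc_post` would be unpickled.
    global _model
    _model = weights

def _demo_task(x):
    return _model * x

if __name__ == '__main__':
    with Pool(2, initializer=_demo_init, initargs=(3,)) as pool:
        print(pool.map(_demo_task, [1, 2, 3]))  # -> [3, 6, 9]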
def trim_open_pore(signal, max_op_fraction=0.3, var_method='mad', window_size=100):
    """Locate raw read in signal by thresholding local variance

    :param signal: raw data containing a read
    :param max_op_fraction: (float) Maximum expected fraction of signal that
        consists of open pore. Higher values will find smaller reads at the
        cost of slightly truncating longer reads.
    :param var_method: ('std' | 'mad') method used to compute the local
        variation. std: standard deviation, mad: Median Absolute Deviation
    :param window_size: size of patches used to estimate local variance
    """
    assert var_method in TRIM_OPEN_PORE_LOCAL_VAR_METHODS, \
        "var_method not understood: {}".format(var_method)

    ml = len(signal) // window_size
    ub = ml * window_size

    if var_method == 'std':
        local_var = signal[:ub].reshape((ml, window_size)).std(1)
    elif var_method == 'mad':
        sig_chunks = signal[:ub].reshape((ml, window_size))
        local_var = maths.mad(sig_chunks, axis=1)

    # Windows whose variation exceeds the max_op_fraction percentile are
    # assumed to contain read signal rather than open pore
    probably_read = local_var > np.percentile(local_var, 100 * max_op_fraction)
    ix = np.arange(local_var.shape[0])[probably_read]

    start = ix.min() * window_size
    end = (ix.max() + 1) * window_size
    return signal[start:end]
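A self-contained sketch of the idea behind trim_open_pore, on synthetic data only: the quiet open-pore flanks have low windowed variation, so percentile thresholding isolates the noisy read in the middle.

import numpy as np

rng = np.random.default_rng(1)
sig = np.concatenate([rng.normal(0, 0.05, 2000),   # quiet open-pore flank
                      rng.normal(0, 1.0, 6000),    # noisy read signal
                      rng.normal(0, 0.05, 2000)])  # quiet flank

window = 100
ml = len(sig) // window
local_var = sig[:ml * window].reshape((ml, window)).std(1)
keep = local_var > np.percentile(local_var, 100 * 0.3)
ix = np.arange(ml)[keep]
trimmed = sig[ix.min() * window:(ix.max() + 1) * window]  # ~the middle 6000 samples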
def raw_remap(ref, signal, min_prob, kmer_len, prior, slip):
    """Map raw signal to reference sequence using transducer model"""
    from sloika import config  # local import to avoid CUDA init in main thread

    # Median/MAD normalise and shape to (time, batch, feature)
    inMat = (signal - np.median(signal)) / mad(signal)
    inMat = inMat[:, None, None].astype(config.sloika_dtype)
    post = sloika.decode.prepare_post(batch.calc_post(inMat),
                                      min_prob=min_prob, drop_bad=False)

    kmers = np.array(bio.seq_to_kmers(ref, kmer_len))
    seq = [batch.kmer_to_state[k] + 1 for k in kmers]
    prior0 = None if prior[0] is None else sloika.util.geometric_prior(len(seq), prior[0])
    prior1 = None if prior[1] is None else sloika.util.geometric_prior(len(seq), prior[1], rev=True)

    score, path = sloika.transducer.map_to_sequence(post, seq, slip=slip,
                                                    prior_initial=prior0,
                                                    prior_final=prior1, log=False)

    mapping_dtype = [
        ('start', '<i8'),
        ('length', '<i8'),
        ('seq_pos', '<i8'),
        ('move', '<i8'),
        ('kmer', 'S{}'.format(kmer_len)),
        ('good_emission', '?'),
    ]
    mapping_table = np.zeros(post.shape[0], dtype=mapping_dtype)

    # Each posterior block covers `stride` raw samples; centre the windows
    stride = int(np.ceil(signal.shape[0] / float(post.shape[0])))
    mapping_table['start'] = np.arange(0, signal.shape[0], stride, dtype=np.int64) - stride // 2
    mapping_table['length'] = stride
    mapping_table['seq_pos'] = path
    mapping_table['move'] = np.ediff1d(path, to_begin=1)
    mapping_table['kmer'] = kmers[path]
    # We set 'good_emission' for compatibility only
    mapping_table['good_emission'] = True

    _, mapping_table = trim_signal_and_mapping(signal, mapping_table, 0, len(signal))

    return (score, mapping_table, path, seq)
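A quick illustration of how the 'move' column is derived from the Viterbi path above: np.ediff1d with to_begin=1 marks the first block as a move and records each advance along the reference.

import numpy as np

path = np.array([0, 0, 1, 1, 2, 4])   # sequence position per posterior block
moves = np.ediff1d(path, to_begin=1)  # -> [1, 0, 1, 0, 1, 2]
# 0 = stay in the same kmer, 1 = step to the next, 2 = skip over one position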
def get_read_stats(self):
    """Combines stats based on events with the output of .summary;
    assumes a one-read file.
    """
    data = deepcopy(self.summary())
    read = self.get_read()
    n_events = len(read)
    q = np.percentile(read['mean'], [10, 50, 90])
    data['range_current'] = q[2] - q[0]
    data['median_current'] = q[1]
    data['num_events'] = n_events
    data['median_sd'] = np.median(read['stdv'])
    data['median_dwell'] = np.median(read['length'])
    data['sd_current'] = np.std(read['mean'])
    data['mad_current'] = mad(read['mean'])
    # Events per second over the strand duration
    data['eps'] = data['num_events'] / data['strand_duration']
    return data
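A small, self-contained sketch of the robust current statistics computed above, on synthetic event means (standing in for the 'mean' column of a real event table):

import numpy as np

means = np.random.default_rng(2).normal(loc=100.0, scale=10.0, size=500)
q10, q50, q90 = np.percentile(means, [10, 50, 90])
range_current = q90 - q10    # robust spread: inter-decile range, not max - min
median_current = q50         # robust centre, insensitive to outlier events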