Example #1
def raw_chunkify(signal,
                 mapping_table,
                 chunk_len,
                 kmer_len,
                 normalisation,
                 downsample_factor,
                 interpolation,
                 mapping_attrs=None):
    """ Generate labelled data chunks from raw signal and mapping table
    """
    assert len(signal) >= chunk_len
    assert normalisation in AVAILABLE_NORMALISATIONS
    assert mapping_table_is_registered(signal, mapping_table)

    ml = len(signal) // chunk_len  # number of whole chunks that fit
    ub = ml * chunk_len  # usable signal length (remainder is trimmed)
    signal, mapping_table = trim_signal_and_mapping(signal, mapping_table, 0,
                                                    ub)
    assert mapping_table_is_registered(signal, mapping_table)
    new_inMat = signal.reshape((ml, chunk_len, 1))

    if normalisation == "per-chunk":
        chunk_medians = np.median(new_inMat, axis=1, keepdims=True)
        chunk_mads = mad(new_inMat, axis=1, keepdims=True)
        new_inMat = (new_inMat - chunk_medians) / chunk_mads
    elif normalisation == "per-read":
        new_inMat = (new_inMat - np.median(new_inMat)) / mad(new_inMat)
    else:
        assert normalisation == "none"

    if interpolation:
        block_midpoints = np.arange(0, ub, downsample_factor)
        pos = interpolate_pos(mapping_table, mapping_attrs)(block_midpoints,
                                                            kmer_len)
        sig_labels = interpolate_labels(mapping_table,
                                        mapping_attrs)(block_midpoints,
                                                       kmer_len)
        sig_labels[np.ediff1d(pos, to_begin=1) == 0] = 0
        sig_labels = sig_labels.reshape((ml, -1))
    else:
        all_labels = labels_from_mapping_table(mapping_table['kmer'], kmer_len)
        labels = all_labels[mapping_table['move'] > 0]
        all_starts = mapping_table['start'][index_of_previous_non_zero(
            mapping_table['move'])]
        starts = all_starts[mapping_table['move'] > 0]

        idx = np.zeros(ub, dtype=int)
        idx[starts] = np.arange(len(labels)) + 1
        idx = fill_zeros_with_prev(idx)
        idx = idx.reshape((ml, chunk_len))[:, ::downsample_factor]
        idx = np.apply_along_axis(replace_repeats_with_zero, 1, idx)

        sig_labels = np.concatenate([[0], labels])[idx].astype('i4')

    # Bad state isn't supported yet with raw models
    sig_bad = np.zeros((ml, chunk_len), dtype=bool)

    return new_inMat, sig_labels, sig_bad
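
The per-chunk and per-read normalisations above centre the signal on its median and scale by `mad`. A minimal sketch of such a helper, assuming the conventional 1.4826 consistency factor (sloika's own `maths.mad` may differ in detail):

import numpy as np

def mad(x, factor=1.4826, axis=None, keepdims=False):
    """Median absolute deviation, scaled so it estimates the standard
    deviation of Gaussian data (hypothetical stand-in for maths.mad)."""
    med = np.median(x, axis=axis, keepdims=True)
    return factor * np.median(np.abs(x - med), axis=axis, keepdims=keepdims)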
Example #2
    def test_007_mad_keepdims(self):
        x = np.zeros((5, 6, 7))
        self.assertTrue(
            np.allclose(maths.mad(x, axis=0, keepdims=True),
                        np.zeros((1, 6, 7))))
        self.assertTrue(
            np.allclose(maths.mad(x, axis=1, keepdims=True),
                        np.zeros((5, 1, 7))))
        self.assertTrue(
            np.allclose(maths.mad(x, axis=2, keepdims=True),
                        np.zeros((5, 6, 1))))
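
On non-constant data the keepdims behaviour is the same; a quick check using the mad sketch above:

x = np.random.randn(5, 6, 7)
print(mad(x, axis=1, keepdims=True).shape)  # -> (5, 1, 7): reduced axis kept as size 1
print(mad(x, axis=1).shape)                 # -> (5, 7): reduced axis dropped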
Example #3
def raw_worker(fast5_file_name, trim, open_pore_fraction, kmer_len, transducer, bad, min_prob,
               alphabet=DEFAULT_ALPHABET, skip=5.0, trans=None):
    """ Worker function for basecall_network.py for basecalling from raw data

    This worker uses the global variable `calc_post`, which is set by
    init_worker. `calc_post` is an unpickled compiled sloika model that
    is used to calculate a posterior matrix over states.

    :param open_pore_fraction: maximum allowed fraction of signal length to
        trim due to classification as open pore signal
    :param trim: (int, int) events to remove from read beginning and end
    :param kmer_len, min_prob, transducer, bad, trans, skip: see `decode_post`
    :param fast5_file_name: filename for single-read fast5 file with raw data
    """
    from sloika import batch, config
    try:
        with fast5.Reader(fast5_file_name) as f5:
            signal = f5.get_read(raw=True)
            sn = f5.filename_short
    except Exception as e:
        sys.stderr.write("Error getting raw data for file {}\n{!r}\n".format(fast5_file_name, e))
        return None

    signal = batch.trim_open_pore(signal, open_pore_fraction)
    signal = util.trim_array(signal, *trim)
    if signal.size == 0:
        sys.stderr.write("Read too short in file {}\n".format(fast5_file_name))
        return None

    inMat = (signal - np.median(signal)) / mad(signal)
    inMat = inMat[:, None, None].astype(config.sloika_dtype)
    score, call = decode_post(calc_post(inMat), kmer_len, transducer, bad, min_prob, skip, trans, nbase=len(alphabet))

    return sn, score, call, inMat.shape[0]
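
The two lines that build inMat put the robustly normalised signal into sloika's (time, batch, feature) layout; a small illustration, reusing the mad sketch above:

signal = np.random.randn(1200).astype(np.float32)
inMat = (signal - np.median(signal)) / mad(signal)  # robust centre and scale
inMat = inMat[:, None, None]                        # shape (1200, 1, 1)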
Example #4
def trim_open_pore(signal,
                   max_op_fraction=0.3,
                   var_method='mad',
                   window_size=100):
    """Locate raw read in signal by thresholding local variance

    :param signal: raw data containing a read
    :param max_op_fraction: (float) Maximum expected fraction of signal that
        consists of open pore. Higher values will find smaller reads at the
        cost of slightly truncating longer reads.
    :param var_method: ('std' | 'mad') method used to compute the local
        variation. std: standard deviation, mad: Median Absolute Deviation
    :param window_size: size of patches used to estimate local variance
    """
    assert var_method in TRIM_OPEN_PORE_LOCAL_VAR_METHODS, "var_method not understood: {}".format(
        var_method)

    ml = len(signal) // window_size  # number of whole windows
    ub = ml * window_size  # usable signal length (remainder is ignored)

    if var_method == 'std':
        local_var = signal[:ub].reshape((ml, window_size)).std(1)
    elif var_method == 'mad':
        sig_chunks = signal[:ub].reshape((ml, window_size))
        local_var = maths.mad(sig_chunks, axis=1)

    probably_read = (local_var > np.percentile(local_var,
                                               100 * max_op_fraction))
    ix = np.arange(local_var.shape[0])[probably_read]
    start = ix.min() * window_size
    end = (ix.max() + 1) * window_size
    return signal[start:end]
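
A toy run of the thresholding idea: quiet open-pore flanks around a noisier read, with the local variation computed per window as in the function (self-contained, using the mad sketch above rather than maths.mad):

rng = np.random.default_rng(0)
signal = np.concatenate([rng.normal(0, 0.05, 2000),   # open pore: low variance
                         rng.normal(0, 1.0, 6000),    # read: high variance
                         rng.normal(0, 0.05, 2000)])  # open pore again

window_size = 100
ml = len(signal) // window_size
local_var = mad(signal[:ml * window_size].reshape((ml, window_size)), axis=1)
keep = np.arange(ml)[local_var > np.percentile(local_var, 100 * 0.45)]
print(keep.min() * window_size, (keep.max() + 1) * window_size)  # roughly 2000 and 8000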
Example #5
def raw_remap(ref, signal, min_prob, kmer_len, prior, slip):
    """ Map raw signal to reference sequence using transducer model"""
    from sloika import config  # local import to avoid CUDA init in main thread

    inMat = (signal - np.median(signal)) / mad(signal)
    inMat = inMat[:, None, None].astype(config.sloika_dtype)
    post = sloika.decode.prepare_post(batch.calc_post(inMat),
                                      min_prob=min_prob,
                                      drop_bad=False)

    kmers = np.array(bio.seq_to_kmers(ref, kmer_len))
    seq = [batch.kmer_to_state[k] + 1 for k in kmers]
    prior0 = None if prior[0] is None else sloika.util.geometric_prior(
        len(seq), prior[0])
    prior1 = None if prior[1] is None else sloika.util.geometric_prior(
        len(seq), prior[1], rev=True)

    score, path = sloika.transducer.map_to_sequence(post,
                                                    seq,
                                                    slip=slip,
                                                    prior_initial=prior0,
                                                    prior_final=prior1,
                                                    log=False)

    mapping_dtype = [
        ('start', '<i8'),
        ('length', '<i8'),
        ('seq_pos', '<i8'),
        ('move', '<i8'),
        ('kmer', 'S{}'.format(kmer_len)),
        ('good_emission', '?'),
    ]
    mapping_table = np.zeros(post.shape[0], dtype=mapping_dtype)
    stride = int(np.ceil(signal.shape[0] / float(post.shape[0])))
    mapping_table['start'] = np.arange(
        0, signal.shape[0], stride, dtype=int) - stride // 2
    mapping_table['length'] = stride
    mapping_table['seq_pos'] = path
    mapping_table['move'] = np.ediff1d(path, to_begin=1)
    mapping_table['kmer'] = kmers[path]
    # We set 'good_emission' for compatibility only
    mapping_table['good_emission'] = True

    _, mapping_table = trim_signal_and_mapping(signal, mapping_table, 0,
                                               len(signal))

    return (score, mapping_table, path, seq)
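
The 'move' column is just the first difference of the Viterbi path, with a leading 1 so the first block counts as a move; a quick illustration:

path = np.array([0, 0, 1, 1, 1, 2, 4])
print(np.ediff1d(path, to_begin=1))  # -> [1 0 1 0 0 1 2]; 0 means "stay"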
Example #6
    def get_read_stats(self):
        """Combines stats based on events with output of .summary, assumes a
        one read file.

        """
        data = deepcopy(self.summary())
        read = self.get_read()
        n_events = len(read)
        q = np.percentile(read['mean'], [10, 50, 90])
        data['range_current'] = q[2] - q[0]
        data['median_current'] = q[1]
        data['num_events'] = n_events
        data['median_sd'] = np.median(read['stdv'])
        data['median_dwell'] = np.median(read['length'])
        data['sd_current'] = np.std(read['mean'])
        data['mad_current'] = mad(read['mean'])
        data['eps'] = data['num_events'] / data['strand_duration']  # events per second
        return data
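
Because it uses the median and mad, the current summary is robust to outlier events; a synthetic comparison (field names assumed from the code above, mad sketch from earlier):

events = np.zeros(1000, dtype=[('mean', '<f8'), ('stdv', '<f8'), ('length', '<f8')])
rng = np.random.default_rng(1)
events['mean'] = rng.normal(100.0, 10.0, 1000)
events['mean'][:5] = 1e6                # a handful of outlier events

print(np.std(events['mean']))           # blown up by the outliers
print(mad(events['mean']))              # stays near 10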