Example No. 1
def raw_worker(fast5_file_name, trim, open_pore_fraction, kmer_len, transducer, bad, min_prob,
               alphabet=DEFAULT_ALPHABET, skip=5.0, trans=None):
    """ Worker function for basecall_network.py for basecalling from raw data

    This worker uses the global variable `calc_post`, which is set by
    `init_worker`. `calc_post` is an unpickled, compiled sloika model that
    is used to calculate a posterior matrix over states.

    :param open_pore_fraction: maximum allowed fraction of signal length to
        trim due to classification as open pore signal
    :param trim: (int, int) events to remove from read beginning and end
    :param kmer_len, min_prob, transducer, bad, trans, skip: see `decode_post`
    :param fast5_file_name: filename for single-read fast5 file with raw data
    """
    from sloika import batch, config
    try:
        with fast5.Reader(fast5_file_name) as f5:
            signal = f5.get_read(raw=True)
            sn = f5.filename_short
    except Exception as e:
        sys.stderr.write("Error getting raw data for file {}\n{!r}\n".format(fast5_file_name, e))
        return None

    signal = batch.trim_open_pore(signal, open_pore_fraction)
    signal = util.trim_array(signal, *trim)
    if signal.size == 0:
        sys.stderr.write("Read too short in file {}\n".format(fast5_file_name))
        return None

    inMat = (signal - np.median(signal)) / mad(signal)
    inMat = inMat[:, None, None].astype(config.sloika_dtype)
    score, call = decode_post(calc_post(inMat), kmer_len, transducer, bad, min_prob, skip, trans, nbase=len(alphabet))

    return sn, score, call, inMat.shape[0]
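The docstring above assumes a pool initialiser that unpickles the compiled model into the module-level `calc_post` global before any work item runs. A minimal sketch of that pattern; `load_model` and `fast5_files` are hypothetical names, and the parameter values are illustrative:

import multiprocessing
from functools import partial

calc_post = None  # set once per worker process by init_worker


def init_worker(model_file):
    # Unpickle the compiled sloika model into a process-global so that
    # raw_worker can call it without re-loading it for every read.
    global calc_post
    calc_post = load_model(model_file)  # hypothetical loader


worker = partial(raw_worker, trim=(50, 10), open_pore_fraction=0.3,
                 kmer_len=5, transducer=True, bad=True, min_prob=1e-5)
with multiprocessing.Pool(processes=4, initializer=init_worker,
                          initargs=('model.pkl',)) as pool:
    for result in pool.imap(worker, fast5_files):
        if result is None:
            continue  # the worker already reported the error on stderr
        sn, score, call, n_samples = result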
Example No. 2
    def test_should_fail_when_events_section_is_missing(self, relative_file_path):
        '''Segmentation in these files is located at

        /Analyses/Segmentation_000/Summary/segmentation

        not at

        /Analyses/Segment_Linear_000/Summary/split_hairpin

        where we are looking. We initially attempted to look for it in the new
        place, but then discovered that not only the location but also the
        structure has changed. In particular, there no longer appears to be an
        event index for the segmentation. An event index is required for the
        get_section_events function to operate.'''

        filename = os.path.join(self.dataDir, relative_file_path)

        with fast5.Reader(filename) as f5:
            with self.assertRaises(ValueError) as context:
                ev = f5.get_section_events('template')

            # on precise and trusty the second part of this error message (the actual exception) is slightly
            # different from xenial:
            #
            #    KeyError("unable to open object (Symbol table: Can\\\'t open object)",)\',)'
            #    KeyError("Unable to open object (Object \\\'split_hairpin\\\' doesn\\\'t exist)",)\',)'
            #
            # so we compare only the first parts
            msg = repr(context.exception).split('\\n')[0]
            self.assertEqual(msg, 'ValueError(\'Could not retrieve template-complement split '
                                  'point data from attributes of /Analyses/Segmentation_000'
                                  '/Summary/split_hairpin')
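The two attribute locations discussed in the docstring can be probed directly with h5py; a minimal sketch (the file name is illustrative):

import h5py

# Newer files store segmentation summaries under Segmentation_000,
# older ones under Segment_Linear_000; check both paths.
with h5py.File('read.fast5', 'r') as h5:
    for path in ('/Analyses/Segmentation_000/Summary/segmentation',
                 '/Analyses/Segment_Linear_000/Summary/split_hairpin'):
        if path in h5:
            print(path, dict(h5[path].attrs))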
Example No. 3
def events_worker(fast5_file_name, section, segmentation, trim, kmer_len, transducer,
                  bad, min_prob, alphabet=DEFAULT_ALPHABET, skip=5.0, trans=None):
    """ Worker function for basecall_network.py for basecalling from events

    This worker uses the global variable `calc_post`, which is set by
    `init_worker`. `calc_post` is an unpickled, compiled sloika model that
    is used to calculate a posterior matrix over states.

    :param section: part of read to basecall, 'template' or 'complement'
    :param segmentation: location of segmentation analysis for extracting target read section
    :param trim: (int, int) events to remove from read beginning and end
    :param kmer_len, min_prob, transducer, bad, trans, skip: see `decode_post`
    :param fast5_file_name: filename for single-read fast5 file with event detection and segmentation
    """
    from sloika import features
    try:
        with fast5.Reader(fast5_file_name) as f5:
            ev = f5.get_section_events(section, analysis=segmentation)
            sn = f5.filename_short
    except Exception as e:
        sys.stderr.write("Error getting events for section {!r} in file {}\n{!r}\n".format(section, fast5_file_name, e))
        return None

    ev = util.trim_array(ev, *trim)
    if ev.size == 0:
        sys.stderr.write("Read too short in file {}\n".format(fast5_file_name))
        return None

    inMat = features.from_events(ev, tag='')[:, None, :]
    score, call = decode_post(calc_post(inMat), kmer_len, transducer, bad, min_prob, skip, trans, nbase=len(alphabet))

    return sn, score, call, inMat.shape[0]
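Both workers reshape their input into a three-dimensional matrix before calling `calc_post`: `[:, None, :]` here (and `[:, None, None]` in the raw case) inserts a singleton batch axis, giving what is presumably a (time, batch, features) layout. A small numpy illustration:

import numpy as np

ev_features = np.zeros((4491, 4))  # events x features; shape illustrative
inMat = ev_features[:, None, :]    # -> (4491, 1, 4)
assert inMat.shape == (4491, 1, 4)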
Example No. 4
def chunk_remap_worker(fn, trim, min_prob, kmer_len, prior, slip, chunk_len,
                       use_scaled, normalisation, min_length, section,
                       segmentation, references):
    try:
        with fast5.Reader(fn) as f5:
            sn = f5.filename_short
            try:
                ev = f5.get_section_events(section, analysis=segmentation)
            except ValueError:
                ev = f5.get_basecall_data(section)
    except Exception as e:
        sys.stderr.write('Failure reading events from {}.\n{}\n'.format(
            fn, repr(e)))
        return None

    try:
        read_ref = references[sn]
    except Exception as e:
        sys.stderr.write('No reference found for {}.\n{}\n'.format(
            fn, repr(e)))
        return None

    ev = trim_ends_and_filter(ev, trim, min_length, chunk_len)
    if ev is None:
        sys.stderr.write('{} is too short.\n'.format(fn))
        return None

    (score, ev, path, seq) = remap(read_ref, ev, min_prob, kmer_len, prior,
                                   slip)
    (chunks, labels, bad_ev) = chunkify(ev, chunk_len, kmer_len, use_scaled,
                                        normalisation)

    return sn + '.fast5', score, len(ev), path, seq, chunks, labels, bad_ev
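Like the other workers, chunk_remap_worker returns None on any failure, so a driver has to filter results. A sketch, assuming a hypothetical `fast5_files` list and a `references` mapping like the one sketched after Example No. 6 below; all parameter values are illustrative:

from functools import partial

worker = partial(chunk_remap_worker, trim=(200, 50), min_prob=1e-5,
                 kmer_len=5, prior=None, slip=5.0, chunk_len=500,
                 use_scaled=False, normalisation='per-read',
                 min_length=1000, section='template',
                 segmentation='Segmentation_000', references=references)

results = [res for res in map(worker, fast5_files) if res is not None]
for short_name, score, n_events, path, seq, chunks, labels, bad_ev in results:
    print(short_name, score, n_events)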
Example No. 5
def raw_chunk_worker(fn,
                     chunk_len,
                     kmer_len,
                     min_length,
                     trim,
                     normalisation,
                     downsample_factor,
                     interpolation=False):
    """ Worker for creating labelled features from raw data

    :param fn: A filename to read from.
    :param chunk_len: Length of each chunk
    :param kmer_len: Kmer length for training
    :param min_length: Minimum number of samples before read can be considered.
    :param trim: Tuple (beginning, end) of number of samples to trim from read.
    :param normalisation: Normalisation method [per-chunk | per-read | none]
    :param downsample_factor: factor by which to downsample labels
    :param interpolation: interpolate sequence positions between those in
        mapping table
    """
    try:
        with fast5.Reader(fn) as f5:
            mapping_table, att = f5.get_any_mapping_data('template')
            sig = f5.get_read(raw=True)
            sample_rate = f5.sample_rate
            start_sample = f5.get_read(raw=True,
                                       group=True).attrs['start_time']
    except Exception as e:
        sys.stderr.write('Failed to get mapping data from {}.\n{}\n'.format(
            fn, repr(e)))
        return None

    mapping_table = convert_mapping_times_to_samples(mapping_table,
                                                     start_sample, sample_rate)
    map_start = mapping_table['start'][0] + trim[0]
    map_end = mapping_table['start'][-1] + mapping_table['length'][-1] - trim[1]
    mapped_signal, mapping_table = trim_signal_and_mapping(
        sig, mapping_table, map_start, map_end)

    try:
        assert mapping_table_is_registered(mapped_signal, mapping_table)
    except Exception as e:
        sys.stderr.write(
            'Failed to properly register raw signal and mapping table in {}.\n{}\n'
            .format(fn, repr(e)))
        return None

    if len(mapped_signal) < max(chunk_len, min_length):
        sys.stderr.write('{} is too short.\n'.format(fn))
        return None

    new_inMat, sig_labels, sig_bad = raw_chunkify(mapped_signal, mapping_table,
                                                  chunk_len, kmer_len,
                                                  normalisation,
                                                  downsample_factor,
                                                  interpolation, att)

    return (np.ascontiguousarray(new_inMat), np.ascontiguousarray(sig_labels),
            np.ascontiguousarray(sig_bad))
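The per-read variant of `normalisation` presumably matches the median/MAD scaling applied in raw_worker above. A minimal numpy sketch (this `mad` definition is an assumption; the excerpt's helper may differ, e.g. by a consistency constant):

import numpy as np


def mad(x):
    # Median absolute deviation about the median.
    return np.median(np.abs(x - np.median(x)))


signal = np.random.randn(10000).astype(np.float32)  # illustrative signal
normalised = (signal - np.median(signal)) / mad(signal)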
Example No. 6
def reference_extraction_worker(file_name, section):
    with fast5.Reader(file_name) as file_handle:
        try:
            fasta = file_handle.get_reference_fasta(
                section=section).decode('utf-8')
        except Exception as e:
            sys.stderr.write('No reference found for {}.\n{}\n'.format(
                file_name, repr(e)))
            return None

        iowrapper = StringIO(fasta)
        read_ref = str(next(SeqIO.parse(iowrapper, 'fasta')).seq)
        return (file_name, read_ref)
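The (file_name, read_ref) tuples returned here are presumably collected into the `references` mapping that the remap workers consume. Those workers look up entries by `filename_short`, so a collection step might look like this (the key derivation is an assumption):

import os

references = {}
for fn in fast5_files:  # fast5_files: illustrative list of paths
    result = reference_extraction_worker(fn, 'template')
    if result is None:
        continue  # no reference; already reported on stderr
    file_name, read_ref = result
    # Assume filename_short is the basename without the .fast5 extension.
    short_name = os.path.splitext(os.path.basename(file_name))[0]
    references[short_name] = read_ref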
Example No. 7
def raw_chunk_remap_worker(fn, trim, min_prob, kmer_len, min_length, prior,
                           slip, chunk_len, normalisation, downsample_factor,
                           interpolation, open_pore_fraction, references):
    """ Worker function for `chunkify raw_remap` remapping reads using raw signal"""
    try:
        with fast5.Reader(fn) as f5:
            signal = f5.get_read(raw=True)
            sn = f5.filename_short
    except Exception as e:
        sys.stderr.write('Failure reading events from {}.\n{}\n'.format(
            fn, repr(e)))
        return None

    try:
        read_ref = references[sn]
    except Exception as e:
        sys.stderr.write('No reference found for {}.\n{}\n'.format(
            fn, repr(e)))
        return None

    signal = batch.trim_open_pore(signal, open_pore_fraction)
    signal = util.trim_array(signal, *trim)

    if len(signal) < max(chunk_len, min_length):
        sys.stderr.write('{} is too short.\n'.format(fn))
        return None

    try:
        (score, mapping_table, path, seq) = raw_remap(read_ref, signal,
                                                      min_prob, kmer_len,
                                                      prior, slip)
    except Exception as e:
        sys.stderr.write("Failure remapping read {}.\n{}\n".format(
            sn, repr(e)))
        return None
    # mapping_attrs required if using interpolation
    mapping_attrs = {
        'reference': read_ref,
        'direction': '+',
        'ref_start': 0,
    }
    (chunks, labels, bad_ev) = raw_chunkify(signal, mapping_table, chunk_len,
                                            kmer_len, normalisation,
                                            downsample_factor, interpolation,
                                            mapping_attrs)

    return (sn + '.fast5', score, len(mapping_table), path, seq, chunks,
            labels, bad_ev)
Example No. 8
def chunk_worker(fn, section, chunk_len, kmer_len, min_length, trim,
                 use_scaled, normalisation):
    """ Chunkifies data for training

    :param fn: A filename to read from
    :param section: Section of read to process (template / complement)
    :param chunk_len: Length of each chunk
    :param kmer_len: Kmer length for training
    :param min_length: Minimum number of events before read can be considered
    :param trim: Tuple (beginning, end) of number of events to trim from read
    :param use_scaled: Use prescaled event statistics
    :param normalisation: Type of normalisation to perform

    :returns: A tuple containing a 3D :class:`ndarray` of size
    (X, chunk_len, nfeatures) containing the features for the batch,
    a 2D :class:`ndarray` of size (X, chunk_len) containing the
    associated labels, and a 2D :class:`ndarray` of size (X, chunk_len)
    indicating bad events.  1 <= X <= batch_size.
    """
    # Import within worker to avoid initialising GPU in main thread
    import sloika.features

    try:
        with fast5.Reader(fn) as f5:
            ev, _ = f5.get_any_mapping_data(section)
    except Exception as e:
        sys.stderr.write('Failed to get mapping data from {}.\n{}\n'.format(
            fn, repr(e)))
        return None

    ev = trim_ends_and_filter(ev, trim, min_length, chunk_len)
    if ev is None:
        sys.stderr.write('{} is too short.\n'.format(fn))
        return None

    return chunkify(ev, chunk_len, kmer_len, use_scaled, normalisation)
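A sketch of how the returned arrays relate, under the shapes described in the docstring (all parameter values are illustrative):

result = chunk_worker('read03.fast5', 'template', chunk_len=500,
                      kmer_len=5, min_length=1000, trim=(50, 10),
                      use_scaled=False, normalisation='per-read')
if result is not None:
    chunks, labels, bad_ev = result
    # chunks: (X, chunk_len, nfeatures); labels, bad_ev: (X, chunk_len)
    assert chunks.shape[:2] == labels.shape == bad_ev.shape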
Example No. 9
    def test(self, relative_file_path, analysis, number_of_events):
        filename = os.path.join(self.dataDir, relative_file_path)

        with fast5.Reader(filename) as f5:
            ev = f5.get_section_events('template', analysis=analysis)
            self.assertEqual(len(ev), number_of_events)
Example No. 10
    def test_filename_short(self):
        filename = os.path.join(self.dataDir, 'reads', 'read03.fast5')

        with fast5.Reader(filename) as f5:
            sn = f5.filename_short
            self.assertEqual(sn, 'read03')
Example No. 11
    def test_unknown(self):
        filename = os.path.join(self.dataDir, 'reads', 'read03.fast5')

        with fast5.Reader(filename) as f5:
            ev, _ = f5.get_any_mapping_data('template')
            self.assertEqual(len(ev), 4491)
Example No. 12
    def test(self, relative_file_path, number_of_events, raw):
        filename = os.path.join(self.dataDir, relative_file_path)

        with fast5.Reader(filename) as f5:
            ev = f5.get_read(raw=raw)
            self.assertEqual(len(ev), number_of_events)
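The test methods in these last examples take data-driven arguments, so the real suite presumably applies a parameterisation decorator. A self-contained sketch assuming the `parameterized` package; the import path of `fast5` and the expected event count are assumptions:

import os
import unittest

from parameterized import parameterized

from untangled import fast5  # import path is an assumption


class Fast5GetReadTest(unittest.TestCase):
    dataDir = 'data'

    @parameterized.expand([
        ('reads/read03.fast5', 4491, False),  # count illustrative
    ])
    def test(self, relative_file_path, number_of_events, raw):
        filename = os.path.join(self.dataDir, relative_file_path)
        with fast5.Reader(filename) as f5:
            ev = f5.get_read(raw=raw)
            self.assertEqual(len(ev), number_of_events)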