Example #1
    def test_061_write_read_float_data(self):
        tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))

        with Fast5.New(tmp_file,
                       'a',
                       channel_id=self.tmp_channel_id,
                       tracking_id=self.tmp_tracking_id) as h:
            h.set_read(self.tmp_events_float, self.tmp_read_id)

        # Metadata duration and start_time should be integers, not floats
        with Fast5(tmp_file, 'r') as h:
            for key in ['duration', 'start_time']:
                self.assertIsInstance(h.attributes[key], int)

        with Fast5(tmp_file) as h:
            events = h.get_read()
            self.assertEqual(
                events['start'].dtype.descr[0][1], '<f8',
                'Writing float data did not give float data on read.')
            actual = events['start'][0]
            expected = self.tmp_events_float['start'][0]
            self.assertEqual(
                actual, expected,
                'Write float, data on read not scaled correctly, got {} not {}'
                .format(actual, expected))

        os.unlink(tmp_file)
Example #2
    def test_060_construct_new_file_checks(self):
        tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))

        # Each of these invalid constructions is expected to fail: opening a
        # new file read-only, or omitting channel_id / tracking_id.
        with self.assertRaises(IOError):
            Fast5.New(tmp_file, 'r')
        with self.assertRaises(IOError):
            Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id)
        with self.assertRaises(IOError):
            Fast5.New(tmp_file, 'a', tracking_id=self.tmp_tracking_id)

        # This should be fine
        with Fast5.New(tmp_file,
                       'a',
                       channel_id=self.tmp_channel_id,
                       tracking_id=self.tmp_tracking_id) as h:
            h.set_read(self.tmp_events_float, self.tmp_read_id)
Example #3
    def test_filename_short(self):
        basename = 'read6'
        filename = os.path.join(self.dataDir, 'reads', basename + '.fast5')

        with Fast5(filename) as f5:
            self.assertEqual(f5.filename_short, basename)
Example #4
    def test_unknown(self):
        basename = 'read6'
        filename = os.path.join(self.dataDir, 'reads', basename + '.fast5')

        with Fast5(filename) as f5:
            ev, _ = f5.get_any_mapping_data('template')
            self.assertEqual(len(ev), 10750)
Example #5
    def setUp(self):
        self.h = Fast5(os.path.join(
            os.path.dirname(__file__), 'data', self.test_file
        ))

        # Use to create new temp files
        self.tmp_events_float = np.array(
            [(0.0, 1.0, 10.0, 2.0)],
            dtype=[(x, 'float') for x in ['start', 'length', 'mean', 'stdv']]
        )
        self.tmp_events_int = np.array(
            [(0, 5000, 10.0, 2.0)],
            dtype=[
                ('start', 'uint32'), ('length', 'uint32'),
                ('mean', 'float'), ('stdv', 'float')
            ]
        )
        self.tmp_channel_id = {
            'channel_number': 1,
            'range': 1.0,
            'digitisation': 1.0,
            'offset': 0.0,
            'sample_rate': 5000.0,
            'sampling_rate': 5000.0
        }
        self.tmp_read_id = {
            'start_time': 0.0,
            'duration': 1.0,
            'read_number': 1,
            'start_mux': 1,
            'read_id': str(uuid4()),
            'scaling_used': 1
        }
Example #6
def chunk_remap_worker(fn, trim, min_prob, kmer_len, prior, slip, chunk_len,
                       use_scaled, normalisation, min_length, section,
                       segmentation, references):
    try:
        with Fast5(fn) as f5:
            sn = f5.filename_short
            try:
                ev = f5.get_section_events(section, analysis=segmentation)
            except ValueError:
                ev = f5.get_basecall_data(section)
    except Exception as e:
        sys.stderr.write('Failure reading events from {}.\n{}\n'.format(
            fn, repr(e)))
        return None

    try:
        read_ref = references[sn]
    except Exception as e:
        sys.stderr.write('No reference found for {}.\n{}\n'.format(
            fn, repr(e)))
        return None

    ev = trim_ends_and_filter(ev, trim, min_length, chunk_len)
    if ev is None:
        sys.stderr.write('{} is too short.\n'.format(fn))
        return None

    (score, ev, path, seq) = remap(read_ref, ev, min_prob, kmer_len, prior,
                                   slip)
    (chunks, labels, bad_ev) = chunkify(ev, chunk_len, kmer_len, use_scaled,
                                        normalisation)

    return sn + '.fast5', score, len(ev), path, seq, chunks, labels, bad_ev
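
For context, a hedged sketch of how a worker like this might be driven: partially apply the parameters and map it over many read files with a multiprocessing pool. The pool size, file list (`fast5_files`), `references` dict, and all parameter values below are illustrative assumptions, not the project's actual driver.

# Hypothetical driver; every argument value here is a placeholder.
import functools
import multiprocessing

worker = functools.partial(
    chunk_remap_worker, trim=(10, 10), min_prob=1e-5, kmer_len=5,
    prior=(25.0, 25.0), slip=5.0, chunk_len=500, use_scaled=False,
    normalisation='per-read', min_length=1200, section='template',
    segmentation='Segmentation', references=references)

with multiprocessing.Pool(4) as pool:
    results = [r for r in pool.imap(worker, fast5_files) if r is not None]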
Example #7
    def test_067_write_raw_data(self):
        tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))
        with Fast5.New(tmp_file,
                       'a',
                       channel_id=self.tmp_channel_id,
                       tracking_id=self.tmp_tracking_id) as h:
            h.set_raw(self.tmp_raw, meta=self.tmp_read_id, read_number=1)

        with self.assertRaises(TypeError):
            with Fast5.New(tmp_file,
                           'a',
                           channel_id=self.tmp_channel_id,
                           tracking_id=self.tmp_tracking_id) as h:
                h.set_raw(self.tmp_raw.astype(float),
                          meta=self.tmp_read_id,
                          read_number=1)
Example #8
    def test_065_write_int_read_float_data(self):
        tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))

        with Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id) as h:
            h.set_read(self.tmp_events_int, self.tmp_read_id)

        with Fast5(tmp_file) as h:
            events = h.get_read()
            self.assertEqual(events['start'].dtype.descr[0][1], '<f8',
                'Writing uint data did not give float data on read.'
            )
            actual = events['start'][0]
            expected = self.tmp_events_float['start'][0]
            self.assertEqual(actual, expected,
                'Write uint, data on read not scaled correctly, got {} not {}'.format(
                    actual, expected
                )
            )

        os.unlink(tmp_file)
Example #9
def reference_extraction_worker(file_name, section):
    with Fast5(file_name) as file_handle:
        try:
            fasta = file_handle.get_reference_fasta(section=section)
        except Exception as e:
            sys.stderr.write('No reference found for {}.\n{}\n'.format(file_name, repr(e)))
            return None

        iowrapper = StringIO(fasta)
        read_ref = str(next(SeqIO.parse(iowrapper, 'fasta')).seq)
        return (file_name, read_ref)
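
A hedged sketch of how this worker's output might be collected into the `references` dict that the remap workers in these examples index by short filename (Example #3 shows `filename_short` is the basename without its extension); the directory glob and 'template' section are illustrative assumptions.

# Hypothetical collection loop; the reads directory is a placeholder.
import glob
import os

references = {}
for fname in glob.glob('reads/*.fast5'):
    result = reference_extraction_worker(fname, 'template')
    if result is None:
        continue
    file_name, read_ref = result
    short = os.path.splitext(os.path.basename(file_name))[0]
    references[short] = read_ref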
Example #10
    def setUp(self):
        self.h = Fast5(
            os.path.join(os.path.dirname(__file__), 'data', self.test_file))

        self.additional_h = Fast5(
            os.path.join(os.path.dirname(__file__), 'data',
                         self.additional_file))

        # Use to create new temp files
        self.tmp_events_float = np.array(
            [(0.0, 1.0, 10.0, 2.0)],
            dtype=[(x, 'float') for x in ['start', 'length', 'mean', 'stdv']])
        self.tmp_events_int = np.array([(0, 5000, 10.0, 2.0)],
                                       dtype=[('start', 'uint32'),
                                              ('length', 'uint32'),
                                              ('mean', 'float'),
                                              ('stdv', 'float')])
        self.tmp_raw = np.ones(15, dtype=np.int16)

        self.tmp_channel_id = {
            'channel_number': 1,
            'range': 1.0,
            'digitisation': 1.0,
            'offset': 0.0,
            'sample_rate': 5000.0,
            'sampling_rate': 5000.0
        }
        self.tmp_read_id = {
            'start_time': 0.0,
            'duration': 1.0,
            'read_number': 1,
            'start_mux': 1,
            'read_id': str(uuid4()),
            'scaling_used': 1,
            'median_before': 0
        }
        self.tmp_tracking_id = {
            'exp_start_time': '1970-01-01T00:00:00Z',
            'run_id': str(uuid4()).replace('-', ''),
            'flow_cell_id': 'FAH00000',
        }
Example #11
def fetch_HMM_fn(paths):
  mypath_raw = paths[0]
  mypath_data = paths[1]
  for file in os.listdir(mypath_raw):
    # generate the file name strings
    fast5_fn = os.fsdecode(file)
    result_chiron_fn = mypath_data + '/result/' + fast5_fn[:-5] + 'fastq'  # the base sequence
    raw_fn = mypath_raw + '/' + fast5_fn  # raw reads/inputs file
    # load the data objects to yield
    for record in SeqIO.parse(result_chiron_fn, 'fastq'):
      base_seq = record.seq  # only one sequence per file
    with Fast5(raw_fn) as f5:
      raw = f5.get_read(raw=True)
    yield fast5_fn, base_seq, raw
Example #12
File: basecall.py Project: rrwick/sloika
def events_worker(fast5_file_name,
                  section,
                  segmentation,
                  trim,
                  kmer_len,
                  transducer,
                  bad,
                  min_prob,
                  alphabet=DEFAULT_ALPHABET,
                  skip=5.0,
                  trans=None):
    """ Worker function for basecall_network.py for basecalling from events

    This worker uses the global variable `calc_post`, which is set by
    init_worker. `calc_post` is an unpickled compiled sloika model that
    is used to calculate a posterior matrix over states.

    :param section: part of read to basecall, 'template' or 'complement'
    :param segmentation: location of segmentation analysis for extracting target read section
    :param trim: (int, int) events to remove from read beginning and end
    :param kmer_len, min_prob, transducer, bad, trans, skip: see `decode_post`
    :param fast5_file_name: filename for single-read fast5 file with event detection and segmentation
    """
    from sloika import features
    try:
        with Fast5(fast5_file_name) as f5:
            ev = f5.get_section_events(section, analysis=segmentation)
            sn = f5.filename_short
    except Exception as e:
        sys.stderr.write(
            "Error getting events for section {!r} in file {}\n{!r}\n".format(
                section, fast5_file_name, e))
        return None

    ev = util.trim_array(ev, *trim)
    if ev.size == 0:
        sys.stderr.write("Read too short in file {}\n".format(fast5_file_name))
        return None

    inMat = features.from_events(ev, tag='')[:, None, :]
    score, call = decode_post(calc_post(inMat),
                              kmer_len,
                              transducer,
                              bad,
                              min_prob,
                              skip,
                              trans,
                              nbase=len(alphabet))

    return sn, score, call, inMat.shape[0]
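
The docstring refers to a `calc_post` global populated by `init_worker`; below is a minimal sketch of that pattern, assuming the compiled model can simply be unpickled from a file. This is an illustration of the idea, not sloika's actual initialiser.

# Hypothetical per-process initialiser; the real sloika code may differ.
import pickle

calc_post = None  # set once in each worker process


def init_worker(model_file):
    """Unpickle the compiled model and expose it as the module-level calc_post."""
    global calc_post
    with open(model_file, 'rb') as fh:
        calc_post = pickle.load(fh)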
Example #13
File: basecall.py Project: rrwick/sloika
def raw_worker(fast5_file_name,
               trim,
               open_pore_fraction,
               kmer_len,
               transducer,
               bad,
               min_prob,
               alphabet=DEFAULT_ALPHABET,
               skip=5.0,
               trans=None):
    """ Worker function for basecall_network.py for basecalling from raw data

    This worker uses the global variable `calc_post`, which is set by
    init_worker. `calc_post` is an unpickled compiled sloika model that
    is used to calculate a posterior matrix over states.

    :param open_pore_fraction: maximum allowed fraction of signal length to
        trim due to classification as open pore signal
    :param trim: (int, int) events to remove from read beginning and end
    :param kmer_len, min_prob, transducer, bad, trans, skip: see `decode_post`
    :param fast5_file_name: filename for single-read fast5 file with raw data
    """
    from sloika import batch, config
    try:
        with Fast5(fast5_file_name) as f5:
            signal = f5.get_read(raw=True)
            sn = f5.filename_short
    except Exception as e:
        sys.stderr.write("Error getting raw data for file {}\n{!r}\n".format(
            fast5_file_name, e))
        return None

    signal = batch.trim_open_pore(signal, open_pore_fraction)
    signal = util.trim_array(signal, *trim)
    if signal.size == 0:
        sys.stderr.write("Read too short in file {}\n".format(fast5_file_name))
        return None

    inMat = (signal - np.median(signal)) / mad(signal)
    inMat = inMat[:, None, None].astype(config.sloika_dtype)
    score, call = decode_post(calc_post(inMat),
                              kmer_len,
                              transducer,
                              bad,
                              min_prob,
                              skip,
                              trans,
                              nbase=len(alphabet))

    return sn, score, call, inMat.shape[0]
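
The `mad` call above is not defined in the snippet; its usual meaning is the median absolute deviation used to scale the signal. A plain sketch of that statistic follows, though sloika's own helper (and any scaling constant it applies) may differ.

# Assumed definition of mad(); the library version may include a scale factor.
import numpy as np


def mad(x):
    """Median absolute deviation of a 1D array."""
    x = np.asarray(x)
    return np.median(np.abs(x - np.median(x)))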
Example #14
    def setUpClass(self):
        """Create a read fast5 from scratch with previously simulated mapping and basecall 1D data"""
        print('* Fast5 Basecaller and Mapper')

        self.seq = 'CATTACGCATTTACCGAAACCTGGGCAAA'
        self.qstring = '!'*len(self.seq)
        self.model_file = 'example_template.model'
        self.events_file = 'example_template.events'
        self.bc_scale_file = 'example_template.bc_scale'
        self.bc_path_file = 'example_template.bc_path'
        self.map_scale_file = 'example_template.map_scale'
        self.map_path_file = 'example_template.map_path'
        self.map_post_file = 'example_template.map_post'
        self.ref_name = 'test_seq'

        # Open new file
        header = ['channel_number', 'offset', 'range', 'digitisation', 'sampling_rate']
        channel_id = {x:0 for x in header}
        tracking_id = {
            'exp_start_time': '1970-01-01T00:00:00Z',
            'run_id': 'a'*32,
            'flow_cell_id': 'FAH00000',
        }
        fakefile = tempfile.NamedTemporaryFile()
        self.fh = Fast5.New(fakefile.name, channel_id=channel_id, tracking_id=tracking_id, read='a')

        # load data to set within fast5 file
        self.model = np.genfromtxt(self.get_file_path(self.model_file), dtype=None, delimiter='\t', names=True)

        self.model['kmer'] = self.model['kmer'].astype(str)

        self.events = np.genfromtxt(self.get_file_path(self.events_file), dtype=None, delimiter='\t', names=True)

        # use namedtuple to imitate a Scale object
        Scale = namedtuple('Scale', ['shift', 'scale', 'drift', 'var', 'scale_sd', 'var_sd'])

        bc_scale = Scale(*np.genfromtxt(self.get_file_path(self.bc_scale_file), dtype=None, delimiter='\t'))
        bc_path = np.genfromtxt(self.get_file_path(self.bc_path_file), dtype=np.int32, delimiter='\t')

        self.fh.set_basecall_data(self.events, bc_scale, bc_path, self.model, self.seq)

        map_scale = Scale(*np.genfromtxt(self.get_file_path(self.map_scale_file), dtype=None, delimiter='\t'))
        map_path = np.genfromtxt(self.get_file_path(self.map_path_file), dtype=np.int32, delimiter='\t')
        map_post = np.genfromtxt(self.get_file_path(self.map_post_file), delimiter='\t')

        n_states = len(self.seq) - len(self.model['kmer'][0]) + 1
        self.fh.set_mapping_data(self.events, map_scale, map_path, self.model, self.seq, self.ref_name)
        self.fh.set_mapping_data(self.events, map_scale, map_path, self.model, self.seq, self.ref_name, post=map_post)
Example #15
def raw_chunk_worker(fn, chunk_len, kmer_len, min_length, trim, normalisation,
                     downsample_factor, interpolation=False):
    """ Worker for creating labelled features from raw data

    :param fn: A filename to read from.
    :param chunk_len: Length of each chunk
    :param kmer_len: Kmer length for training
    :param min_length: Minimum number of samples before read can be considered.
    :param trim: Tuple (beginning, end) of number of samples to trim from read.
    :param normalisation: Normalisation method [per-chunk | per-read | none]
    :param downsample_factor: factor by which to downsample labels
    :param interpolation: interpolate sequence positions between those in
        mapping table
    """
    try:
        with Fast5(fn) as f5:
            mapping_table, att = f5.get_any_mapping_data('template')
            sig = f5.get_read(raw=True)
            sample_rate = f5.sample_rate
            start_sample = f5.get_read(raw=True, group=True).attrs['start_time']
    except Exception as e:
        sys.stderr.write('Failed to get mapping data from {}.\n{}\n'.format(fn, repr(e)))
        return None

    mapping_table = convert_mapping_times_to_samples(mapping_table, start_sample, sample_rate)
    map_start = mapping_table['start'][0] + trim[0]
    map_end = mapping_table['start'][-1] + mapping_table['length'][-1] - trim[1]
    mapped_signal, mapping_table = trim_signal_and_mapping(sig, mapping_table, map_start, map_end)

    try:
        assert mapping_table_is_registered(mapped_signal, mapping_table)
    except Exception as e:
        sys.stderr.write('Failed to properly register raw signal and mapping table in {}.\n{}\n'.format(fn, repr(e)))
        return None

    if len(mapped_signal) < max(chunk_len, min_length):
        sys.stderr.write('{} is too short.\n'.format(fn))
        return None

    new_inMat, sig_labels, sig_bad = raw_chunkify(mapped_signal, mapping_table, chunk_len, kmer_len, normalisation,
                                                  downsample_factor, interpolation, att)

    return (np.ascontiguousarray(new_inMat),
            np.ascontiguousarray(sig_labels),
            np.ascontiguousarray(sig_bad))
Example #16
def create_fast5(raw_data, fast5_filename):
    raw_data = np.array(raw_data)
    # create fast5 (from https://nanoporetech.github.io/fast5_research/examples.html)
    # example of how to digitize data
    start, stop = int(min(raw_data - 1)), int(max(raw_data + 1))
    rng = stop - start
    digitisation = 8192.0
    bins = np.arange(start, stop, rng / digitisation)
    # np.int16 is required, the library will refuse to write anything else
    raw_data_binned = np.digitize(raw_data, bins).astype(np.int16)

    # The following are required meta data
    channel_id = {
        'digitisation': digitisation,
        'offset': 0,
        'range': rng,
        'sampling_rate': 4000,
        'channel_number': 1,
    }
    read_id = {
        'start_time': 0,
        'duration': len(raw_data),
        'read_number': 1,
        'start_mux': 1,
        'read_id': str(uuid4()),
        'scaling_used': 1,
        'median_before': 0,
    }
    tracking_id = {
        'exp_start_time': '1970-01-01T00:00:00Z',
        'run_id': str(uuid4()).replace('-', ''),
        'flow_cell_id': 'FAH00000',
    }
    context_tags = {}

    with Fast5.New(fast5_filename,
                   'w',
                   tracking_id=tracking_id,
                   context_tags=context_tags,
                   channel_id=channel_id) as h:
        h.set_raw(raw_data_binned, meta=read_id, read_number=1)
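
A short usage sketch, assuming the module-level imports used by create_fast5 (numpy, uuid4, Fast5) are in scope; the synthetic signal and output path are made up for illustration.

# Hypothetical call; signal values and the filename are placeholders.
import numpy as np

simulated_signal = np.random.normal(loc=100.0, scale=10.0, size=4000)
create_fast5(simulated_signal, '/tmp/example_read.fast5')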
Example #17
def raw_chunk_remap_worker(fn, trim, min_prob, kmer_len, min_length,
                           prior, slip, chunk_len, normalisation, downsample_factor,
                           interpolation, open_pore_fraction, references):
    """ Worker function for `chunkify raw_remap` remapping reads using raw signal"""
    try:
        with Fast5(fn) as f5:
            signal = f5.get_read(raw=True)
            sn = f5.filename_short
    except Exception as e:
        sys.stderr.write('Failure reading events from {}.\n{}\n'.format(fn, repr(e)))
        return None

    try:
        read_ref = references[sn]
    except Exception as e:
        sys.stderr.write('No reference found for {}.\n{}\n'.format(fn, repr(e)))
        return None

    signal = batch.trim_open_pore(signal, open_pore_fraction)
    signal = util.trim_array(signal, *trim)

    if len(signal) < max(chunk_len, min_length):
        sys.stderr.write('{} is too short.\n'.format(fn))
        return None

    try:
        (score, mapping_table, path, seq) = raw_remap(read_ref, signal, min_prob, kmer_len, prior, slip)
    except Exception as e:
        sys.stderr.write("Failure remapping read {}.\n{}\n".format(sn, repr(e)))
        return None
    # mapping_attrs required if using interpolation
    mapping_attrs = {
        'reference': read_ref,
        'direction': '+',
        'ref_start': 0,
    }
    (chunks, labels, bad_ev) = raw_chunkify(signal, mapping_table, chunk_len, kmer_len, normalisation,
                                            downsample_factor, interpolation, mapping_attrs)

    return sn + '.fast5', score, len(mapping_table), path, seq, chunks, labels, bad_ev
Example #18
def digitize_write(raw_data, read_id, params):
    digitisation = 8192.0
    start, stop = int(min(raw_data - 1)), int(max(raw_data + 1))
    rng = stop - start
    bins = np.arange(start, stop, rng / digitisation)
    # np.int16 is required, the library will refuse to write anything else
    # raw_data = np.digitize(raw_data, bins).astype(np.int16)
    raw_data = np.round(raw_data)
    raw_data = raw_data.astype(np.int16)
    filename = params.fast5_path + read_id + '.fast5'
    # The following are required meta data
    channel_id = {
        'digitisation': digitisation,
        'offset': 0,
        'range': rng,
        'sampling_rate': 4000,
        'channel_number': 1,
    }
    read_id = {
        'start_time': 0,
        'duration': len(raw_data),
        'read_number': 1,
        'start_mux': 1,
        'read_id': str(read_id),
        'scaling_used': 1,
        'median_before': 0,
    }
    tracking_id = {
        'exp_start_time': '1970-01-01T00:00:00Z',
        'run_id': str(uuid4()).replace('-', ''),
        'flow_cell_id': 'FAH00000',
    }
    context_tags = {}
    with Fast5.New(filename,
                   'w',
                   tracking_id=tracking_id,
                   context_tags=context_tags,
                   channel_id=channel_id) as h:
        h.set_raw(raw_data, meta=read_id, read_number=1)
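
A hedged usage sketch: `params` only needs a `fast5_path` attribute here, so a namedtuple stands in for it; the signal, read id, and output directory are illustrative.

# Hypothetical call; the params object and all values are placeholders.
from collections import namedtuple
from uuid import uuid4

import numpy as np

Params = namedtuple('Params', ['fast5_path'])
params = Params(fast5_path='/tmp/')
signal = np.random.normal(loc=500.0, scale=30.0, size=4000)
digitize_write(signal, str(uuid4()), params)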
Example #19
def chunk_worker(fn, section, chunk_len, kmer_len, min_length, trim,
                 use_scaled, normalisation):
    """ Chunkifies data for training

    :param fn: A filename to read from
    :param section: Section of read to process (template / complement)
    :param chunk_len: Length of each chunk
    :param kmer_len: Kmer length for training
    :param min_length: Minimum number of events before read can be considered
    :param trim: Tuple (beginning, end) of number of events to trim from read
    :param use_scaled: Use prescaled event statistics
    :param normalisation: Type of normalisation to perform

    :returns: A tuple containing a 3D :class:`ndarray` of size
    (X, chunk_len, nfeatures) containing the features for the batch,
    a 2D :class:`ndarray` of size (X, chunk_len) containing the
    associated labels, and a 2D :class:`ndarray` of size (X, chunk_len)
    indicating bad events.  1 <= X <= batch_size.
    """
    # Import within worker to avoid initialising GPU in main thread
    import sloika.features

    try:
        with Fast5(fn) as f5:
            ev, _ = f5.get_any_mapping_data(section)
    except Exception as e:
        sys.stderr.write('Failed to get mapping data from {}.\n{}\n'.format(
            fn, repr(e)))
        return None

    ev = trim_ends_and_filter(ev, trim, min_length, chunk_len)
    if ev is None:
        sys.stderr.write('{} is too short.\n'.format(fn))
        return None

    return chunkify(ev, chunk_len, kmer_len, use_scaled, normalisation)
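
A hedged sketch of stacking the per-read output into one training set, following the return shape described in the docstring; the file names, parameter values, and the 'per-read' normalisation string are illustrative assumptions.

# Hypothetical batching loop; files and parameters are placeholders.
import numpy as np

files = ['read1.fast5', 'read2.fast5']
results = [chunk_worker(fn, 'template', chunk_len=500, kmer_len=5,
                        min_length=1200, trim=(10, 10), use_scaled=False,
                        normalisation='per-read')
           for fn in files]
results = [r for r in results if r is not None]
if results:
    chunks = np.vstack([r[0] for r in results])
    labels = np.vstack([r[1] for r in results])
    bad_ev = np.vstack([r[2] for r in results])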
Example #20
    def test(self, relative_file_path, number_of_events, raw):
        filename = os.path.join(self.dataDir, relative_file_path)

        with Fast5(filename) as f5:
            ev = f5.get_read(raw=raw)
            self.assertEqual(len(ev), number_of_events)
Example #21
    def test(self, relative_file_path, analysis, number_of_events):
        filename = os.path.join(self.dataDir, relative_file_path)

        with Fast5(filename) as f5:
            ev = f5.get_section_events('template', analysis=analysis)
            self.assertEqual(len(ev), number_of_events)
Example #22
def fetch_classif_fn(paths):
  mypath_raw = paths[0]
  mypath_data = paths[1]
  for file in os.listdir(mypath_raw):
    # generate the three kinds of file name strings
    fast5_fn = os.fsdecode(file)
    result_chiron_fn = mypath_data + '/result/' + fast5_fn[:-5] + 'fastq'  # the base sequence
    base_seqidx_fn = mypath_data + '/' + fast5_fn[:-5] + 'signalsegidx.txt'  # the seq_idx file for interaction
    raw_fn = mypath_raw + '/' + fast5_fn  # raw reads/inputs file
    # load the data objects to yield
    for record in SeqIO.parse(result_chiron_fn, 'fastq'):
      base_seq = record.seq  # only one sequence per file
    f_seqidx = open(base_seqidx_fn, 'r')
    with Fast5(raw_fn) as f5:
      raw = f5.get_read(raw=True)
    yield fast5_fn, base_seq, f_seqidx, raw



def classif_data(paths):
  if PARAMS.write_file:
    input_fn = 'data-training/input_data.txt'
    output_fn = 'data-training/output_data.txt'
    myfile_input = open(input_fn, "w")
    myfile_output = open(output_fn, "w")
  base_k_list = list()
  raw_k_list = list()
  fn_iter = fetch_classif_fn(paths)
  for fn, base_seq, f_seqidx, raw in fn_iter: