def test_061_write_read_float_data(self):
    tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))
    with Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id,
                   tracking_id=self.tmp_tracking_id) as h:
        h.set_read(self.tmp_events_float, self.tmp_read_id)

    # Metadata duration and start_time should be integers, not floats
    with Fast5(tmp_file, 'r') as h:
        for key in ['duration', 'start_time']:
            self.assertIsInstance(h.attributes[key], int)

    with Fast5(tmp_file) as h:
        events = h.get_read()
        self.assertEqual(
            events['start'].dtype.descr[0][1], '<f8',
            'Writing float data did not give float data on read.')
        actual = events['start'][0]
        expected = self.tmp_events_float['start'][0]
        self.assertEqual(
            actual, expected,
            'Write float, data on read not scaled correctly, got {} not {}'.format(
                actual, expected))

    os.unlink(tmp_file)
def test_060_construct_new_file_checks(self):
    tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))
    with self.assertRaises(IOError):
        fh = Fast5.New(tmp_file, 'r')
        fh = Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id)
        fh = Fast5.New(tmp_file, 'a', tracking_id=self.tmp_tracking_id)

    # This should be fine
    with Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id,
                   tracking_id=self.tmp_tracking_id) as h:
        h.set_read(self.tmp_events_float, self.tmp_read_id)
def test_filename_short(self):
    basename = 'read6'
    filename = os.path.join(self.dataDir, 'reads', basename + '.fast5')
    with Fast5(filename) as f5:
        sn = f5.filename_short
    self.assertEqual(sn, basename)
def test_unknown(self):
    basename = 'read6'
    filename = os.path.join(self.dataDir, 'reads', basename + '.fast5')
    with Fast5(filename) as f5:
        ev, _ = f5.get_any_mapping_data('template')
    self.assertEqual(len(ev), 10750)
def setUp(self):
    self.h = Fast5(os.path.join(
        os.path.dirname(__file__), 'data', self.test_file
    ))

    # Use to create new temp files
    self.tmp_events_float = np.array(
        [(0.0, 1.0, 10.0, 2.0)],
        dtype=[(x, 'float') for x in ['start', 'length', 'mean', 'stdv']]
    )
    self.tmp_events_int = np.array(
        [(0, 5000, 10.0, 2.0)],
        dtype=[
            ('start', 'uint32'), ('length', 'uint32'),
            ('mean', 'float'), ('stdv', 'float')
        ]
    )
    self.tmp_channel_id = {
        'channel_number': 1,
        'range': 1.0,
        'digitisation': 1.0,
        'offset': 0.0,
        'sample_rate': 5000.0,
        'sampling_rate': 5000.0
    }
    self.tmp_read_id = {
        'start_time': 0.0,
        'duration': 1.0,
        'read_number': 1,
        'start_mux': 1,
        'read_id': str(uuid4()),
        'scaling_used': 1
    }
def chunk_remap_worker(fn, trim, min_prob, kmer_len, prior, slip, chunk_len,
                       use_scaled, normalisation, min_length, section,
                       segmentation, references):
    try:
        with Fast5(fn) as f5:
            sn = f5.filename_short
            try:
                ev = f5.get_section_events(section, analysis=segmentation)
            except ValueError:
                ev = f5.get_basecall_data(section)
    except Exception as e:
        sys.stderr.write('Failure reading events from {}.\n{}\n'.format(fn, repr(e)))
        return None

    try:
        read_ref = references[sn]
    except Exception as e:
        sys.stderr.write('No reference found for {}.\n{}\n'.format(fn, repr(e)))
        return None

    ev = trim_ends_and_filter(ev, trim, min_length, chunk_len)
    if ev is None:
        sys.stderr.write('{} is too short.\n'.format(fn))
        return None

    (score, ev, path, seq) = remap(read_ref, ev, min_prob, kmer_len, prior, slip)
    (chunks, labels, bad_ev) = chunkify(ev, chunk_len, kmer_len, use_scaled, normalisation)

    return sn + '.fast5', score, len(ev), path, seq, chunks, labels, bad_ev
def test_067_write_raw_data(self):
    tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))
    with Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id,
                   tracking_id=self.tmp_tracking_id) as h:
        h.set_raw(self.tmp_raw, meta=self.tmp_read_id, read_number=1)

    with self.assertRaises(TypeError):
        with Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id,
                       tracking_id=self.tmp_tracking_id) as h:
            h.set_raw(self.tmp_raw.astype(float), meta=self.tmp_read_id, read_number=1)
def test_065_write_int_read_float_data(self):
    tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))
    with Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id) as h:
        h.set_read(self.tmp_events_int, self.tmp_read_id)

    with Fast5(tmp_file) as h:
        events = h.get_read()
        self.assertEqual(
            events['start'].dtype.descr[0][1], '<f8',
            'Writing uint data did not give float data on read.')
        actual = events['start'][0]
        expected = self.tmp_events_float['start'][0]
        self.assertEqual(
            actual, expected,
            'Write uint, data on read not scaled correctly, got {} not {}'.format(
                actual, expected))

    os.unlink(tmp_file)
def reference_extraction_worker(file_name, section):
    with Fast5(file_name) as file_handle:
        try:
            fasta = file_handle.get_reference_fasta(section=section)
        except Exception as e:
            sys.stderr.write('No reference found for {}.\n{}\n'.format(file_name, repr(e)))
            return None

        iowrapper = StringIO(fasta)
        read_ref = str(next(SeqIO.parse(iowrapper, 'fasta')).seq)
        return (file_name, read_ref)
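# Hedged usage sketch (not from the original source): one way the worker above
# could be fanned out over many single-read files to build the `references`
# dict consumed by the remapping workers elsewhere in this file. The pool size
# and the helper name `build_reference_dict` are assumptions for illustration.
def build_reference_dict(fast5_files, section='template'):
    from functools import partial
    from multiprocessing import Pool
    with Pool(4) as pool:
        results = pool.map(partial(reference_extraction_worker, section=section),
                           fast5_files)
    # workers return (file_name, reference) tuples, or None on failure
    return dict(r for r in results if r is not None)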
def setUp(self):
    self.h = Fast5(os.path.join(
        os.path.dirname(__file__), 'data', self.test_file))
    self.additional_h = Fast5(os.path.join(
        os.path.dirname(__file__), 'data', self.additional_file))

    # Use to create new temp files
    self.tmp_events_float = np.array(
        [(0.0, 1.0, 10.0, 2.0)],
        dtype=[(x, 'float') for x in ['start', 'length', 'mean', 'stdv']])
    self.tmp_events_int = np.array(
        [(0, 5000, 10.0, 2.0)],
        dtype=[('start', 'uint32'), ('length', 'uint32'),
               ('mean', 'float'), ('stdv', 'float')])
    self.tmp_raw = np.ones(15, dtype=np.int16)
    self.tmp_channel_id = {
        'channel_number': 1,
        'range': 1.0,
        'digitisation': 1.0,
        'offset': 0.0,
        'sample_rate': 5000.0,
        'sampling_rate': 5000.0
    }
    self.tmp_read_id = {
        'start_time': 0.0,
        'duration': 1.0,
        'read_number': 1,
        'start_mux': 1,
        'read_id': str(uuid4()),
        'scaling_used': 1,
        'median_before': 0
    }
    self.tmp_tracking_id = {
        'exp_start_time': '1970-01-01T00:00:00Z',
        'run_id': str(uuid4()).replace('-', ''),
        'flow_cell_id': 'FAH00000',
    }
def fetch_HMM_fn(paths):
    mypath_raw = paths[0]
    mypath_data = paths[1]
    for file in os.listdir(mypath_raw):
        # generate the three kinds of file name strings
        fast5_fn = os.fsdecode(file)
        result_chiron_fn = mypath_data + '/result/' + fast5_fn[:-5] + 'fastq'  # the base sequence
        raw_fn = mypath_raw + '/' + fast5_fn  # raw reads/inputs file
        # generate three data objects iterator
        for record in SeqIO.parse(result_chiron_fn, 'fastq'):
            base_seq = record.seq  # only one sequence per file
        with Fast5(raw_fn) as f5:
            raw = f5.get_read(raw=True)
        yield fast5_fn, base_seq, raw
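# Hedged usage sketch (not from the original source): iterating the generator
# above over a pair of directories. The helper name and print statement are
# assumptions; the generator expects Chiron-style `<read>.fastq` results under
# `<data_dir>/result/` and the matching `.fast5` files under `<raw_dir>`.
def example_iterate_hmm_data(raw_dir, data_dir):
    for fast5_fn, base_seq, raw in fetch_HMM_fn((raw_dir, data_dir)):
        print(fast5_fn, len(base_seq), len(raw))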
def events_worker(fast5_file_name, section, segmentation, trim, kmer_len,
                  transducer, bad, min_prob, alphabet=DEFAULT_ALPHABET,
                  skip=5.0, trans=None):
    """ Worker function for basecall_network.py for basecalling from events

    This worker uses the global variable `calc_post`, which is set by
    init_worker. `calc_post` is an unpickled compiled sloika model that is
    used to calculate a posterior matrix over states.

    :param section: part of read to basecall, 'template' or 'complement'
    :param segmentation: location of segmentation analysis for extracting
        target read section
    :param trim: (int, int) events to remove from read beginning and end
    :param kmer_len, min_prob, transducer, bad, trans, skip: see `decode_post`
    :param fast5_file_name: filename for single-read fast5 file with event
        detection and segmentation
    """
    from sloika import features

    try:
        with Fast5(fast5_file_name) as f5:
            ev = f5.get_section_events(section, analysis=segmentation)
            sn = f5.filename_short
    except Exception as e:
        sys.stderr.write(
            "Error getting events for section {!r} in file {}\n{!r}\n".format(
                section, fast5_file_name, e))
        return None

    ev = util.trim_array(ev, *trim)
    if ev.size == 0:
        sys.stderr.write("Read too short in file {}\n".format(fast5_file_name))
        return None

    inMat = features.from_events(ev, tag='')[:, None, :]
    score, call = decode_post(calc_post(inMat), kmer_len, transducer, bad,
                              min_prob, skip, trans, nbase=len(alphabet))

    return sn, score, call, inMat.shape[0]
def raw_worker(fast5_file_name, trim, open_pore_fraction, kmer_len, transducer,
               bad, min_prob, alphabet=DEFAULT_ALPHABET, skip=5.0, trans=None):
    """ Worker function for basecall_network.py for basecalling from raw data

    This worker uses the global variable `calc_post`, which is set by
    init_worker. `calc_post` is an unpickled compiled sloika model that is
    used to calculate a posterior matrix over states.

    :param open_pore_fraction: maximum allowed fraction of signal length to
        trim due to classification as open pore signal
    :param trim: (int, int) events to remove from read beginning and end
    :param kmer_len, min_prob, transducer, bad, trans, skip: see `decode_post`
    :param fast5_file_name: filename for single-read fast5 file with raw data
    """
    from sloika import batch, config

    try:
        with Fast5(fast5_file_name) as f5:
            signal = f5.get_read(raw=True)
            sn = f5.filename_short
    except Exception as e:
        sys.stderr.write("Error getting raw data for file {}\n{!r}\n".format(
            fast5_file_name, e))
        return None

    signal = batch.trim_open_pore(signal, open_pore_fraction)
    signal = util.trim_array(signal, *trim)
    if signal.size == 0:
        sys.stderr.write("Read too short in file {}\n".format(fast5_file_name))
        return None

    inMat = (signal - np.median(signal)) / mad(signal)
    inMat = inMat[:, None, None].astype(config.sloika_dtype)
    score, call = decode_post(calc_post(inMat), kmer_len, transducer, bad,
                              min_prob, skip, trans, nbase=len(alphabet))

    return sn, score, call, inMat.shape[0]
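# Hedged sketch (not from the original source) of the `init_worker` that the
# two worker docstrings above refer to. It only illustrates how the global
# `calc_post` could be populated from a pickled compiled model before a
# multiprocessing pool starts handing files to events_worker/raw_worker; the
# real sloika initializer may differ in name and signature.
def example_init_worker(model_file):
    global calc_post
    import pickle
    with open(model_file, 'rb') as fh:
        calc_post = pickle.load(fh)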
def setUpClass(self):
    """Create a read fast5 from scratch with previously simulated mapping
    and basecall 1D data"""
    print('* Fast5 Basecaller and Mapper')

    self.seq = 'CATTACGCATTTACCGAAACCTGGGCAAA'
    self.qstring = '!' * len(self.seq)
    self.model_file = 'example_template.model'
    self.events_file = 'example_template.events'
    self.bc_scale_file = 'example_template.bc_scale'
    self.bc_path_file = 'example_template.bc_path'
    self.map_scale_file = 'example_template.map_scale'
    self.map_path_file = 'example_template.map_path'
    self.map_post_file = 'example_template.map_post'
    self.ref_name = 'test_seq'

    # Open new file
    header = ['channel_number', 'offset', 'range', 'digitisation', 'sampling_rate']
    channel_id = {x: 0 for x in header}
    tracking_id = {
        'exp_start_time': '1970-01-01T00:00:00Z',
        'run_id': 'a' * 32,
        'flow_cell_id': 'FAH00000',
    }
    fakefile = tempfile.NamedTemporaryFile()
    self.fh = Fast5.New(fakefile.name, channel_id=channel_id,
                        tracking_id=tracking_id, read='a')

    # load data to set within fast5 file
    self.model = np.genfromtxt(self.get_file_path(self.model_file),
                               dtype=None, delimiter='\t', names=True)
    self.model['kmer'] = self.model['kmer'].astype(str)
    self.events = np.genfromtxt(self.get_file_path(self.events_file),
                                dtype=None, delimiter='\t', names=True)

    # use namedtuple to imitate a Scale object
    Scale = namedtuple('Scale', ['shift', 'scale', 'drift', 'var', 'scale_sd', 'var_sd'])

    bc_scale = Scale(*np.genfromtxt(self.get_file_path(self.bc_scale_file),
                                    dtype=None, delimiter='\t'))
    bc_path = np.genfromtxt(self.get_file_path(self.bc_path_file),
                            dtype=np.int32, delimiter='\t')
    self.fh.set_basecall_data(self.events, bc_scale, bc_path, self.model, self.seq)

    map_scale = Scale(*np.genfromtxt(self.get_file_path(self.map_scale_file),
                                     dtype=None, delimiter='\t'))
    map_path = np.genfromtxt(self.get_file_path(self.map_path_file),
                             dtype=np.int32, delimiter='\t')
    map_post = np.genfromtxt(self.get_file_path(self.map_post_file), delimiter='\t')
    n_states = len(self.seq) - len(self.model['kmer'][0]) + 1
    self.fh.set_mapping_data(self.events, map_scale, map_path, self.model,
                             self.seq, self.ref_name)
    self.fh.set_mapping_data(self.events, map_scale, map_path, self.model,
                             self.seq, self.ref_name, post=map_post)
def raw_chunk_worker(fn, chunk_len, kmer_len, min_length, trim, normalisation,
                     downsample_factor, interpolation=False):
    """ Worker for creating labelled features from raw data

    :param fn: A filename to read from.
    :param chunk_len: Length of each chunk
    :param kmer_len: Kmer length for training
    :param min_length: Minimum number of samples before read can be considered.
    :param trim: Tuple (beginning, end) of number of samples to trim from read.
    :param normalisation: Normalisation method [per-chunk | per-read | none]
    :param downsample_factor: factor by which to downsample labels
    :param interpolation: interpolate sequence positions between those in
        mapping table
    """
    try:
        with Fast5(fn) as f5:
            mapping_table, att = f5.get_any_mapping_data('template')
            sig = f5.get_read(raw=True)
            sample_rate = f5.sample_rate
            start_sample = f5.get_read(raw=True, group=True).attrs['start_time']
    except Exception as e:
        sys.stderr.write('Failed to get mapping data from {}.\n{}\n'.format(fn, repr(e)))
        return None

    mapping_table = convert_mapping_times_to_samples(mapping_table, start_sample, sample_rate)
    map_start = mapping_table['start'][0] + trim[0]
    map_end = mapping_table['start'][-1] + mapping_table['length'][-1] - trim[1]
    mapped_signal, mapping_table = trim_signal_and_mapping(sig, mapping_table,
                                                           map_start, map_end)

    try:
        assert mapping_table_is_registered(mapped_signal, mapping_table)
    except Exception as e:
        sys.stderr.write('Failed to properly register raw signal and mapping table '
                         'in {}.\n{}\n'.format(fn, repr(e)))
        return None

    if len(mapped_signal) < max(chunk_len, min_length):
        sys.stderr.write('{} is too short.\n'.format(fn))
        return None

    new_inMat, sig_labels, sig_bad = raw_chunkify(mapped_signal, mapping_table,
                                                  chunk_len, kmer_len, normalisation,
                                                  downsample_factor, interpolation, att)

    return (np.ascontiguousarray(new_inMat),
            np.ascontiguousarray(sig_labels),
            np.ascontiguousarray(sig_bad))
def create_fast5(raw_data, fast5_filename):
    raw_data = np.array(raw_data)
    # create fast5 (from https://nanoporetech.github.io/fast5_research/examples.html)
    # example of how to digitize data
    start, stop = int(min(raw_data - 1)), int(max(raw_data + 1))
    rng = stop - start
    digitisation = 8192.0
    bins = np.arange(start, stop, rng / digitisation)
    # np.int16 is required; the library will refuse to write anything else
    raw_data_binned = np.digitize(raw_data, bins).astype(np.int16)

    # The following are required meta data
    channel_id = {
        'digitisation': digitisation,
        'offset': 0,
        'range': rng,
        'sampling_rate': 4000,
        'channel_number': 1,
    }
    read_id = {
        'start_time': 0,
        'duration': len(raw_data),
        'read_number': 1,
        'start_mux': 1,
        'read_id': str(uuid4()),
        'scaling_used': 1,
        'median_before': 0,
    }
    tracking_id = {
        'exp_start_time': '1970-01-01T00:00:00Z',
        'run_id': str(uuid4()).replace('-', ''),
        'flow_cell_id': 'FAH00000',
    }
    context_tags = {}

    with Fast5.New(fast5_filename, 'w', tracking_id=tracking_id,
                   context_tags=context_tags, channel_id=channel_id) as h:
        h.set_raw(raw_data_binned, meta=read_id, read_number=1)
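# Hedged usage sketch (not from the original source): writing a synthetic
# current trace with create_fast5 above. The helper name, signal shape and
# default output path are illustrative assumptions only.
def example_write_synthetic_read(out_path='synthetic_read.fast5'):
    # a fake 1-second trace at 4 kHz; any 1D sequence of numbers should do
    fake_signal = 100.0 + 10.0 * np.sin(np.linspace(0.0, 20.0, 4000))
    create_fast5(fake_signal, out_path)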
def raw_chunk_remap_worker(fn, trim, min_prob, kmer_len, min_length, prior, slip,
                           chunk_len, normalisation, downsample_factor,
                           interpolation, open_pore_fraction, references):
    """ Worker function for `chunkify raw_remap` remapping reads using raw signal"""
    try:
        with Fast5(fn) as f5:
            signal = f5.get_read(raw=True)
            sn = f5.filename_short
    except Exception as e:
        sys.stderr.write('Failure reading events from {}.\n{}\n'.format(fn, repr(e)))
        return None

    try:
        read_ref = references[sn]
    except Exception as e:
        sys.stderr.write('No reference found for {}.\n{}\n'.format(fn, repr(e)))
        return None

    signal = batch.trim_open_pore(signal, open_pore_fraction)
    signal = util.trim_array(signal, *trim)
    if len(signal) < max(chunk_len, min_length):
        sys.stderr.write('{} is too short.\n'.format(fn))
        return None

    try:
        (score, mapping_table, path, seq) = raw_remap(read_ref, signal, min_prob,
                                                      kmer_len, prior, slip)
    except Exception as e:
        sys.stderr.write("Failure remapping read {}.\n{}\n".format(sn, repr(e)))
        return None

    # mapping_attrs required if using interpolation
    mapping_attrs = {
        'reference': read_ref,
        'direction': '+',
        'ref_start': 0,
    }
    (chunks, labels, bad_ev) = raw_chunkify(signal, mapping_table, chunk_len, kmer_len,
                                            normalisation, downsample_factor,
                                            interpolation, mapping_attrs)

    return sn + '.fast5', score, len(mapping_table), path, seq, chunks, labels, bad_ev
def digitize_write(raw_data, read_id, params):
    digitisation = 8192.0
    start, stop = int(min(raw_data - 1)), int(max(raw_data + 1))
    rng = stop - start
    bins = np.arange(start, stop, rng / digitisation)
    # np.int16 is required; the library will refuse to write anything else
    # raw_data = np.digitize(raw_data, bins).astype(np.int16)
    raw_data = np.round(raw_data)
    raw_data = raw_data.astype(np.int16)

    filename = params.fast5_path + read_id + '.fast5'

    # The following are required meta data
    channel_id = {
        'digitisation': digitisation,
        'offset': 0,
        'range': rng,
        'sampling_rate': 4000,
        'channel_number': 1,
    }
    read_id = {
        'start_time': 0,
        'duration': len(raw_data),
        'read_number': 1,
        'start_mux': 1,
        'read_id': str(read_id),
        'scaling_used': 1,
        'median_before': 0,
    }
    tracking_id = {
        'exp_start_time': '1970-01-01T00:00:00Z',
        'run_id': str(uuid4()).replace('-', ''),
        'flow_cell_id': 'FAH00000',
    }
    context_tags = {}

    with Fast5.New(filename, 'w', tracking_id=tracking_id,
                   context_tags=context_tags, channel_id=channel_id) as h:
        h.set_raw(raw_data, meta=read_id, read_number=1)
def chunk_worker(fn, section, chunk_len, kmer_len, min_length, trim, use_scaled,
                 normalisation):
    """ Chunkifies data for training

    :param fn: A filename to read from
    :param section: Section of read to process (template / complement)
    :param chunk_len: Length of each chunk
    :param kmer_len: Kmer length for training
    :param min_length: Minimum number of events before read can be considered
    :param trim: Tuple (beginning, end) of number of events to trim from read
    :param use_scaled: Use prescaled event statistics
    :param normalisation: Type of normalisation to perform

    :yields: A tuple containing a 3D :class:`ndarray` of size
        (X, chunk_len, nfeatures) containing the features for the batch,
        a 2D :class:`ndarray` of size (X, chunk_len) containing the
        associated labels, and a 2D :class:`ndarray` of size (X, chunk_len)
        indicating bad events. 1 <= X <= batch_size.
    """
    # Import within worker to avoid initialising GPU in main thread
    import sloika.features

    try:
        with Fast5(fn) as f5:
            ev, _ = f5.get_any_mapping_data(section)
    except Exception as e:
        sys.stderr.write('Failed to get mapping data from {}.\n{}\n'.format(fn, repr(e)))
        return None

    ev = trim_ends_and_filter(ev, trim, min_length, chunk_len)
    if ev is None:
        sys.stderr.write('{} is too short.\n'.format(fn))
        return None

    return chunkify(ev, chunk_len, kmer_len, use_scaled, normalisation)
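# Hedged usage sketch (not from the original source): mapping chunk_worker over
# a list of mapped reads with a multiprocessing pool. The helper name and all
# parameter values are assumptions chosen only to show the calling convention.
def example_chunk_reads(fast5_files):
    from multiprocessing import Pool
    # section, chunk_len, kmer_len, min_length, trim, use_scaled, normalisation
    fixed_args = ('template', 500, 5, 1200, (50, 10), False, 'per-read')
    with Pool() as pool:
        results = pool.starmap(chunk_worker, [(fn,) + fixed_args for fn in fast5_files])
    # discard reads that failed to load or were too short
    return [r for r in results if r is not None]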
def test(self, relative_file_path, number_of_events, raw):
    filename = os.path.join(self.dataDir, relative_file_path)
    with Fast5(filename) as f5:
        ev = f5.get_read(raw=raw)
    self.assertEqual(len(ev), number_of_events)
def test(self, relative_file_path, analysis, number_of_events):
    filename = os.path.join(self.dataDir, relative_file_path)
    with Fast5(filename) as f5:
        ev = f5.get_section_events('template', analysis=analysis)
    self.assertEqual(len(ev), number_of_events)
def fetch_classif_fn(paths):
    mypath_raw = paths[0]
    mypath_data = paths[1]
    for file in os.listdir(mypath_raw):
        # generate the three kinds of file name strings
        fast5_fn = os.fsdecode(file)
        result_chiron_fn = mypath_data + '/result/' + fast5_fn[:-5] + 'fastq'  # the base sequence
        base_seqidx_fn = mypath_data + '/' + fast5_fn[:-5] + 'signalsegidx.txt'  # the seq_idx file for interaction
        raw_fn = mypath_raw + '/' + fast5_fn  # raw reads/inputs file
        # generate three data objects iterator
        for record in SeqIO.parse(result_chiron_fn, 'fastq'):
            base_seq = record.seq  # only one sequence per file
        f_seqidx = open(base_seqidx_fn, 'r')
        with Fast5(raw_fn) as f5:
            raw = f5.get_read(raw=True)
        yield fast5_fn, base_seq, f_seqidx, raw


def classif_data(paths):
    if PARAMS.write_file:
        input_fn = 'data-training/input_data.txt'
        output_fn = 'data-training/output_data.txt'
        myfile_input = open(input_fn, "w")
        myfile_output = open(output_fn, "w")
    base_k_list = list()
    raw_k_list = list()
    fn_iter = fetch_classif_fn(paths)
    for fn, base_seq, f_seqidx, raw in fn_iter: