def __init__(self, fast5, channel, *args, good_class='strand',
             time_warp=1, **kwargs):
    """An RPC service for replaying a channel from a .fast5 file.

    :param fast5: input filename.
    :param channel: channel to simulate.
    :param good_class: read classification name of desirable reads.
    :param time_warp: time multiplier for playback speed.

    .. note:: `args` and `kwargs` are passed to `aiozmq.rpc.AttrHandler`.
    """
    super().__init__(*args, **kwargs)
    self.fast5 = fast5
    self.channel = channel
    self.good_class = good_class
    self.time_warp = time_warp
    self.logger = logging.getLogger(
        'Replay Channel {}'.format(channel.zfill(4)))

    with BulkFast5(self.fast5) as fh:
        self.sample_rate = fh.sample_rate
        self.time_base = 1.0 / self.sample_rate
        self.reads = [x for x in fh.get_reads(self.channel)]
    self.read_starts = np.ascontiguousarray(
        [x['read_start'] for x in self.reads])
    self.reset_time()
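# A minimal construction sketch (the file name, channel and the enclosing
# class name `ReplayChannel` are placeholder assumptions, not taken from
# this module):
#
#   replay = ReplayChannel('run_bulk.fast5', '1', good_class='strand',
#                          time_warp=2)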
def get_events(self, n_events=400):
    """Return events from the start of the current read.

    :param n_events: maximum number of events to return.

    :returns: Serialized event data, see :class:`Fast5Data`.
    """
    self.logger.debug("Request for events at {}".format(self.current_sample))
    read = self.reads[self.current_read]
    if read['classification'] != self.good_class:
        return None
    else:
        start = int(read['event_index_start'])
        end = min(self.current_event, start + n_events)
        self.logger.debug(
            "Fetching events [{}, {}] for read {} starting at {}. "
            "Current sample is {}.".format(
                start, end, self.current_read,
                int(self.sample_offset + read['read_start']),
                self.current_sample))
        with BulkFast5(self.fast5) as fh:
            events = fh.get_events(self.channel, event_indices=[start, end])
        return Fast5Data(
            events, info=str(read['read_id']),
            start=int(self.sample_offset + read['read_start']),
            end=int(self.sample_offset + read['read_start'] +
                    read['read_length']))
def get_raw(self, seconds=1, delay=0.5):
    """Return raw data from the start of the current read.

    :param seconds: maximum number of seconds of raw data to return.
    :param delay: seconds of raw data to skip at the start of the read.

    :returns: Serialized raw data, see :class:`Fast5Data`.
    """
    self.logger.debug("Request for raw at {}".format(self.current_sample))
    read = self.reads[self.current_read]
    if read['classification'] != self.good_class:
        return None
    else:
        start = int(read['read_start'] + self.sample_rate * delay)
        end = int(min(self.current_sample,
                      start + self.sample_rate * seconds))
        if end <= start:
            return None
        self.logger.debug(
            "Fetching raw [{}, {}] for read {} starting at {}. "
            "Current sample is {}.".format(
                start, end, self.current_read,
                int(self.sample_offset + read['read_start']),
                self.current_sample))
        with BulkFast5(self.fast5) as fh:
            raw = fh.get_raw(self.channel, raw_indices=[start, end])
        return Fast5Data(
            raw, info=str(read['read_id']),
            start=int(self.sample_offset + read['read_start']),
            end=int(self.sample_offset + read['read_start'] +
                    read['read_length']))
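# Usage sketch for the two fetchers above (`replay` stands for an instance
# of the enclosing handler class; `process` is a placeholder). Both return
# None when the current read is not of the desired class, so callers must
# check:
#
#   events = replay.get_events(n_events=100)
#   if events is not None:
#       process(events)
#   raw = replay.get_raw(seconds=2, delay=0.5)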
def reads(self): """Yield `Reads` obtained from delta splitting.""" with BulkFast5(self.fast5) as fh: # load channel, tracking and context meta so we don't need fast5 # later to e.g. write fast5 files. self.load_fast5_meta(fh) events = fh.get_events(self.channel) bounds = read_bounds_from_delta(events['mean'], delta=self.delta, look_back_n=self.look_back) for event_indices in bounds: read_events = None if self.with_events: read_events = events[event_indices[0]:event_indices[1]] read_events = self._convert_event_fields( read_events, fh.sample_rate) meta = { 'start_time': read_events[0]['start'], 'duration': read_events[-1]['start'] + read_events[-1]['length'] - read_events[0]['start'], 'num_events': event_indices[1] - event_indices[0], 'start_event': event_indices[0], 'end_event': event_indices[1], } self._add_channel_states(fh, meta) if set(meta.keys()) != set(self.meta_keys): extra = set(meta.keys()) - set(self.meta_keys) missing = set(self.meta_keys) - set(meta.keys()) raise ValueError( '{} about to yield read with unexpected metrics. ' 'Extra: {}. Missing {}.'.format( self.__class__, extra, missing)) read_raw = None if self.with_raw: read_raw = fh.get_raw( self.channel, times=(meta['start_time'], meta['start_time'] + meta['duration']), use_scaling=False) if meta['start_time'] > self.max_time: raise StopIteration yield Read(events=read_events, raw=read_raw, meta=meta, channel_meta=self.channel_meta, context_meta=self.context_meta, tracking_meta=self.tracking_meta)
def reads(self):
    """Yield `Read` chunks of fixed event-count intervals."""
    with BulkFast5(self.fast5) as fh:
        # load channel, tracking and context meta so we don't need the fast5
        # later to e.g. write fast5 files.
        self.load_fast5_meta(fh)
        for chunk in iterators.blocker(fh.get_events(self.channel),
                                       self.interval):
            yield Read(events=chunk)
def current_event(self):
    """Index of the current event."""
    prev = self._current_event
    with BulkFast5(self.fast5) as fh:
        event_path = fh.__event_data__.format(self.channel)
        # search only events after the previously known position; event
        # start times are sorted, so searchsorted gives the offset of the
        # first event beginning at or after the current sample.
        new = np.searchsorted(fh[event_path][prev:]['start'],
                              self.current_sample)
    self._current_event = int(new + prev)
    return self._current_event
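# The searchsorted idiom above, demonstrated with plain numpy (a standalone
# sketch with made-up numbers, not data from this module):
#
#   import numpy as np
#   starts = np.array([0, 100, 250, 600])  # sample index where each event begins
#   current_sample = 300
#   idx = int(np.searchsorted(starts, current_sample))  # -> 3
#
# Searching only the slice `[prev:]` and adding `prev` back keeps repeated
# lookups cheap as the current sample advances monotonically.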
def _get_levels(self, outpath, prefix):
    """Calculate the distribution of event means, and infer the open-pore
    level and capture level.

    Assumes the pore level corresponds to the highest-probability peak in
    the distribution, and that the capture level is the second highest.

    :param outpath: directory in which to plot the distribution and levels.
    :param prefix: prefix for the output plot path.

    :returns: tuple of floats, (pore_level, capture_level).
    """
    with BulkFast5(self.fast5) as fh:
        events = fh.get_events(self.channel)
    # silverman bandwidth is seemingly better for multi-modal distributions
    kde = gaussian_kde(events['mean'], bw_method='silverman')
    x = np.linspace(np.min(events['mean']), np.max(events['mean']), 100)
    pde_vals = kde(x)  # evaluate density over grid
    max_inds = argrelmax(pde_vals)  # find all local maxima
    max_probs = pde_vals[max_inds]
    sorted_inds = np.argsort(max_probs)
    # indices of the two most probable maxima within x and pde_vals
    max_ind = max_inds[0][sorted_inds[-1]]
    second_max_ind = max_inds[0][sorted_inds[-2]]
    pore_level = x[max_ind]
    capture_level = x[second_max_ind]

    # plot kde, histogram and levels.
    fig, axis = plt.subplots()
    axis.hist(events['mean'], bins=100, color='k', label='histogram')
    axis.legend(loc='upper center', frameon=False)
    axis2 = axis.twinx()
    axis2.plot(x, pde_vals, label='kde', color='k')
    axis2.plot(x[max_inds], pde_vals[max_inds], 'o',
               label='local maxima', color='b')
    axis2.plot(x[max_ind], pde_vals[max_ind], 'o',
               label='open pore current', color='r')
    axis2.plot(x[second_max_ind], pde_vals[second_max_ind], 'o',
               label='capture current', color='g')
    axis2.legend(loc='upper left', frameon=False)
    plot_path = os.path.join(
        outpath, add_prefix('AdaptiveThresholdLevels', prefix))
    plt.savefig(plot_path, bbox_inches='tight', dpi=200)
    return pore_level, capture_level
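# The peak-ranking technique above, as a self-contained sketch on synthetic
# data (all numbers are illustrative assumptions, not real channel data):
#
#   import numpy as np
#   from scipy.stats import gaussian_kde
#   from scipy.signal import argrelmax
#
#   rng = np.random.RandomState(42)
#   sample = np.concatenate([rng.normal(220, 5, 5000),    # open-pore current
#                            rng.normal(80, 10, 2000)])   # capture current
#   kde = gaussian_kde(sample, bw_method='silverman')
#   x = np.linspace(sample.min(), sample.max(), 100)
#   density = kde(x)
#   peaks = argrelmax(density)[0]                  # indices of local maxima
#   ranked = peaks[np.argsort(density[peaks])[::-1]]
#   pore_level, capture_level = x[ranked[0]], x[ranked[1]]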
def reads(self): """Yield `Reads` with various meta data provided by MinKnow.""" if self.with_events or self.with_raw or self.with_states: f5 = BulkFast5(self.fast5) # load channel, tracking and context meta so we don't need fast5 # later to e.g. write fast5 files. self.load_fast5_meta(f5) else: # initialise fast5/channel meta variables so we have a generic call # to Read constructor even when we don't have the fast5 self.channel_meta = None self.context_meta = None self.tracking_meta = None if self.with_events: if set(['start_event', 'end_event']).issubset(self.meta_keys): get_events = lambda meta: f5.get_events( self.channel, event_indices=(meta['start_event'], meta['end_event'])) else: logger.warn('Reading events using timings, this will be slow.') get_events = lambda meta: f5.get_events( self.channel, times=(meta['start_time'], meta['start_time'] + meta[ 'duration'])) for meta in self.iterate_input_file(): read_events = None if self.with_events: read_events = get_events(meta) read_events = self._convert_event_fields( read_events, f5.sample_rate) read_raw = None if self.with_raw: read_raw = f5.get_raw( self.channel, times=(meta['start_time'], meta['start_time'] + meta['duration']), use_scaling=False) if self.with_states: # add mux, channel states from fast5 self._add_channel_states(f5, meta) if meta['start_time'] > self.max_time: raise StopIteration yield Read(events=read_events, raw=read_raw, meta=meta, channel_meta=self.channel_meta, context_meta=self.context_meta, tracking_meta=self.tracking_meta) if self.with_events or self.with_raw or self.with_states: f5.close()
def extract_channel_reads(source, output, prefix, flat, by_id, channel):
    if flat:
        out_path = output
    else:
        out_path = os.path.join(output, str(channel))
        os.makedirs(out_path)

    with BulkFast5(source) as src:
        raw_data = src.get_raw(channel, use_scaling=False)
        meta = src.get_metadata(channel)
        tracking_id = src.get_tracking_meta()
        context_tags = src.get_context_meta()
        channel_id = {
            'channel_number': channel,
            'range': meta['range'],
            'digitisation': meta['digitisation'],
            'offset': meta['offset'],
            # both spellings are populated with the same value
            'sample_rate': meta['sample_rate'],
            'sampling_rate': meta['sample_rate']
        }

        median_before = None
        counter = 1
        for read_number, read in enumerate(src.get_reads(channel)):
            if median_before is None:
                median_before = read['median']
                continue
            if read['classification'] != 'strand':
                median_before = read['median']
            else:
                counter += 1
                start, length = read['read_start'], read['read_length']
                read_id = {
                    'start_time': read['read_start'],
                    'duration': read['read_length'],
                    'read_number': read_number,
                    'start_mux': src.get_mux(channel, raw_index=start,
                                             wells_only=True),
                    'read_id': read['read_id'],
                    'scaling_used': 1,
                    'median_before': median_before
                }
                raw_slice = raw_data[start:start + length]
                if by_id:
                    filename = '{}.fast5'.format(read['read_id'])
                else:
                    filename = '{}_read_ch{}_file{}.fast5'.format(
                        prefix, channel, read_number)
                filename = os.path.join(out_path, filename)
                with Fast5.New(filename, 'a', tracking_id=tracking_id,
                               context_tags=context_tags,
                               channel_id=channel_id) as h:
                    h.set_raw(raw_slice, meta=read_id,
                              read_number=read_number)
    return counter, channel
def extract_read_summary():
    logging.basicConfig(
        format='[%(asctime)s - %(name)s] %(message)s',
        datefmt='%H:%M:%S', level=logging.INFO)
    logger = logging.getLogger('Summarize Reads')
    parser = argparse.ArgumentParser(
        description='Summarize reads stored in a bulk .fast5.')
    parser.add_argument('input', help='Bulk .fast5 file for input.')
    parser.add_argument('output', help='Output text file.')
    parser.add_argument('--channel_range', nargs=2, type=int, default=None,
                        help='Channel range (inclusive).')
    args = parser.parse_args()

    if args.channel_range is None:
        with BulkFast5(args.input) as src:
            channels = src.channels
    else:
        channels = range(args.channel_range[0], args.channel_range[1] + 1)

    with BulkFast5(args.input) as src, open(args.output, 'w') as out_fh:
        extract_read_summary_internal(src, channels, out_fh, logger)
def main(args=None):
    logging.basicConfig(format='[%(asctime)s - %(name)s] %(message)s',
                        datefmt='%H:%M:%S', level=logging.INFO)
    if args is None:
        args = sys.argv[1:]
    # process args and get a yaml string of all options
    args, yaml_conf_out = process_args(args)
    logging.debug('args are: {}'.format(args))
    logger.info("Will stop after {} seconds of expt time.".format(
        args['max_time']))

    # Multiple components will write here
    if 'outpath' in args and args['outpath'] is not None:
        os.mkdir(args['outpath'])

    # save the config
    if args['config_out'] is not None:
        path = add_prefix(args['config_out'], args['prefix'])
        if 'outpath' in args and args['outpath'] is not None:
            path = os.path.join(args['outpath'], path)
        with open(path, 'w') as fh:
            fh.write(yaml_conf_out)

    # Get the channel range from the bulk file if it was not specified
    if args['channels'] is None:
        with BulkFast5(args['fast5'], 'r') as f5:
            args['channels'] = list(f5.channels)
    else:
        args['channels'] = list(args['channels'])

    # Test that the pipeline can be constructed
    read_generator, metric_calculator, classifier, second_stage = get_pipeline(
        args, args['channels'][0])
    logger.info('Splitter    : {}.'.format(read_generator))
    logger.info('Metrifier   : {}.'.format(metric_calculator))
    logger.info('Classifier  : {}.'.format(classifier))
    logger.info('SecondStage : {}.'.format(second_stage))

    # Accumulators gather results from individual channels
    accumulators = get_accumulators(args)
    logger.info('Accumulators: {}.'.format(accumulators))

    for read_metrics in accumulate_channels(args):
        for accumulator in accumulators:
            accumulator.process_read(read_metrics)

    # Finish up the accumulators
    for accumulator in accumulators:
        accumulator.finalize()
def reads(self):
    with BulkFast5(self.fast5) as fh:
        # load channel, tracking and context meta so we don't need the fast5
        # later to e.g. write fast5 files.
        self.load_fast5_meta(fh)
        events = fh.get_events(self.channel)
        # convert event fields 'start' and 'length' from raw indices into times
        for col in ['start', 'length']:
            times = events[col] / fh.sample_rate
            events = drop_fields(events, col, usemask=False)
            events = append_fields(events, col, times, usemask=False)
        # boundaries are events at which the mean crosses the threshold
        read_bound_event_indices = np.where(
            np.ediff1d((events['mean'] < self.threshold).astype(int)) != 0)[0]
        # the first event should be the start of the first read
        read_bound_event_indices = np.insert(
            read_bound_event_indices + 1, 0, 0)
        # pad the end with the final event index
        read_bound_event_indices = np.append(read_bound_event_indices,
                                             len(events) - 1)
        for start_event, next_start_event in iterators.window(
                read_bound_event_indices, 2):
            start_t = events[start_event]['start']
            end_t = events[next_start_event]['start']
            # slice out the events belonging to this read
            read_events = events[start_event:next_start_event]
            meta = {
                'start_time': start_t,
                'duration': end_t - start_t,
                'pore_level': self.pore_level,
                'capture_level': self.capture_level,
                'threshold': self.threshold,
            }
            self._add_channel_states(fh, meta)
            read_raw = None
            if self.with_raw:
                read_raw = fh.get_raw(self.channel, times=(start_t, end_t),
                                      use_scaling=False)
            if meta['start_time'] > self.max_time:
                # raising StopIteration inside a generator is an error under
                # PEP 479; return to end iteration.
                return
            yield Read(events=read_events, raw=read_raw, meta=meta,
                       channel_meta=self.channel_meta,
                       context_meta=self.context_meta,
                       tracking_meta=self.tracking_meta)
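# The boundary detection above, shown on a toy signal (values are made up):
#
#   import numpy as np
#   means = np.array([200., 210., 90., 85., 205., 208.])  # event means, pA
#   threshold = 150.0
#   below = (means < threshold).astype(int)      # [0, 0, 1, 1, 0, 0]
#   flips = np.where(np.ediff1d(below) != 0)[0]  # -> [1, 3]
#
# Adding 1 to each flip index (as done above) gives the first event of each
# new read: events 2 and 4 here.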
def extract_single_reads():
    logging.basicConfig(
        format='[%(asctime)s - %(name)s] %(message)s',
        datefmt='%H:%M:%S', level=logging.INFO)
    logger = logging.getLogger('Extract Reads')
    parser = argparse.ArgumentParser(
        description='Bulk .fast5 to single read .fast5 conversion.')
    parser.add_argument('input', help='Bulk .fast5 file for input.')
    parser.add_argument('output', help='Output folder.')
    parser.add_argument('--flat', default=False, action='store_true',
                        help='Create all .fast5 files in one directory.')
    parser.add_argument('--by_id', default=False, action='store_true',
                        help='Name single-read .fast5 files by read_id.')
    parser.add_argument('--prefix', default='read', help='Read file prefix.')
    parser.add_argument('--channel_range', nargs=2, type=int, default=None,
                        help='Channel range (inclusive).')
    parser.add_argument('--workers', type=int, default=4,
                        help='Number of worker processes.')
    args = parser.parse_args()

    if not os.path.exists(args.output):
        os.makedirs(args.output)
    else:
        raise IOError('The output directory must not exist.')

    worker = functools.partial(
        extract_channel_reads,
        args.input, args.output, args.prefix, args.flat, args.by_id)

    if args.channel_range is None:
        with BulkFast5(args.input) as src:
            channels = src.channels
    else:
        channels = range(args.channel_range[0], args.channel_range[1] + 1)

    if args.workers > 1:
        with ProcessPoolExecutor(args.workers) as executor:
            futures = [executor.submit(worker, c) for c in channels]
            for future in as_completed(futures):
                try:
                    n_reads, channel = future.result()
                except Exception:
                    logger.warning("Error processing channel.")
                else:
                    logger.info("Extracted {} reads from channel {}.".format(
                        n_reads, channel))
    else:
        for channel in channels:
            worker(channel)
    logger.info("Finished.")
def _process_reads(self, reads, metrics):
    if self.with_events or self.with_raw or self.with_states:
        f5 = BulkFast5(self.fast5)
        # load channel, tracking and context meta so we don't need the fast5
        # later to e.g. write fast5 files.
        self.load_fast5_meta(f5)

    reads_queue = []
    for read, meta in zip(reads, metrics):
        # TODO: at the moment we don't use the read objects as it's simpler
        # just to load what we want from the fast5, but we could imagine
        # combining the events, raw, state_changes and mux_changes from
        # individual reads into a new read object, so we don't need a fast5.
        logger.debug('Read: channel {} mux {} time {} class {}'.format(
            meta['channel'], meta['mux'], meta['start_time'], meta['class']))

        # If we have accumulated any reads, yield them when the well_id has
        # changed. Note that unblock_voltage_1 and common_voltage_1 both
        # enumerate to 1, so this will not stop us joining up blocks
        # interrupted by flicks.
        if len(reads_queue) > 0 and meta['mux'] != reads_queue[-1]['mux']:
            logger.debug('Detected change in mux, yielding existing reads')
            yield self._create_read_obj(reads_queue, f5)
            reads_queue = []  # prepare for the next grouping of reads.

        # If the new read is not a block: first yield any accumulated reads,
        # then yield this read on its own.
        if meta['class'] in self.non_block_classes:
            if len(reads_queue) > 0:
                logger.debug(
                    'We have a non-block class, yielding existing reads')
                yield self._create_read_obj(reads_queue, f5)
                reads_queue = []  # prepare for the next grouping of reads.
            logger.debug('We have a non-block class, yielding single read')
            yield self._create_read_obj([meta], f5)
        else:
            # this is a block, so append it to the queue
            reads_queue.append(meta)

    if len(reads_queue) > 0:
        # we have reached the end of the run, yield the final block
        yield self._create_read_obj(reads_queue, f5)

    if self.with_events or self.with_raw or self.with_states:
        f5.close()
def _get_levels(self, outpath, prefix, times=None, pore_rank=0,
                capture_rank=1, thresh_factor=0.9):
    """Calculate the distribution of event means, and infer the open-pore
    level and capture level.

    Assumes the pore level corresponds to the highest-probability peak in
    the distribution, and that the capture level is the second highest.

    :param outpath: directory in which to plot the distribution and levels.
    :param prefix: prefix for the output plot path.
    :param times: (start time, end time) or None.
    :param pore_rank: int, ranking of the pore current within the kde local
        maxima; the default corresponds to the highest-probability peak.
    :param capture_rank: int, ranking of the capture current within the kde
        local maxima; the default corresponds to the second-highest peak.
    :param thresh_factor: float, factor f with which to calculate the
        boundary threshold;
        threshold = capture_level + f * (pore_level - capture_level);
        a value of 0.5 implies the midpoint between pore and capture.

    :returns: tuple of floats, (pore_level, capture_level, threshold).
    """
    with BulkFast5(self.fast5) as fh:
        logger.info('Loading events for channel {}'.format(self.channel))
        events = fh.get_events(self.channel, times=times)
    logger.info('Calculating kde for channel {}'.format(self.channel))
    # silverman bandwidth is seemingly better for multi-modal distributions
    kde = gaussian_kde(events['mean'], bw_method='silverman')
    logger.info('Done calculating kde for channel {}'.format(self.channel))
    x = np.linspace(np.min(events['mean']), np.max(events['mean']), 100)
    pde_vals = kde(x)  # evaluate density over grid
    max_inds = argrelmax(pde_vals)  # find all local maxima
    max_probs = pde_vals[max_inds]
    sorted_inds = np.argsort(max_probs)[::-1]  # so max prob is the 1st elem
    pore_ind = max_inds[0][sorted_inds[pore_rank]]
    capture_ind = max_inds[0][sorted_inds[capture_rank]]
    pore_level = x[pore_ind]
    capture_level = x[capture_ind]
    threshold = capture_level + thresh_factor * (pore_level - capture_level)

    # plot kde, histogram and levels.
    fig, axis = plt.subplots()
    axis.hist(events['mean'], bins=100, color='k', label='histogram')
    axis.legend(loc='upper center', frameon=False)
    axis.set_xlim((-100, 400))
    axis2 = axis.twinx()
    axis2.plot(x, pde_vals, label='kde', color='k')
    axis2.plot(x[max_inds], pde_vals[max_inds], 'o',
               label='local maxima', color='b')
    axis2.plot(x[pore_ind], pde_vals[pore_ind], 'o',
               label='open pore current', color='r')
    axis2.plot(x[capture_ind], pde_vals[capture_ind], 'o',
               label='capture current', color='g')
    axis.axvline(threshold, label='threshold', color='magenta')
    axis2.legend(loc='upper left', frameon=False)
    plot_path = os.path.join(
        outpath,
        add_prefix('AdaptiveThresholdLevels_{}'.format(self.channel), prefix))
    plt.savefig(plot_path, bbox_inches='tight', dpi=200)

    with open(plot_path + '.txt', 'w') as fh:
        fh.write('#pore rank {}\n'.format(pore_rank))
        fh.write('#capture rank {}\n'.format(capture_rank))
        fh.write('#thresh_factor {}\n'.format(thresh_factor))
        fh.write('#pore level {}\n'.format(pore_level))
        fh.write('#capture level {}\n'.format(capture_level))
        fh.write('#threshold level {}\n'.format(threshold))
        # write the local maxima in the kde distribution
        fh.write('# probability maxima in kde\n')
        fh.write('\t'.join(['pA', 'kde']) + '\n')
        for i in range(len(max_probs)):
            j = max_inds[0][sorted_inds[i]]
            fh.write('\t'.join(map(str, [x[j], pde_vals[j]])) + '\n')
        # write the sampled kde
        fh.write('# kde points\n')
        fh.write('\t'.join(['pA', 'kde']) + '\n')
        for xi, yi in zip(x, pde_vals):
            fh.write('\t'.join(map(str, [xi, yi])) + '\n')
    return pore_level, capture_level, threshold
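# Worked example of the threshold formula (illustrative numbers): with
# pore_level = 220 pA, capture_level = 80 pA and thresh_factor = 0.9,
#
#   threshold = 80 + 0.9 * (220 - 80) = 206 pA
#
# so only excursions well below the open-pore level are split into reads.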
def extract_channel_reads(source, output, prefix, flat, by_id, max_files,
                          multi, channel, summary=None):
    if flat:
        out_path = output
        # give multi-read files a channel prefix, else they will conflict
        # between channels. Single-read files already get a "ch" component
        # in their name.
        if multi:
            extra = 'ch{}'.format(channel)
            if prefix == '':
                prefix = extra
            else:
                prefix = '{}_{}'.format(prefix, extra)
    else:
        out_path = os.path.join(output, str(channel))
        os.makedirs(out_path)

    with BulkFast5(source) as src:
        meta = src.get_metadata(channel)
        tracking_id = src.get_tracking_meta()
        context_tags = src.get_context_meta()
        channel_id = {
            'channel_number': channel,
            'range': meta['range'],
            'digitisation': meta['digitisation'],
            'offset': meta['offset'],
            'sampling_rate': meta['sample_rate']
        }

        Writer = MultiWriter if multi else SingleWriter
        with Writer(out_path, by_id, prefix=prefix) as writer:
            median_before = None
            counter = 1
            raw_data = src.get_raw(channel, use_scaling=False)

            if summary is not None:
                # convert the array into a stream of dicts
                reads = ({k: row[k] for k in row.dtype.names}
                         for row in summary)
                class_field = 'class'
                start_field = 'start_time'
                duration_field = 'duration'
                # if start_time is a float (seconds) we need to convert to
                # samples
                time_cols = ['start_time', 'duration']
            else:
                reads = src.get_reads(channel)
                class_field = 'classification'
                start_field = 'read_start'
                duration_field = 'read_length'

            for read_number, read in enumerate(reads):
                if summary is not None:
                    if 'median_current_before' in read:
                        median_before = read['median_current_before']
                    else:
                        median_before = 0.0
                elif median_before is None:
                    median_before = read['median']
                    continue

                if summary is None and read[class_field] != 'strand':
                    median_before = read['median']
                else:
                    counter += 1
                    start = time_cast(read[start_field], meta['sample_rate'])
                    length = time_cast(read[duration_field],
                                       meta['sample_rate'])
                    read_id = {
                        'start_time': start,
                        'duration': length,
                        'read_number': read_number,
                        'start_mux': src.get_mux(channel, raw_index=start,
                                                 wells_only=True),
                        'read_id': (str(read['read_id'])
                                    if 'read_id' in read else str(uuid4())),
                        'scaling_used': 1,
                        'median_before': median_before
                    }
                    raw_slice = raw_data[start:start + length]
                    read = Read(read_id, read_number, tracking_id,
                                channel_id, context_tags, raw_slice)
                    writer.write_read(read)
                    if counter == max_files:
                        break
    return counter, channel
def extract_reads():
    logging.basicConfig(format='[%(asctime)s - %(name)s] %(message)s',
                        datefmt='%H:%M:%S', level=logging.INFO)
    logger = logging.getLogger('Extract Reads')
    parser = argparse.ArgumentParser(
        description='Bulk .fast5 to read .fast5 conversion.')
    parser.add_argument('input', help='Bulk .fast5 file for input.')
    parser.add_argument('output', help='Output folder.')
    out_format = parser.add_mutually_exclusive_group()
    out_format.add_argument('--multi', action='store_true',
                            help='Output multi-read files.')
    out_format.add_argument('--single', action='store_false', dest='multi',
                            help='Output single-read files.')
    parser.add_argument('--flat', default=False, action='store_true',
                        help='Create all .fast5 files in one directory.')
    parser.add_argument('--by_id', default=False, action='store_true',
                        help='Name single-read .fast5 files by read_id.')
    parser.add_argument('--prefix', default="", help='Read file prefix.')
    parser.add_argument('--channel_range', nargs=2, type=int, default=None,
                        help='Channel range (inclusive).')
    parser.add_argument(
        '--summary',
        help='Strand summary file containing at least the columns channel, '
             'start_time and duration.')
    parser.add_argument('--workers', type=int, default=4,
                        help='Number of worker processes.')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit reads per channel.')
    args = parser.parse_args()

    if not os.path.exists(args.output):
        os.makedirs(args.output)
    else:
        raise IOError('The output directory must not exist.')

    if args.summary is not None:
        if not os.path.isfile(args.summary):
            raise IOError('The summary file does not exist.')
        else:
            # load the summary
            args.summary = np.genfromtxt(args.summary, delimiter='\t',
                                         encoding=None, dtype=None,
                                         names=True)

    worker = functools.partial(extract_channel_reads, args.input, args.output,
                               args.prefix, args.flat, args.by_id, args.limit,
                               args.multi)

    if args.channel_range is None:
        with BulkFast5(args.input) as src:
            channels = src.channels
    else:
        channels = range(args.channel_range[0], args.channel_range[1] + 1)

    if args.summary is not None:
        # only process channels present in the summary
        summ_channels = set(args.summary['channel'])
        channels = [ch for ch in channels if ch in summ_channels]
        summary_by_ch = {
            ch: args.summary[np.where(args.summary['channel'] == ch)]
            for ch in channels
        }
    else:
        summary_by_ch = collections.defaultdict(lambda: None)

    if args.workers > 1:
        with ProcessPoolExecutor(args.workers) as executor:
            futures = [
                executor.submit(worker, c, summary=summary_by_ch[c])
                for c in channels
            ]
            for future in as_completed(futures):
                try:
                    n_reads, channel = future.result()
                except Exception as e:
                    logger.warning("Error processing channel: {}".format(e))
                else:
                    logger.info("Extracted {} reads from channel {}.".format(
                        n_reads, channel))
    else:
        for channel in channels:
            worker(channel, summary=summary_by_ch[channel])
    logger.info("Finished.")
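# Example invocation, assuming this function is exposed as a console script
# named `extract_reads` (the script name and file paths are assumptions):
#
#   extract_reads bulk.fast5 out_dir --multi --channel_range 1 512 --workers 8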
def reads(self): """Yield `Reads` with various meta data provided by MinKnow.""" with BulkFast5(self.fast5) as fh: # load channel, tracking and context meta so we don't need fast5 # later to e.g. write fast5 files. self.load_fast5_meta(fh) # use read classification from the penultimate block of multi-block reads for read in fh.get_reads(self.channel, penultimate_class=True): event_indices = (read['event_index_start'], read['event_index_end']) read_events = None if self.with_events: read_events = fh.get_events(self.channel, event_indices=event_indices) read_events = self._convert_event_fields( read_events, fh.sample_rate) # map new dict keys to read columns meta_keys = [('read_id', 'read_id'), ('initial_classification', 'classification'), ('median_current', 'median'), ('median_sd', 'median_sd'), ('range_current', 'range'), ('median_dwell', 'median_dwell'), ('start_time', 'read_start'), ('duration', 'read_length'), ('drift', 'drift')] meta = {key: read[col] for key, col in meta_keys} divide = ('median_dwell', 'duration', 'start_time') for name in divide: meta[name] = float(meta[name]) / fh.sample_rate meta.update({ 'num_events': event_indices[1] - event_indices[0], 'start_event': event_indices[0], 'end_event': event_indices[1], }) self._add_channel_states(fh, meta) if set(meta.keys()) != set(self.meta_keys): extra = set(meta.keys()) - set(self.meta_keys) missing = set(self.meta_keys) - set(meta.keys()) raise ValueError( '{} about to yield read with unexpected metrics. ' 'Extra: {}. Missing {}.'.format( self.__class__, extra, missing)) read_raw = None if self.with_raw: read_raw = fh.get_raw( self.channel, times=(meta['start_time'], meta['start_time'] + meta['duration']), use_scaling=False) if meta['start_time'] > self.max_time: raise StopIteration yield Read(events=read_events, raw=read_raw, meta=meta, channel_meta=self.channel_meta, context_meta=self.context_meta, tracking_meta=self.tracking_meta)