Example #1
    def __init__(self,
                 fast5,
                 channel,
                 *args,
                 good_class='strand',
                 time_warp=1,
                 **kwargs):
        """An RPC service for replaying a channel from a .fast5 file.

        :param fast5: input filename.
        :param channel: channel to simulate.
        :param good_class: read classification name of desirable reads.
        :param time_warp: time multiplier for playback speed.

        .. note:: `args` and `kwargs` are passed to `aiozmq.rpc.AttrHandler`.
        
        """
        super().__init__(*args, **kwargs)
        self.fast5 = fast5
        self.channel = channel
        self.good_class = good_class
        self.time_warp = time_warp
        self.logger = logging.getLogger('Replay Channel {}'.format(
            channel.zfill(4)))

        with BulkFast5(self.fast5) as fh:
            self.sample_rate = fh.sample_rate
            self.time_base = 1.0 / self.sample_rate
            self.reads = [x for x in fh.get_reads(self.channel)]
            self.read_starts = np.ascontiguousarray(
                [x['read_start'] for x in self.reads])

        self.reset_time()
Example #2
    def get_events(self, n_events=400):
        """Return events from the start of the current read.

        :param n_events: maximum number of events to return.

        :returns: Serialized event data, see :class:`Fast5Data`.
        """
        self.logger.debug("Request for events at {}".format(
            self.current_sample))
        read = self.reads[self.current_read]
        if read['classification'] != self.good_class:
            return None
        else:
            start = int(read['event_index_start'])
            end = min(self.current_event, start + n_events)
            self.logger.debug(
                "Fetching events [{}, {}] for read {} starting at {}. Current sample is {}."
                .format(
                    start, end, self.current_read,
                    int(self.sample_offset +
                        self.reads[self.current_read]['read_start']),
                    self.current_sample))
            with BulkFast5(self.fast5) as fh:
                events = fh.get_events(self.channel,
                                       event_indices=[start, end])
                return Fast5Data(
                    events,
                    info=str(read['read_id']),
                    start=int(self.sample_offset + read['read_start']),
                    end=int(self.sample_offset + read['read_start'] +
                            read['read_length']))
Example #3
    def get_raw(self, seconds=1, delay=0.5):
        """Return events from the start of the current read.

        :param seconds: maximum duration of raw data to return, in seconds.
        :param delay: offset from the read start, in seconds, at which to begin.

        :returns: Serialized raw data, see :class:`Fast5Data`.
        """
        self.logger.debug("Request for raw at {}".format(self.current_sample))
        read = self.reads[self.current_read]
        if read['classification'] != self.good_class:
            return None
        else:
            start = int(read['read_start'] + self.sample_rate * delay)
            end = int(
                min(self.current_sample, start + self.sample_rate * seconds))
            if end <= start:
                return None
            self.logger.debug(
                "Fetching raw [{}, {}] for read {} starting at {}. Current sample is {}."
                .format(
                    start, end, self.current_read,
                    int(self.sample_offset +
                        self.reads[self.current_read]['read_start']),
                    self.current_sample))
            with BulkFast5(self.fast5) as fh:
                raw = fh.get_raw(self.channel, raw_indices=[start, end])
                return Fast5Data(
                    raw,
                    info=str(read['read_id']),
                    start=int(self.sample_offset + read['read_start']),
                    end=int(self.sample_offset + read['read_start'] +
                            read['read_length']))
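For orientation, the raw window above starts `delay` seconds into the read and extends over at most `seconds` of data, but never past the sample that playback has reached. A small worked example with illustrative numbers (the 4 kHz sample rate is assumed, not taken from the file):

sample_rate = 4000                                        # assumed for illustration
read_start, current_sample = 100_000, 103_000

start = int(read_start + sample_rate * 0.5)               # 102_000 (0.5 s delay)
end = int(min(current_sample, start + sample_rate * 1))   # 103_000, capped by playback
assert end > start                                        # otherwise get_raw() returns None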
Example #4
    def reads(self):
        """Yield `Reads` obtained from delta splitting."""

        with BulkFast5(self.fast5) as fh:

            # load channel, tracking and context meta so we don't need fast5
            # later to e.g. write fast5 files.
            self.load_fast5_meta(fh)

            events = fh.get_events(self.channel)

            bounds = read_bounds_from_delta(events['mean'],
                                            delta=self.delta,
                                            look_back_n=self.look_back)
            for event_indices in bounds:
                read_events = None
                if self.with_events:
                    read_events = events[event_indices[0]:event_indices[1]]
                    read_events = self._convert_event_fields(
                        read_events, fh.sample_rate)

                meta = {
                    'start_time': read_events[0]['start'],
                    'duration': (read_events[-1]['start'] +
                                 read_events[-1]['length'] -
                                 read_events[0]['start']),
                    'num_events': event_indices[1] - event_indices[0],
                    'start_event': event_indices[0],
                    'end_event': event_indices[1],
                }
                self._add_channel_states(fh, meta)

                if set(meta.keys()) != set(self.meta_keys):
                    extra = set(meta.keys()) - set(self.meta_keys)
                    missing = set(self.meta_keys) - set(meta.keys())
                    raise ValueError(
                        '{} about to yield read with unexpected metrics. '
                        'Extra: {}. Missing {}.'.format(
                            self.__class__, extra, missing))
                read_raw = None
                if self.with_raw:
                    read_raw = fh.get_raw(
                        self.channel,
                        times=(meta['start_time'],
                               meta['start_time'] + meta['duration']),
                        use_scaling=False)

                if meta['start_time'] > self.max_time:
                    return  # past max_time; stop yielding reads

                yield Read(events=read_events,
                           raw=read_raw,
                           meta=meta,
                           channel_meta=self.channel_meta,
                           context_meta=self.context_meta,
                           tracking_meta=self.tracking_meta)
Example #5
 def reads(self):
     with BulkFast5(self.fast5) as fh:
         # load channel, tracking and context meta so we don't need fast5
         # later to e.g. write fast5 files.
         self.load_fast5_meta(fh)
         for chunk in iterators.blocker(fh.get_events(self.channel),
                                        self.interval):
             yield Read(events=chunk)
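`iterators.blocker` is not shown in these examples; from its use above it presumably yields fixed-size chunks of the event table. A minimal sketch of such a helper, written as an assumption rather than the project's actual implementation:

import itertools

def blocker(iterable, n):
    """Yield successive blocks of up to n items from iterable."""
    it = iter(iterable)
    while True:
        block = list(itertools.islice(it, n))
        if not block:
            return
        yield block

# e.g. list(blocker(range(7), 3)) -> [[0, 1, 2], [3, 4, 5], [6]]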
Example #6
 def current_event(self):
     """Index of the current event."""
     prev = self._current_event
     with BulkFast5(self.fast5) as fh:
         event_path = fh.__event_data__.format(self.channel)
         new = np.searchsorted(fh[event_path][prev:]['start'],
                               self.current_sample)
     self._current_event = int(new + prev)
     return self._current_event
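The property above only searches event start times from the previously known index onwards, so repeated lookups stay cheap as playback advances. The same trick in isolation, with a synthetic `starts` array and an illustrative helper name standing in for the bulk file's event data:

import numpy as np

starts = np.arange(0, 10_000, 25)  # synthetic, sorted event start samples

def advance(prev_index, current_sample):
    """Return the event index at current_sample, searching only beyond prev_index."""
    offset = np.searchsorted(starts[prev_index:], current_sample)
    return int(prev_index + offset)

idx = advance(0, 1_000)    # first lookup searches from the beginning
idx = advance(idx, 2_000)  # later lookups only scan the remaining tail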
Example #7
    def _get_levels(self, outpath, prefix):
        """Calculate distribution of event means, and infer open-pore level and  capture level.

        Assumes the pore level corresoponds to the highest-probability peak in
        the distribution, and that the capture level is the second highest.

        :param outpath: directory in which to plot the distribution and levels.
        :param prefix: prefix (prefixed to output plot path)
        :returns: tuple of floats, (pore_level, capture level)
        """
        with BulkFast5(self.fast5) as fh:
            events = fh.get_events(self.channel)

        kde = gaussian_kde(
            events['mean'], bw_method='silverman'
        )  # silverman is seemingly better for multi-modal dists
        x = np.linspace(np.min(events['mean']), np.max(events['mean']), 100)

        pde_vals = kde(x)  # evaluate density over grid
        max_inds = argrelmax(pde_vals)  # find all local maxima
        max_probs = pde_vals[max_inds]
        sorted_inds = np.argsort(max_probs)
        max_ind = max_inds[0][sorted_inds[-1]]  # index of the highest peak in x and pde_vals
        second_max_ind = max_inds[0][sorted_inds[-2]]

        pore_level = x[max_ind]
        capture_level = x[second_max_ind]

        # plot kde, histogram and levels.
        fig, axis = plt.subplots()
        axis.hist(events['mean'], bins=100, color='k', label='histogram')
        axis.legend(loc='upper center', frameon=False)
        axis2 = axis.twinx()
        axis2.plot(x, kde(x), label='kde', color='k')
        axis2.plot(x[max_inds],
                   pde_vals[max_inds],
                   'o',
                   label='local maxima',
                   color='b')
        axis2.plot(x[max_ind],
                   pde_vals[max_ind],
                   'o',
                   label='open pore current',
                   color='r')
        axis2.plot(x[second_max_ind],
                   pde_vals[second_max_ind],
                   'o',
                   label='capture current',
                   color='g')
        axis2.legend(loc='upper left', frameon=False)
        plot_path = os.path.join(outpath,
                                 add_prefix('AdaptiveThresholdLevels', prefix))
        plt.savefig(plot_path, bbox_inches='tight', dpi=200)

        return pore_level, capture_level
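The level-finding idea is independent of the bulk file: fit a kernel density estimate to the event means and take the two strongest local maxima. A self-contained sketch on synthetic data (the numbers are invented; only the technique mirrors _get_levels):

import numpy as np
from scipy.signal import argrelmax
from scipy.stats import gaussian_kde

rng = np.random.default_rng(0)
means = np.concatenate([rng.normal(220, 10, 5000),   # "open pore" mode
                        rng.normal(60, 8, 2000)])    # "capture" mode

kde = gaussian_kde(means, bw_method='silverman')
x = np.linspace(means.min(), means.max(), 100)
pde_vals = kde(x)

max_inds = argrelmax(pde_vals)[0]             # indices of the local maxima
order = np.argsort(pde_vals[max_inds])[::-1]  # strongest peak first
pore_level = x[max_inds[order[0]]]
capture_level = x[max_inds[order[1]]]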
Example #8
    def reads(self):
        """Yield `Reads` with various meta data provided by MinKnow."""

        if self.with_events or self.with_raw or self.with_states:
            f5 = BulkFast5(self.fast5)
            # load channel, tracking and context meta so we don't need fast5
            # later to e.g. write fast5 files.
            self.load_fast5_meta(f5)
        else:
            # initialise fast5/channel meta variables so we have a generic call
            # to Read constructor even when we don't have the fast5
            self.channel_meta = None
            self.context_meta = None
            self.tracking_meta = None

        if self.with_events:
            if {'start_event', 'end_event'}.issubset(self.meta_keys):
                get_events = lambda meta: f5.get_events(
                    self.channel,
                    event_indices=(meta['start_event'], meta['end_event']))
            else:
                logger.warning('Reading events using timings, this will be slow.')
                get_events = lambda meta: f5.get_events(
                    self.channel,
                    times=(meta['start_time'],
                           meta['start_time'] + meta['duration']))

        for meta in self.iterate_input_file():
            read_events = None
            if self.with_events:
                read_events = get_events(meta)
                read_events = self._convert_event_fields(
                    read_events, f5.sample_rate)
            read_raw = None
            if self.with_raw:
                read_raw = f5.get_raw(
                    self.channel,
                    times=(meta['start_time'],
                           meta['start_time'] + meta['duration']),
                    use_scaling=False)
            if self.with_states:  # add mux, channel states from fast5
                self._add_channel_states(f5, meta)

            if meta['start_time'] > self.max_time:
                return  # past max_time; stop yielding reads

            yield Read(events=read_events,
                       raw=read_raw,
                       meta=meta,
                       channel_meta=self.channel_meta,
                       context_meta=self.context_meta,
                       tracking_meta=self.tracking_meta)

        if self.with_events or self.with_raw or self.with_states:
            f5.close()
Example #9
def extract_channel_reads(source, output, prefix, flat, by_id, channel):

    if flat:
        out_path = output
    else:
        out_path = os.path.join(output, str(channel))
        os.makedirs(out_path)

    with BulkFast5(source) as src:
        raw_data = src.get_raw(channel, use_scaling=False)
        meta = src.get_metadata(channel)
        tracking_id = src.get_tracking_meta()
        context_tags = src.get_context_meta()
        channel_id = {
            'channel_number': channel,
            'range': meta['range'],
            'digitisation': meta['digitisation'],
            'offset': meta['offset'],
            'sample_rate': meta['sample_rate'],
            'sampling_rate': meta['sample_rate']
        }
        median_before = None
        counter = 1
        for read_number, read in enumerate(src.get_reads(channel)):
            if median_before is None:
                median_before = read['median']
                continue

            if read['classification'] != 'strand':
                median_before = read['median']
            else:
                counter += 1
                start, length = read['read_start'], read['read_length']
                read_id = {
                    'start_time': read['read_start'],
                    'duration': read['read_length'],
                    'read_number': read_number,
                    'start_mux': src.get_mux(channel, raw_index=start, wells_only=True),
                    'read_id': read['read_id'],
                    'scaling_used': 1,
                    'median_before': median_before
                }

                raw_slice = raw_data[start:start+length]
                if by_id:
                    filename = '{}.fast5'.format(read['read_id'])
                else:
                    filename = '{}_read_ch{}_file{}.fast5'.format(
                        prefix, channel, read_number
                    )
                filename = os.path.join(out_path, filename)
                with Fast5.New(filename, 'a', tracking_id=tracking_id, context_tags=context_tags, channel_id=channel_id) as h:
                    h.set_raw(raw_slice, meta=read_id, read_number=read_number)
    return counter, channel
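The `median_before` bookkeeping above is easy to miss: every non-strand read updates the carried-over median, so each strand read is written with the current level that immediately preceded it. The same rule in isolation, as a small illustrative generator:

def strand_reads_with_median_before(reads):
    """Yield (read, median_before) for strand reads, carrying the preceding level."""
    median_before = None
    for read in reads:
        if median_before is None or read['classification'] != 'strand':
            median_before = read['median']   # remember the level before the next strand
        else:
            yield read, median_before        # strand read: report it, keep median_before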
Example #10
def extract_read_summary():
    logging.basicConfig(
        format='[%(asctime)s - %(name)s] %(message)s',
        datefmt='%H:%M:%S', level=logging.INFO
    )
    logger = logging.getLogger('Summarize Reads')
    parser = argparse.ArgumentParser(description='Summarize reads stored in a Bulk .fast5')
    parser.add_argument('input', help='Bulk .fast5 file for input.')
    parser.add_argument('output', help='Output text file.')
    parser.add_argument('--channel_range', nargs=2, type=int, default=None, help='Channel range (inclusive).')
    args = parser.parse_args()

    if args.channel_range is None:
        with BulkFast5(args.input) as src:
            channels = src.channels
    else:
        channels = range(args.channel_range[0], args.channel_range[1] + 1)

    with BulkFast5(args.input) as src, open(args.output, 'w') as out_fh:
        extract_read_summary_internal(src, channels, out_fh, logger)
Example #11
def main(args=None):
    logging.basicConfig(format='[%(asctime)s - %(name)s] %(message)s',
                        datefmt='%H:%M:%S',
                        level=logging.INFO)

    if args is None:
        args = sys.argv[1:]

    # process args and get yaml string of all options
    args, yaml_conf_out = process_args(args)

    logging.debug('args are: {}'.format(args))

    logger.info("Will stop after {} seconds of expt time.".format(
        args['max_time']))

    # Multiple components will write here
    if 'outpath' in args and args['outpath'] is not None:
        os.mkdir(args['outpath'])

    # save the config
    if args['config_out'] is not None:
        path = add_prefix(args['config_out'], args['prefix'])
        if 'outpath' in args and args['outpath'] is not None:
            path = os.path.join(args['outpath'], path)
        with open(path, 'w') as fh:
            fh.write(yaml_conf_out)

    # Get channel range from bulk if it was not specified
    if args['channels'] is None:
        with BulkFast5(args['fast5'], 'r') as f5:
            args['channels'] = list(f5.channels)
    else:
        args['channels'] = list(args['channels'])

    # Test pipeline can be constructed
    read_generator, metric_calculator, classifier, second_stage = get_pipeline(
        args, args['channels'][0])
    logger.info('Splitter      : {}.'.format(read_generator))
    logger.info('Metrifier     : {}.'.format(metric_calculator))
    logger.info('Classifier    : {}.'.format(classifier))
    logger.info('SecondStage   : {}.'.format(second_stage))

    # Accumulators gather results from individual channels
    accumulators = get_accumulators(args)
    logger.info('Accumulators  : {}.'.format(accumulators))
    for read_metrics in accumulate_channels(args):
        for accumulator in accumulators:
            accumulator.process_read(read_metrics)

    # Finish up accumulators
    for accumulator in accumulators:
        accumulator.finalize()
Example #12
    def reads(self):
        with BulkFast5(self.fast5) as fh:

            # load channel, tracking and context meta so we don't need fast5
            # later to e.g. write fast5 files.
            self.load_fast5_meta(fh)

            events = fh.get_events(self.channel)
            # convert event fields 'start' and 'length' from raw indices into times
            for col in ['start', 'length']:
                times = events[col] / fh.sample_rate
                events = drop_fields(events, col, usemask=False)
                events = append_fields(events, col, times, usemask=False)

            read_bound_event_indices = np.where(
                np.ediff1d((events['mean'] < self.threshold).astype(int)) != 0)[0]
            # the first event should be the start of the first read
            read_bound_event_indices = np.insert(read_bound_event_indices + 1, 0, 0)
            # pad the end with the final event index
            read_bound_event_indices = np.append(read_bound_event_indices, len(events) - 1)
            for start_event, next_start_event in iterators.window(
                    read_bound_event_indices, 2):
                start_t = events[start_event]['start']
                end_t = events[next_start_event]['start']
                meta = {
                    'start_time': start_t,
                    'duration': end_t - start_t,
                    'pore_level': self.pore_level,
                    'capture_level': self.capture_level,
                    'threshold': self.threshold,
                }
                self._add_channel_states(fh, meta)
                read_events = events[start_event:next_start_event]  # events belonging to this read
                read_raw = None
                if self.with_raw:
                    read_raw = fh.get_raw(self.channel,
                                          times=(start_t, end_t),
                                          use_scaling=False)

                if meta['start_time'] > self.max_time:
                    return  # past max_time; stop yielding reads

                yield Read(events=read_events,
                           raw=read_raw,
                           meta=meta,
                           channel_meta=self.channel_meta,
                           context_meta=self.context_meta,
                           tracking_meta=self.tracking_meta)
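The splitter above places read boundaries wherever the event mean crosses the fixed threshold. The boundary detection on its own, with made-up numbers (for simplicity the final segment is closed at len(means) here, where the splitter above uses the last event index):

import numpy as np

means = np.array([230., 228., 55., 60., 58., 225., 231., 52., 50.])
threshold = 150.0

below = (means < threshold).astype(int)
bounds = np.where(np.ediff1d(below) != 0)[0] + 1  # first event of each new segment
bounds = np.insert(bounds, 0, 0)                  # the first event starts the first read
bounds = np.append(bounds, len(means))            # close the final segment

segments = list(zip(bounds[:-1], bounds[1:]))
# -> [(0, 2), (2, 5), (5, 7), (7, 9)]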
Example #13
def extract_single_reads():
    logging.basicConfig(
        format='[%(asctime)s - %(name)s] %(message)s',
        datefmt='%H:%M:%S', level=logging.INFO
    )
    logger = logging.getLogger('Extract Reads')
    parser = argparse.ArgumentParser(description='Bulk .fast5 to single read .fast5 conversion.')
    parser.add_argument('input', help='Bulk .fast5 file for input.')
    parser.add_argument('output', help='Output folder.')
    parser.add_argument('--flat', default=False, action='store_true',
                        help='Create all .fast5 files in one directory')
    parser.add_argument('--by_id', default=False, action='store_true',
                        help='Name single-read .fast5 files by read_id.')
    parser.add_argument('--prefix', default='read', help='Read file prefix.')
    parser.add_argument('--channel_range', nargs=2, type=int, default=None, help='Channel range (inclusive).')
    parser.add_argument('--workers', type=int, default=4, help='Number of worker processes.')
    args = parser.parse_args()

    if not os.path.exists(args.output):
        os.makedirs(args.output)
    else:
        raise IOError('The output directory must not exist.')

    worker = functools.partial(
        extract_channel_reads,
        args.input, args.output, args.prefix, args.flat, args.by_id,
    )

    if args.channel_range is None:
        with BulkFast5(args.input) as src:
            channels = src.channels
    else:
        channels = range(args.channel_range[0], args.channel_range[1] + 1)

    if args.workers > 1:
        with ProcessPoolExecutor(args.workers) as executor:
            futures = [executor.submit(worker, c) for c in channels]
            for future in as_completed(futures):
                try:
                    n_reads, channel = future.result()
                except Exception:
                    logger.warning("Error processing channel.")
                else:
                    logger.info("Extracted {} reads from channel {}.".format(n_reads, channel))
    else:
        for channel in channels:
            worker(channel)
    logger.info("Finished.")
Example #14
    def _process_reads(self, reads, metrics):

        if self.with_events or self.with_raw or self.with_states:
            f5 = BulkFast5(self.fast5)
            # load channel, tracking and context meta so we don't need fast5
            # later to e.g. write fast5 files.
            self.load_fast5_meta(f5)
        else:
            f5 = None  # keep the name bound for the _create_read_obj calls below

        reads_queue = []
        for read, meta in zip(reads, metrics):
            # TODO: at the moment we don't use the read objects as it's simpler
            # just to load what we want from the fast5, but we could imagine
            # combining the events, raw, state_changes and mux_changes from
            # individual reads into a new read object, so we don't need a fast5.
            logger.debug('Read: channel {} mux {} time {} class {}'.format(
                meta['channel'], meta['mux'], meta['start_time'],
                meta['class']))
            # If we have accumulated any reads, yield them if the
            # well_id has changed. Note that unblock_voltage_1 and
            # common_voltage_1 both enumerate to 1, so this will not stop us
            # joining up blocks interrupted by flicks.
            if reads_queue and meta['mux'] != reads_queue[-1]['mux']:
                logger.debug('Detected change in mux, yielding existing reads')
                yield self._create_read_obj(reads_queue, f5)
                reads_queue = []  # prepare for next grouping of reads.
            # if the new read is not a block, if we have accumulated reads,
            # yield them, then yield this read
            if meta['class'] in self.non_block_classes:
                if len(reads_queue) > 0:
                    logger.debug(
                        'We have a non-block class, yielding existing reads')
                    yield self._create_read_obj(reads_queue, f5)
                    reads_queue = []  # prepare for next grouping of reads.
                logger.debug('We have a non-block class, yielding single read')
                yield self._create_read_obj([meta], f5)
            else:  # this is a block, so append to reads
                reads_queue.append(meta)
        if reads_queue:  # if we reach the end of the run, yield the remaining block
            yield self._create_read_obj(reads_queue, f5)

        if self.with_events or self.with_raw or self.with_states:
            f5.close()
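Stripped of the fast5 access, the grouping rule above is: flush the queue whenever the mux changes, yield non-block reads on their own, and join consecutive block reads. A sketch with plain dicts and a hypothetical non_block_classes tuple (not the class's actual attribute):

def group_reads(metas, non_block_classes=('strand', 'pore')):
    """Yield lists of read metas, joining consecutive block reads on the same mux."""
    queue = []
    for meta in metas:
        if queue and meta['mux'] != queue[-1]['mux']:
            yield queue        # mux changed: flush the accumulated blocks
            queue = []
        if meta['class'] in non_block_classes:
            if queue:
                yield queue    # flush blocks gathered before this read
                queue = []
            yield [meta]       # non-block reads are yielded on their own
        else:
            queue.append(meta)
    if queue:
        yield queue            # end of run: yield the trailing block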
Example #15
    def _get_levels(self,
                    outpath,
                    prefix,
                    times=None,
                    pore_rank=0,
                    capture_rank=1,
                    thresh_factor=0.9):
        """Calculate distribution of event means, and infer open-pore level and  capture level.

        Assumes the pore level corresoponds to the highest-probability peak in
        the distribution, and that the capture level is the second highest.

        :param outpath: directory in which to plot the distribution and levels.
        :param prefix: prefix (prefixed to output plot path)
        :param times: (start time, end time) or None
        :param pore_rank: int, ranking of pore current within kde local maxima,
               defaults corresponds to highest probability peak.
        :param capture_rank: int, ranking of capture current within kde local maxima,
               defaults corresponds to second highest probability peak.
        :param thresh_factor: float, factor f with which to calculate boundary threshold;
                threshold = capture_level + f * (pore_level - capture_level)
                a value of 0.5 implies the midpoint between pore and capture.
        :returns: tuple of floats, (pore_level, capture_level, threshold)
        """
        with BulkFast5(self.fast5) as fh:
            logger.info('Loading events for channel {}'.format(self.channel))
            events = fh.get_events(self.channel, times=times)

        logger.info('Calculating kde for channel {}'.format(self.channel))
        kde = gaussian_kde(
            events['mean'], bw_method='silverman'
        )  # silverman is seemingly better for multi-modal dists
        logger.info('Done calculating kde for channel {}'.format(self.channel))
        x = np.linspace(np.min(events['mean']), np.max(events['mean']), 100)

        pde_vals = kde(x)  # evaluate density over grid
        max_inds = argrelmax(pde_vals)  # find all local maxima
        max_probs = pde_vals[max_inds]
        sorted_inds = np.argsort(max_probs)[::-1]  # so max prob is 1st elem
        pore_ind = max_inds[0][sorted_inds[pore_rank]]
        capture_ind = max_inds[0][sorted_inds[capture_rank]]

        pore_level = x[pore_ind]
        capture_level = x[capture_ind]
        threshold = capture_level + thresh_factor * (pore_level -
                                                     capture_level)

        # plot kde, histogram and levels.
        fig, axis = plt.subplots()
        axis.hist(events['mean'], bins=100, color='k', label='histogram')
        axis.legend(loc='upper center', frameon=False)
        axis.set_xlim((-100, 400))
        axis2 = axis.twinx()

        axis2.plot(x, kde(x), label='kde', color='k')
        axis2.plot(x[max_inds],
                   pde_vals[max_inds],
                   'o',
                   label='local maxima',
                   color='b')
        axis2.plot(x[pore_ind],
                   pde_vals[pore_ind],
                   'o',
                   label='open pore current',
                   color='r')
        axis2.plot(x[capture_ind],
                   pde_vals[capture_ind],
                   'o',
                   label='capture current',
                   color='g')
        axis.axvline(threshold, label='threshold', color='magenta')
        axis2.legend(loc='upper left', frameon=False)
        plot_path = os.path.join(
            outpath,
            add_prefix('AdaptiveThresholdLevels_{}'.format(self.channel), prefix))
        plt.savefig(plot_path, bbox_inches='tight', dpi=200)
        with open(plot_path + '.txt', 'w') as fh:
            fh.write('#pore rank {}\n'.format(pore_rank))
            fh.write('#capture rank {}\n'.format(capture_rank))
            fh.write('#thresh_factor {}\n'.format(thresh_factor))
            fh.write('#pore level {}\n'.format(pore_level))
            fh.write('#capture level {}\n'.format(capture_level))
            fh.write('#threshold level {}\n'.format(threshold))
            # write local maxima in kde distribution
            fh.write('# probability maxima in kde \n')
            fh.write('\t'.join(['pA', 'kde']) + '\n')
            for i in range(len(max_probs)):
                j = max_inds[0][sorted_inds[i]]
                fh.write('\t'.join(map(str, [x[j], pde_vals[j]])) + '\n')
            # write sampled kde
            fh.write('# kde points \n')
            fh.write('\t'.join(['pA', 'kde']) + '\n')
            for xi, yi in zip(x, pde_vals):
                fh.write('\t'.join(map(str, [xi, yi])) + '\n')

        return pore_level, capture_level, threshold
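With the default thresh_factor of 0.9, the boundary sits 90% of the way from the capture level towards the open-pore level. For example, with illustrative levels of 60 pA and 220 pA:

pore_level, capture_level, thresh_factor = 220.0, 60.0, 0.9
threshold = capture_level + thresh_factor * (pore_level - capture_level)
# 60 + 0.9 * 160 = 204.0 pA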
Example #16
def extract_channel_reads(source,
                          output,
                          prefix,
                          flat,
                          by_id,
                          max_files,
                          multi,
                          channel,
                          summary=None):
    if flat:
        out_path = output
        # give multi files a channel prefix else they will
        # conflict between channels. Singles already get
        # a "ch" component in their name
        if multi:
            extra = 'ch{}'.format(channel)
            if prefix == '':
                prefix = extra
            else:
                prefix = '{}_{}'.format(prefix, extra)
    else:
        out_path = os.path.join(output, str(channel))
        os.makedirs(out_path)

    with BulkFast5(source) as src:
        meta = src.get_metadata(channel)
        tracking_id = src.get_tracking_meta()
        context_tags = src.get_context_meta()
        channel_id = {
            'channel_number': channel,
            'range': meta['range'],
            'digitisation': meta['digitisation'],
            'offset': meta['offset'],
            'sampling_rate': meta['sample_rate']
        }

        Writer = MultiWriter if multi else SingleWriter
        with Writer(out_path, by_id, prefix=prefix) as writer:

            median_before = None
            counter = 1
            raw_data = src.get_raw(channel, use_scaling=False)

            if summary is not None:
                # convert array into stream of dicts
                reads = ({k: row[k]
                          for k in row.dtype.names} for row in summary)
                class_field = 'class'
                start_field = 'start_time'
                duration_field = 'duration'
                # if start_time is a float (seconds) we need to convert to
                # samples
                time_cols = ['start_time', 'duration']
            else:
                reads = src.get_reads(channel)
                class_field = 'classification'
                start_field = 'read_start'
                duration_field = 'read_length'

            for read_number, read in enumerate(reads):

                if summary is not None:
                    if 'median_current_before' in read:
                        median_before = read['median_current_before']
                    else:
                        median_before = 0.0
                elif median_before is None:
                    median_before = read['median']
                    continue

                if summary is None and read[class_field] != 'strand':
                    median_before = read['median']
                else:
                    counter += 1
                    start = time_cast(read[start_field], meta['sample_rate'])
                    length = time_cast(read[duration_field],
                                       meta['sample_rate'])
                    read_id = {
                        'start_time': start,
                        'duration': length,
                        'read_number': read_number,
                        'start_mux': src.get_mux(channel, raw_index=start,
                                                 wells_only=True),
                        'read_id': (str(read['read_id'])
                                    if 'read_id' in read else str(uuid4())),
                        'scaling_used': 1,
                        'median_before': median_before,
                    }

                    raw_slice = raw_data[start:start + length]
                    read = Read(read_id, read_number, tracking_id, channel_id,
                                context_tags, raw_slice)
                    writer.write_read(read)
                    if counter == max_files:
                        break
    return counter, channel
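`time_cast` is not shown in these examples; judging from the comment about converting summary times, it presumably turns float seconds into integer sample indices and leaves integer sample counts untouched. A guess at its behaviour, for illustration only:

def time_cast(value, sample_rate):
    """Return an integer sample index, converting from seconds if given a float."""
    if isinstance(value, float):               # summary files store times in seconds
        return int(round(value * sample_rate))
    return int(value)                          # the bulk read table is already in samples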
Example #17
def extract_reads():
    logging.basicConfig(format='[%(asctime)s - %(name)s] %(message)s',
                        datefmt='%H:%M:%S',
                        level=logging.INFO)
    logger = logging.getLogger('Extract Reads')
    parser = argparse.ArgumentParser(
        description='Bulk .fast5 to read .fast5 conversion.')
    parser.add_argument('input', help='Bulk .fast5 file for input.')
    parser.add_argument('output', help='Output folder.')
    out_format = parser.add_mutually_exclusive_group()
    out_format.add_argument('--multi',
                            action='store_true',
                            help='Output multi-read files.')
    out_format.add_argument('--single',
                            action='store_false',
                            dest='multi',
                            help='Output single-read files.')
    parser.add_argument('--flat',
                        default=False,
                        action='store_true',
                        help='Create all .fast5 files in one directory')
    parser.add_argument('--by_id',
                        default=False,
                        action='store_true',
                        help='Name single-read .fast5 files by read_id.')
    parser.add_argument('--prefix', default="", help='Read file prefix.')
    parser.add_argument('--channel_range',
                        nargs=2,
                        type=int,
                        default=None,
                        help='Channel range (inclusive).')
    parser.add_argument(
        '--summary',
        help='Strand summary file containing at least the columns '
             'channel, start_time and duration.')
    parser.add_argument('--workers',
                        type=int,
                        default=4,
                        help='Number of worker processes.')
    parser.add_argument('--limit',
                        type=int,
                        default=None,
                        help='Limit reads per channel.')
    args = parser.parse_args()

    if not os.path.exists(args.output):
        os.makedirs(args.output)
    else:
        raise IOError('The output directory must not exist.')

    if args.summary is not None:
        if not os.path.isfile(args.summary):
            raise IOError('The summary file does not exist.')
        else:
            # load summary
            args.summary = np.genfromtxt(args.summary,
                                         delimiter='\t',
                                         encoding=None,
                                         dtype=None,
                                         names=True)

    worker = functools.partial(extract_channel_reads, args.input, args.output,
                               args.prefix, args.flat, args.by_id, args.limit,
                               args.multi)

    if args.channel_range is None:
        with BulkFast5(args.input) as src:
            channels = src.channels
    else:
        channels = range(args.channel_range[0], args.channel_range[1] + 1)

    if args.summary is not None:
        # only process channels in the summary
        summ_channels = set(args.summary['channel'])
        channels = [ch for ch in channels if ch in summ_channels]
        summary_by_ch = {
            ch: args.summary[np.where(args.summary['channel'] == ch)]
            for ch in channels
        }
    else:
        summary_by_ch = collections.defaultdict(lambda: None)

    if args.workers > 1:
        with ProcessPoolExecutor(args.workers) as executor:
            futures = [
                executor.submit(worker, c, summary=summary_by_ch[c])
                for c in channels
            ]
            for future in as_completed(futures):
                try:
                    n_reads, channel = future.result()
                except Exception as e:
                    logger.warning("Error processing channel: {}".format(e))
                else:
                    logger.info("Extracted {} reads from channel {}.".format(
                        n_reads, channel))
    else:
        for channel in channels:
            worker(channel, summary=summary_by_ch[channel])
    logger.info("Finished.")
Example #18
    def reads(self):
        """Yield `Reads` with various meta data provided by MinKnow."""
        with BulkFast5(self.fast5) as fh:

            # load channel, tracking and context meta so we don't need fast5
            # later to e.g. write fast5 files.
            self.load_fast5_meta(fh)

            # use read classification from the penultimate block of multi-block reads
            for read in fh.get_reads(self.channel, penultimate_class=True):
                event_indices = (read['event_index_start'],
                                 read['event_index_end'])

                read_events = None
                if self.with_events:
                    read_events = fh.get_events(self.channel,
                                                event_indices=event_indices)
                    read_events = self._convert_event_fields(
                        read_events, fh.sample_rate)

                # map new dict keys to read columns
                meta_keys = [('read_id', 'read_id'),
                             ('initial_classification', 'classification'),
                             ('median_current', 'median'),
                             ('median_sd', 'median_sd'),
                             ('range_current', 'range'),
                             ('median_dwell', 'median_dwell'),
                             ('start_time', 'read_start'),
                             ('duration', 'read_length'), ('drift', 'drift')]
                meta = {key: read[col] for key, col in meta_keys}
                divide = ('median_dwell', 'duration', 'start_time')
                for name in divide:
                    meta[name] = float(meta[name]) / fh.sample_rate

                meta.update({
                    'num_events': event_indices[1] - event_indices[0],
                    'start_event': event_indices[0],
                    'end_event': event_indices[1],
                })
                self._add_channel_states(fh, meta)

                if set(meta.keys()) != set(self.meta_keys):
                    extra = set(meta.keys()) - set(self.meta_keys)
                    missing = set(self.meta_keys) - set(meta.keys())
                    raise ValueError(
                        '{} about to yield read with unexpected metrics. '
                        'Extra: {}. Missing {}.'.format(
                            self.__class__, extra, missing))
                read_raw = None
                if self.with_raw:
                    read_raw = fh.get_raw(
                        self.channel,
                        times=(meta['start_time'],
                               meta['start_time'] + meta['duration']),
                        use_scaling=False)

                if meta['start_time'] > self.max_time:
                    return  # past max_time; stop yielding reads

                yield Read(events=read_events,
                           raw=read_raw,
                           meta=meta,
                           channel_meta=self.channel_meta,
                           context_meta=self.context_meta,
                           tracking_meta=self.tracking_meta)
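The meta construction in this last example renames bulk-table columns and then divides the sample-based fields by the sample rate to express them in seconds. With made-up values and a 4 kHz rate:

sample_rate = 4000.0
read = {'read_start': 120000, 'read_length': 8000, 'median_dwell': 10.0}

start_time = read['read_start'] / sample_rate      # 30.0 s
duration = read['read_length'] / sample_rate       # 2.0 s
median_dwell = read['median_dwell'] / sample_rate  # 0.0025 s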