Example #1
    def _process_read(self, read, read_metrics):
        self.n_reads += 1

        filename = 'read_ch{}_file{}.fast5'.format(self.channel, self.n_reads)
        filename = add_prefix(filename, self.prefix)
        # add filename to read_metrics so it can be reported in summaries
        read_metrics['filename'] = filename
        filename = os.path.join(self.outpath, filename)

        channel_id = {
            'channel_number': self.channel,
            'range': read.channel_meta['range'],
            'digitisation': read.channel_meta['digitisation'],
            'offset': read.channel_meta['offset'],
            # both key spellings are filled in, since consumers differ on
            # whether they read 'sample_rate' or 'sampling_rate'
            'sample_rate': read.channel_meta['sample_rate'],
            'sampling_rate': read.channel_meta['sample_rate']
        }
        if read.events is None:
            raise RuntimeError('Read has no events data, cannot write fast5')
        events = read.events
        read_id = {
            'start_time': events['start'][0],
            'duration': events['start'][-1] + events['length'][-1] - events['start'][0],
            'read_number': self.n_reads,
            'start_mux': read_metrics['mux'],
            'read_id': read.meta['read_id'],
            'scaling_used': 1,
            'median_before': read_metrics['median_current_before'],
        }

        with Fast5.New(filename, 'a', tracking_id=read.tracking_meta,
                       context_tags=read.context_meta, channel_id=channel_id) as h:
            h.set_read(events, read_id)
            if read.raw is not None:
                # write raw samples; adc_raw holds the unscaled ADC counts
                h.set_raw(read.adc_raw)
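The 'duration' and 'start_time' entries above are derived from the events table: the read runs from the start of the first event to the end of the last one. A minimal standalone sketch of the same arithmetic, using an invented events array (field names match the code above; the numbers are made up):

import numpy as np

# Hypothetical events table: each event records a start sample and a length.
events = np.array(
    [(100, 10), (110, 12), (122, 8)],
    dtype=[('start', '<i8'), ('length', '<i8')]
)
# The read spans first event start to last event end: (122 + 8) - 100 == 30.
duration = events['start'][-1] + events['length'][-1] - events['start'][0]
start_time = events['start'][0]
assert duration == 30 and start_time == 100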
Example #2
    def test_060_write_read_float_data(self):
        tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))

        with Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id) as h:
            h.set_read(self.tmp_events_float, self.tmp_read_id)

        # Metadata duration and start_time should be integers, not floats
        print(tmp_file)
        with Fast5(tmp_file, 'r') as h:
            for key in ['duration', 'start_time']:
                self.assertIsInstance(h.attributes[key], int)

        with Fast5(tmp_file) as h:
            events = h.get_read()
            self.assertEqual(
                events['start'].dtype.descr[0][1], '<f8',
                'Writing float data did not give float data on read.')
            actual = events['start'][0]
            expected = self.tmp_events_float['start'][0]
            self.assertEqual(
                actual, expected,
                'Write float, data on read not scaled correctly, got {} not {}'
                .format(actual, expected))

        os.unlink(tmp_file)
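set_read consumes a numpy structured array of events, and the test above asserts that the 'start' column survives a write/read round trip as little-endian float64 ('<f8'). A sketch of what a fixture like self.tmp_events_float could look like (the field set and values here are assumptions, not copied from the test class):

import numpy as np

# Hypothetical float events fixture: times in seconds as float64.
tmp_events_float = np.array(
    [(0.0, 0.01, 65.2, 1.5), (0.01, 0.02, 70.1, 1.1)],
    dtype=[('start', '<f8'), ('length', '<f8'),
           ('mean', '<f8'), ('stdv', '<f8')]
)
assert tmp_events_float['start'].dtype.descr[0][1] == '<f8'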
Example #3
    def write_read(self, read):
        if self.by_id:
            filename = '{}.fast5'.format(read.read_id['read_id'])
        else:
            filename = '{}read_ch{}_file{}.fast5'.format(
                self.prefix, read.channel_id['channel_number'], read.read_number
            )
        filename = os.path.join(self.out_path, filename)
        with Fast5.New(filename, 'a', tracking_id=read.tracking_id,
                       context_tags=read.context_tags,
                       channel_id=read.channel_id) as h:
            h.set_raw(read.raw, meta=read.read_id, read_number=read.read_number)
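The two naming schemes trade off differently: read-id based names are unique across channels, while the prefix/channel/file-number form is only unique within a channel. A hypothetical standalone helper mirroring the same naming logic (not part of the library):

import os

def read_filename(out_path, read_id, channel, read_number, prefix='', by_id=False):
    # Hypothetical helper replicating write_read's naming above.
    if by_id:
        name = '{}.fast5'.format(read_id)
    else:
        name = '{}read_ch{}_file{}.fast5'.format(prefix, channel, read_number)
    return os.path.join(out_path, name)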
Example #4
def extract_channel_reads(source, output, prefix, flat, by_id, channel):

    if flat:
        out_path = output
    else:
        out_path = os.path.join(output, str(channel))
        os.makedirs(out_path)

    with BulkFast5(source) as src:
        raw_data = src.get_raw(channel, use_scaling=False)
        meta = src.get_metadata(channel)
        tracking_id = src.get_tracking_meta()
        context_tags = src.get_context_meta()
        channel_id = {
            'channel_number': channel,
            'range': meta['range'],
            'digitisation': meta['digitisation'],
            'offset': meta['offset'],
            'sample_rate': meta['sample_rate'],
            'sampling_rate': meta['sample_rate']
        }
        median_before = None
        counter = 1
        for read_number, read in enumerate(src.get_reads(channel)):
            if median_before is None:
                median_before = read['median']
                continue

            if read['classification'] != 'strand':
                median_before = read['median']
            else:
                counter += 1
                start, length = read['read_start'], read['read_length']
                read_id = {
                    'start_time': read['read_start'],
                    'duration': read['read_length'],
                    'read_number': read_number,
                    'start_mux': src.get_mux(channel, raw_index=start, wells_only=True),
                    'read_id': read['read_id'],
                    'scaling_used': 1,
                    'median_before': median_before
                }

                raw_slice = raw_data[start:start+length]
                if by_id:
                    filename = '{}.fast5'.format(read['read_id'])
                else:
                    filename = '{}_read_ch{}_file{}.fast5'.format(
                        prefix, channel, read_number
                    )
                filename = os.path.join(out_path, filename)
                with Fast5.New(filename, 'a', tracking_id=tracking_id,
                               context_tags=context_tags,
                               channel_id=channel_id) as h:
                    h.set_raw(raw_slice, meta=read_id, read_number=read_number)
    return counter, channel
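Returning (counter, channel) makes this function convenient to fan out over channels with a worker pool, since each result can be matched back to its channel as it completes. A sketch of one plausible driver (the pool setup is an assumption, not taken from the source):

from functools import partial
from multiprocessing import Pool

def extract_all_channels(source, output, prefix, channels, flat=False, by_id=False):
    # Bind every argument except the channel, then map over channels.
    worker = partial(extract_channel_reads, source, output, prefix, flat, by_id)
    with Pool() as pool:
        for counter, channel in pool.imap_unordered(worker, channels):
            print('channel {}: done (counter={})'.format(channel, counter))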
Example #5
    def test_065_write_int_read_float_data(self):
        tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))

        with Fast5.New(tmp_file, 'a', channel_id=self.tmp_channel_id) as h:
            h.set_read(self.tmp_events_int, self.tmp_read_id)

        with Fast5(tmp_file) as h:
            events = h.get_read()
            self.assertEqual(
                events['start'].dtype.descr[0][1], '<f8',
                'Writing uint data did not give float data on read.')
            actual = events['start'][0]
            expected = self.tmp_events_float['start'][0]
            self.assertEqual(
                actual, expected,
                'Write uint, data on read not scaled correctly, got {} not {}'
                .format(actual, expected))

        os.unlink(tmp_file)

    @classmethod
    def setUpClass(self):
        """Create a read fast5 from scratch with previously simulated mapping and basecall 1D data"""
        print('* Fast5 Basecaller and Mapper')

        self.seq = 'CATTACGCATTTACCGAAACCTGGGCAAA'
        self.qstring = '!' * len(self.seq)
        self.model_file = 'example_template.model'
        self.events_file = 'example_template.events'
        self.bc_scale_file = 'example_template.bc_scale'
        self.bc_path_file = 'example_template.bc_path'
        self.map_scale_file = 'example_template.map_scale'
        self.map_path_file = 'example_template.map_path'
        self.map_post_file = 'example_template.map_post'
        self.ref_name = 'test_seq'

        # Open new file
        header = [
            'channel_number', 'offset', 'range', 'digitisation',
            'sampling_rate'
        ]
        channel_id = {x: 0 for x in header}
        fakefile = tempfile.NamedTemporaryFile()
        self.fh = Fast5.New(fakefile.name, channel_id=channel_id, read='a')

        # load data to set within fast5 file
        self.model = np.genfromtxt(self.get_file_path(self.model_file),
                                   dtype=None,
                                   delimiter='\t',
                                   names=True)
        self.events = np.genfromtxt(self.get_file_path(self.events_file),
                                    dtype=None,
                                    delimiter='\t',
                                    names=True)

        # use namedtuple to imitate a Scale object
        Scale = namedtuple(
            'Scale', ['shift', 'scale', 'drift', 'var', 'scale_sd', 'var_sd'])

        bc_scale = Scale(*np.genfromtxt(self.get_file_path(self.bc_scale_file),
                                        dtype=None,
                                        delimiter='\t'))
        bc_path = np.genfromtxt(self.get_file_path(self.bc_path_file),
                                dtype=np.int32,
                                delimiter='\t')

        self.fh.set_basecall_data(self.events, bc_scale, bc_path, self.model,
                                  self.seq)

        map_scale = Scale(
            *np.genfromtxt(self.get_file_path(self.map_scale_file),
                           dtype=None,
                           delimiter='\t'))
        map_path = np.genfromtxt(self.get_file_path(self.map_path_file),
                                 dtype=np.int32,
                                 delimiter='\t')
        map_post = np.genfromtxt(self.get_file_path(self.map_post_file),
                                 delimiter='\t')

        n_states = len(self.seq) - len(self.model['kmer'][0]) + 1  # kmer positions in seq
        self.fh.set_mapping_data(self.events, map_scale, map_path, self.model,
                                 self.seq, self.ref_name)
        self.fh.set_mapping_data(self.events,
                                 map_scale,
                                 map_path,
                                 self.model,
                                 self.seq,
                                 self.ref_name,
                                 post=map_post)