def test_subset_from_multi(self, mock_log, mock_pbar):
     read_list = self._create_read_list_file(self.read_set)
     f5_filter = Fast5Filter(input_folder=os.path.dirname(self.input_multif5_path),
                             output_folder=self.save_path,
                             read_list_file=read_list)
     f5_filter.run_batch()
     with MultiFast5File(self.input_multif5_path, 'r') as input_f5, \
             MultiFast5File(os.path.join(self.save_path, 'batch0.fast5'), 'r') as output_f5:
         self.assertEqual(len(self.read_set), len(output_f5.get_read_ids()))
         for read_id in self.read_set:
             read_in = input_f5.get_read(read_id)
             read_out = output_f5.get_read(read_id)
             self.assertTrue(numpy.array_equal(read_in.get_raw_data(), read_out.get_raw_data()))
Example No. 2
def compress_file(input_file, output_file, target_compression):
    try:
        makedirs(os.path.dirname(output_file), exist_ok=True)
        if is_multi_read(input_file):
            with MultiFast5File(input_file, 'r') as input_f5, MultiFast5File(output_file, 'a') as output_f5:
                for read in input_f5.get_reads():
                    compress_read_from_multi(output_f5, read, target_compression)
        else:
            with Fast5File(input_file, 'r') as input_f5, \
                    EmptyFast5(output_file, 'a') as output_f5:
                compress_read_from_single(output_f5, input_f5, target_compression)
    except Exception as e:
        # Errors raised in Pool.async will be lost, so we explicitly log them.
        logging.exception(e)
        raise
Example No. 3
def is_multi_read(filepath):
    with MultiFast5File(filepath, mode='r') as fast5:
        if len(fast5.handle) == 0:
            # If there are no top-level groups we default to MultiRead
            return True
        if len(fast5.get_read_ids()) != 0:
            # If there are any read_0123 groups we're definitely MultiRead
            return True
        if "UniqueGlobalKey" in fast5.handle:
            # This group indicates a single read
            return False
    raise TypeError(
        "Fast5 file type could not be identified as single- or multi-read. "
        "It should contain either 'UniqueGlobalKey' or 'read_' groups."
        "\nFilepath: {}".format(filepath))
Example No. 4
def extract_selected_reads(input_file, output_file, read_set, count):
    """
    Take reads from input file if read id is in read_set
    Write to output file, at most count times
    return tuple (found_reads, output_file, input_file)
    If input file was exhausted, the third item in return is None
    :param input_file:
    :param output_file:
    :param read_set:
    :param count:
    :return:
    """
    found_reads = set()
    with MultiFast5File(str(output_file), 'a') as output_f5:
        reads_present = set(output_f5.get_read_ids())
        for read, group in read_generator(input_file, read_set):
            found_reads.add(read)
            read_name = "read_" + read

            if read_name in reads_present:
                continue

            output_f5.handle.copy(group, read_name)
            reads_present.add(read)

            if len(found_reads) >= count:
                return found_reads, output_file, input_file

    return found_reads, output_file, None
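
A minimal usage sketch (not part of the original example; the file names and read IDs below are placeholders) showing how the returned tuple can drive a simple batching loop over several input files:

# Hypothetical driver loop: gather the wanted reads into one output batch,
# shrinking the wanted set as reads are found across the input files.
wanted = {"read-id-1", "read-id-2", "read-id-3"}
remaining = set(wanted)
for path in ["reads_a.fast5", "reads_b.fast5"]:
    found, out_file, in_file = extract_selected_reads(input_file=path,
                                                      output_file="batch0.fast5",
                                                      read_set=remaining,
                                                      count=len(remaining))
    remaining -= found
    if not remaining:
        break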
Example No. 5
def create_multi_read_file(input_files, output_file, target_compression):
    results = []
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    if os.path.exists(output_file):
        logger.info(
            "FileExists - appending new reads to existing file: {}".format(
                output_file))
    try:
        with MultiFast5File(output_file, 'a') as multi_f5:
            for filename in input_files:
                try:
                    with Fast5File(filename, 'r') as f5_input:
                        read = f5_input.get_read(f5_input.read_id)
                        multi_f5.add_existing_read(
                            read, target_compression=target_compression)
                    results.append(os.path.basename(filename))
                except Fast5FileTypeError as e:
                    logger.error(
                        "{}: Cannot input MultiRead files to single_to_multi: '{}'"
                        "".format(e, filename),
                        exc_info=exc_info)
                    raise
                except Exception as e:
                    logger.error(
                        "{}\n\tFailed to add single read file: '{}' to '{}'"
                        "".format(e, filename, output_file),
                        exc_info=exc_info)

    except Fast5FileTypeError:
        raise
    except Exception as e:
        logger.error("{}\n\tFailed to write to MultiRead file: {}"
                     "".format(e, output_file),
                     exc_info=exc_info)
    return results, output_file
Example No. 6
def create_multi_read_file(input_files, output_file):
    results = deque([os.path.basename(output_file)])
    if not os.path.exists(os.path.dirname(output_file)):
        os.makedirs(os.path.dirname(output_file))
    if os.path.exists(output_file):
        logger.info(
            "FileExists - appending new reads to existing file: {}".format(
                output_file))
    try:
        with MultiFast5File(output_file, 'a') as multi_f5:
            for filename in input_files:
                try:
                    with Fast5File(filename, 'r') as single_f5:
                        add_read_to_multi_fast5(multi_f5, single_f5)
                        results.append(os.path.basename(filename))
                except Exception as e:
                    logger.error(
                        "{}\n\tFailed to add single read file: '{}' to '{}'"
                        "".format(e, filename, output_file),
                        exc_info=exc_info)
    except Exception as e:
        logger.error("{}\n\tFailed to write to MultiRead file: {}"
                     "".format(e, output_file),
                     exc_info=exc_info)
    finally:
        return results
Example No. 7
def convert_multi_to_single(input_file, read_list, output_folder):
    '''
    Pull the exact read out of the file.
    '''
    results = [os.path.basename(input_file)]
    try:
        with MultiFast5File(input_file, 'r') as multi_f5:
            read_ids = set(multi_f5.get_read_ids())
            for query_read in read_list:
                if query_read in read_ids:
                    try:
                        read = multi_f5.get_read(query_read)
                        output_file = os.path.join(
                            output_folder, "{}.fast5".format(query_read))
                        create_single_f5(output_file, read)
                        results.append(os.path.basename(output_file))
                    except:
                        traceback.print_exc()
                        sys.stderr.write(
                            "{}\n\tFailed to copy read '{}' from {}\n".format(
                                "convert_multi_to_single", query_read,
                                input_file))
                else:
                    sys.stderr.write(
                        "{}\n\tFailed to find read '{}' in {}\n".format(
                            "convert_multi_to_single", query_read, input_file))
    except:
        traceback.print_exc()
        sys.stderr.write("{}\n\tFailed to copy files from: {}\n".format(
            "convert_multi_to_single", input_file))
    finally:
        return results
Example No. 8
    def test_extract_selected_reads(self):
        test_read_set = {item for item in self.read_set}  # copy to be modified

        # three tests: count below, equal to, and above the number of reads in the input file
        for count in (1, 2, 3):
            temp_file_name = next(_get_candidate_names())
            found_reads, output_file, input_file = extract_selected_reads(input_file=self.multifast5,
                                                                          output_file=temp_file_name,
                                                                          count=count, read_set=self.read_set)
            if count < len(test_read_set):
                assert found_reads.issubset(test_read_set)
                assert input_file == self.multifast5
            elif count == len(test_read_set):
                assert found_reads == test_read_set
                assert input_file == self.multifast5
            elif count >= len(test_read_set):
                assert found_reads == test_read_set
                assert input_file is None

            assert output_file == temp_file_name
            # verify that resulting output file is a legal MultiFast5 with desired reads in it
            with MultiFast5File(output_file) as multi_file:
                readlist = multi_file.get_read_ids()
                assert len(readlist) > 0
                for read in readlist:
                    assert read in test_read_set

            unlink(temp_file_name)
Example No. 9
    def get_signal(
        self,
        start: int = None,
        end: int = None,
        scale: bool = False,
        window_size: int = None,
        window_step: int = None
    ) -> np.ndarray:

        """ Scaled pA values (float32) or raw signal values (int16);
        returns a 1D signal array, or a 2D array of signal windows """

        fast5: MultiFast5File = MultiFast5File(self.path)
        signal_read: Fast5Read = fast5.get_read(read_id=self.read_id)
        raw_signal: np.ndarray = signal_read.get_raw_data(start=start, end=end, scale=scale)

        # Windows will only return full-sized windows,
        # incomplete windows at end of read are not included -
        # this is necessary for complete tensors in training and prediction:

        if window_size and window_step:
            return np.array(
                view_as_windows(raw_signal, window_shape=window_size, step=window_step)
            )
        else:
            return raw_signal
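
A small standalone sketch (not from the original source) of the windowing behaviour noted in the comment above: skimage's view_as_windows yields only full windows, so trailing samples that cannot fill a complete window are dropped.

import numpy as np
from skimage.util import view_as_windows

raw = np.arange(10, dtype=np.int16)                   # stand-in for a raw signal
windows = view_as_windows(raw, window_shape=4, step=4)
print(windows.shape)                                  # (2, 4): samples 8 and 9 are dropped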
Example No. 10
    def test_real_example_file(self):
        with MultiFast5File(
                os.path.join(test_data, 'rle_basecall_table',
                             'rle_example.fast5'), 'r') as mf5:
            for read in mf5.get_reads():
                actual_data = read.handle[
                    'Analyses/Basecall_1D_000/BaseCalled_template/RunlengthBasecall']
                expected_dtypes = [
                    ('base', '<U1'),  # After cleaning this is a unicode string
                    ('scale', '<f4'),
                    ('shape', '<f4'),
                    ('weight', '<f4'),
                    ('index', '<u4'),
                    ('runlength', '<u4')
                ]

                for field, expected_type in expected_dtypes:
                    if field != 'base':
                        self.assertEqual(dtype(expected_type),
                                         actual_data[field].dtype)
                    else:
                        # Before cleaning, the 'base' column is a byte string of length 1
                        self.assertEqual(dtype('|S1'),
                                         actual_data[field].dtype)

                try:
                    clean_data = _sanitize_data_for_reading(actual_data)
                    self.assertEqual(dtype(expected_dtypes), clean_data.dtype)
                except UnicodeError:
                    if parse_version(h5py.__version__) < parse_version("2.7"):
                        # h5py==2.6 often fails to decode these arrays correctly
                        pass
                    else:
                        raise
Example No. 11
def get_fast5_fiveprime(read_id,
                        fast5_fns,
                        signal_size,
                        include_internal=False):
    '''
    Open fast5 file and return final signal_size measurements
    (corresponding to 5' end of an RNA signal). Signals
    are MAD scaled before the end is cropped.
    '''
    for fast5_fn in fast5_fns:
        with MultiFast5File(fast5_fn) as f5:
            try:
                read = f5.get_read(read_id)
            except KeyError as e:
                continue
            end = read.handle['Raw'].attrs['duration']
            signal = read.get_raw_data(scale=True, start=0, end=end)
            break
    else:
        return read_id, np.nan, np.empty(signal_size)
    signal = mad_scaling(signal)
    sig_len = len(signal)
    if sig_len >= signal_size:
        fiveprime = signal[sig_len - signal_size:]
    else:
        fiveprime = np.zeros(signal_size)
        fiveprime[signal_size - len(signal):] = signal
    if include_internal:
        internal = get_internal(signal, signal_size)
        return read_id, sig_len, fiveprime, internal
    return read_id, sig_len, fiveprime
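
The mad_scaling helper used above is not shown in this example; a typical implementation (an assumption, not the author's code) centres the signal on its median and divides by the median absolute deviation:

import numpy as np

def mad_scaling(signal):
    # Robust scaling: subtract the median, divide by the median absolute
    # deviation. Some implementations also multiply the MAD by 1.4826 for
    # consistency with the standard deviation of a normal distribution.
    med = np.median(signal)
    mad = np.median(np.abs(signal - med))
    return (signal - med) / mad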
Example No. 12
 def create_multi_file(self, read_ids):
     filename = self.generate_temp_filename()
     # driver=None is the default, but adding this in here makes sure we
     # preserve the constructor argument.
     with MultiFast5File(filename, 'w', driver=None) as multi_f5:
         for read_id in read_ids:
             multi_f5.create_empty_read(read_id, run_id)
     return filename
Example No. 13
 def __copy_reads_to__(self, read_ids, output):
     if not os.path.exists(output):
         os.makedirs(output)
     batch_id_files = [
         tuple([id] + re.split(r'(\.fast5|\.tar)/', self.index_dict[id]))
         for id in read_ids if id in self.index_dict
     ]
     batch_id_files.sort(key=lambda x: (x[1], x[2]) if len(x) > 2 else x[1])
     for _, id_batch_paths in itertools.groupby(batch_id_files,
                                                key=lambda x: (x[1], x[2])
                                                if len(x) > 2 else x[1]):
         fofns = list(id_batch_paths)
         if len(fofns) == 1 and len(fofns[0]) == 2:
             # single read fast5
             id, src_file = fofns[0]
             shutil.copy(
                 os.path.join(os.path.dirname(self.index_file), src_file),
                 output)
         else:
             _, batch_file, batch_ext, _ = fofns[0]
             tarFiles = set([x[3] for x in fofns])
             # single read fast5 batch in tar archive
             if batch_ext == '.tar':
                 tar_file = os.path.join(os.path.dirname(self.index_file),
                                         batch_file + batch_ext)
                 with tarfile.open(tar_file) as fp_tar:
                     tar_members = fp_tar.getmembers()
                     for tar_member in tar_members:
                         if any(s in tar_member.name for s in tarFiles):
                             try:
                                 tar_member.name = os.path.basename(
                                     tar_member.name)
                                 fp_tar.extract(tar_member, path=output)
                             except Exception:
                                 raise RuntimeError(
                                     '[ERROR] Could not extract {id} from {batch}.'
                                     .format(id=tar_member.name,
                                             batch=tar_file))
             elif batch_ext == '.fast5':
                 f5_file = os.path.join(os.path.dirname(self.index_file),
                                        batch_file + batch_ext)
                 with MultiFast5File(f5_file, 'r') as multi_f5:
                     target_ids = set([x[0] for x in fofns])
                     for read_id in multi_f5.get_read_ids():
                         if read_id in target_ids:
                             try:
                                 read = multi_f5.get_read(read_id)
                                 output_file = os.path.join(
                                     output, "{}.fast5".format(read_id))
                                 multi_to_single_fast5.create_single_f5(
                                     output_file, read)
                             except Exception:
                                 raise RuntimeError(
                                     '[ERROR] Could not extract {id} from {batch}.'
                                     .format(id=read_id, batch=f5_file))
             else:
                 pass
Example No. 14
 def test_read_vbz_using_api(self):
     with MultiFast5File(
             os.path.join(test_data, 'vbz_reads', 'vbz_reads.fast5'),
             'r') as fast5:
         read_count = 0
         for read in fast5.get_reads():
             # This input file was created to have 4 reads with 20 samples per read
             read_count += 1
             raw_data = read.get_raw_data()
             self.assertEqual(20, len(raw_data))
         self.assertEqual(4, read_count)
Example No. 15
def batch_reverter(input_path,
                   output_folder,
                   filename_base,
                   batch_size,
                   threads,
                   recursive,
                   keys=set(
                       ('Raw', 'channel_id', 'context_tags', 'tracking_id'))):
    # make sure output dir doesn't exist
    if os.path.exists(output_folder):
        sys.stderr.write("Directory exists: %s\n" % output_folder)
        sys.exit(1)
    os.makedirs(output_folder)
    # get files to process - in reverse order, since fail is typically before pass
    file_list = get_fast5_file_list(input_path, recursive)
    file_list = file_list[::-1]
    print("%s files to process..." % len(file_list))
    fi, ri = 0, -1
    for i, input_file in enumerate(file_list, 1):
        with MultiFast5File(input_file, 'r') as input_f5:
            for ri, read in enumerate(input_f5.get_read_ids(), ri + 1):
                if not ri % 100:
                    sys.stderr.write(" %s %s %s %s  \r" %
                                     (fi, ri, read, input_file))
                if not ri % batch_size:
                    output_f5 = MultiFast5File(
                        os.path.join(output_folder,
                                     "%s_%s.fast5" % (filename_base, fi)), 'w')
                    fi += 1
                # copy group to new file
                read_name = "read_" + read
                group = input_f5.handle[read_name]
                output_f5.handle.copy(group, read_name)
                # and remove additional info
                reverted_group = output_f5.handle[
                    read_name]  #; print(reverted_group.keys())
                for k in reverted_group.keys():
                    if k not in keys:
                        del reverted_group[k]
Example No. 16
def is_multi_read(filepath):
    """
    Determine if a file is a MultiFast5File: True if it is, False if it is a single Fast5File; raises an error for other types
    """
    with MultiFast5File(filepath, mode='r') as f5_file:
        file_type = check_file_type(f5_file)
        if file_type == MULTI_READ:
            return True
        elif file_type == SINGLE_READ:
            return False
        elif file_type == BULK_FAST5:
            raise NotImplementedError("ont_fast5_api does not support bulk fast5 files: {}".format(filepath))
        raise Fast5FileTypeError("Unknown file type: '{}' for file: {}".format(file_type, filepath))
Example No. 17
def compress_file(input_file, output_file, target_compression, sanitize=False):
    try:
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        if is_multi_read(input_file):
            with MultiFast5File(input_file, 'r') as input_f5, MultiFast5File(
                    output_file, 'a') as output_f5:
                for read in input_f5.get_reads():
                    output_f5.add_existing_read(read,
                                                target_compression,
                                                sanitize=sanitize)
        else:
            with Fast5File(input_file, 'r') as input_f5, \
                    EmptyFast5(output_file, 'a') as output_f5:
                compress_single_read(output_f5,
                                     input_f5,
                                     target_compression,
                                     sanitize=sanitize)
    except Exception as e:
        # Errors raised in Pool.async will be lost, so we explicitly log them.
        logging.exception(e)
        raise
    return (input_file, output_file)
Example No. 18
    def test_compress_read_from_multi(self):
        target_compression = VBZ
        with get_fast5_file(os.path.join(test_data, "multi_read", "batch_0.fast5"), "r") as input_f5, \
                MultiFast5File(os.path.join(self.save_path, 'compress_multi_out.fast5'), 'w') as output_f5:
            read_id = input_f5.get_read_ids()[0]
            input_read = input_f5.get_read(read_id)

            # Input read should be uncompressed on the way in:
            self.assertUncompressed(input_read)

            compress_read_from_multi(output_f5, input_read, target_compression)

            output_read = output_f5.get_read(read_id)
            self.assertCompressed(output_read)
Example No. 19
    def test_add_read_from_multi(self):
        target_compression = VBZ
        with get_fast5_file(os.path.join(test_data, "multi_read", "batch_0.fast5"), "r") as input_f5, \
                MultiFast5File(self.generate_temp_filename(), 'w') as output_f5:
            read_id = input_f5.get_read_ids()[0]
            input_read = input_f5.get_read(read_id)

            # Input read should be gzip-compressed (the HDF5 default) on the way in:
            self.assertEqual(check_read_compression(input_read), GZIP)

            output_f5.add_existing_read(input_read, target_compression)

            output_read = output_f5.get_read(read_id)
            self.assertEqual(check_read_compression(output_read), VBZ)
Example No. 20
 def test_write_vbz_using_api(self):
     input_data = list(range(5))
     read_id = "0a1b2c3d"
     with MultiFast5File(self.generate_temp_filename(), 'w') as fast5:
         fast5.create_empty_read(read_id, self.run_id)
         read = fast5.get_read(read_id)
         read.add_raw_data(input_data, attrs={}, compression=VBZ)
         raw = read.get_raw_data()
         # First check the data comes back in an appropriate form
         self.assertEqual(input_data, list(raw))
         # Then check the types are as they should be under the hood
         filters = read.raw_compression_filters
         self.assertTrue(str(VBZ.compression) in filters)
         self.assertEqual(VBZ.compression_opts,
                          filters[str(VBZ.compression)])
Example No. 21
    def test_add_analysis(self):
        f5_file = self.create_multi_file(generate_read_ids(4))
        group = "Test"
        component = "test_component"
        attrs = {"attribute": 1}

        # Fast5File.add_analysis includes the component name in the analysis attributes
        expected_attributes = attrs.copy()
        expected_attributes['component'] = component
        with MultiFast5File(f5_file, 'a') as multi_f5:
            read0 = multi_f5.get_read(multi_f5.get_read_ids()[0])
            self.assertEqual(read0.list_analyses(), [])
            read0.add_analysis(component, group, attrs)
            self.assertEqual(read0.list_analyses(), [(component, group)])
            self.assertEqual(read0.get_analysis_attributes(group),
                             expected_attributes)
Example No. 22
 def test_raw_data(self):
     f5_file = self.create_multi_file(generate_read_ids(4))
     data = list(range(10))
     raw_attrs = {
         "duration": 1,
         "median_before": 2.5,
         "read_id": "abcd",
         "read_number": 8,
         "start_mux": 2,
         "start_time": 99
     }
     with MultiFast5File(f5_file, 'a') as multi_f5:
         read0 = multi_f5.get_read(multi_f5.get_read_ids()[0])
         read0.add_raw_data(data, attrs=raw_attrs)
         output_data = read0.get_raw_data()
         numpy.testing.assert_array_equal(output_data, data)
Example No. 23
 def test_channel_info(self):
     f5_file = self.create_multi_file(generate_read_ids(4))
     channel_info = {
         "digitisation": 2048,
         "offset": -119.5,
         "range": 74.2,
         "sampling_rate": 4000,
         "channel_number": "72"
     }
     # Fast5File explicitly casts the channel number on reading
     expected_out = channel_info.copy()
     expected_out['channel_number'] = int(channel_info['channel_number'])
     with MultiFast5File(f5_file, 'a') as multi_f5:
         read0 = multi_f5.get_read(multi_f5.get_read_ids()[0])
         read0.add_channel_info(channel_info)
         output_data = read0.get_channel_info()
         self.assertEqual(output_data, expected_out)
Example No. 24
def read_generator(input_file, read_set):
    """
    Open input_file as Fast5, yield tuples (read_id, Group) for every read_id that is present in read_set
    :param input_file:
    :param read_set:
    :return:
    """

    with MultiFast5File(str(input_file), 'r') as input_f5:
        read_ids = input_f5.get_read_ids()
        if len(read_ids) == 0:
            if not is_multi_read(input_file):
                raise TypeError(
                    "Filtering from single-read Fast5 not supported")
        for read in read_set.intersection(read_ids):
            group = input_f5.handle["read_" + read]
            yield read, group
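
A short usage sketch (placeholder paths and read IDs, not part of the original snippet) showing how the yielded (read_id, group) pairs can be copied into a new multi-read file:

wanted = {"read-id-1", "read-id-2"}
with MultiFast5File("subset.fast5", 'a') as out_f5:
    for read_id, group in read_generator("input_multi.fast5", wanted):
        # h5py's Group.copy duplicates the whole read group into the output file
        out_f5.handle.copy(group, "read_" + read_id)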
Example No. 25
    def test_multi_to_single(self):
        input_file = os.path.join(test_data, "multi_read", "batch_0.fast5")
        with MultiFast5File(input_file, 'r') as f5:
            read_count = len(f5.handle)
            expected_files = sorted([
                os.path.join(save_path, "{}", i + '.fast5')
                for i in f5.get_read_ids()
            ])

        subfolder = '0'
        convert_multi_to_single(input_file, save_path, subfolder)

        out_files = sorted(get_fast5_file_list(save_path, recursive=True))
        self.assertEqual(len(out_files), read_count)
        self.assertEqual(out_files,
                         [f.format(subfolder) for f in expected_files])

        # Small batch size should be split across multiple folders
        shutil.rmtree(save_path)
Example No. 26
    def test_subset_from_single(self, mock_log, mock_pbar):
        input_path = os.path.join(test_data, "single_reads")
        read_list = self._create_read_list_file(self.read_set)
        f5_filter = Fast5Filter(input_folder=input_path,
                                output_folder=self.save_path,
                                read_list_file=read_list)
        f5_filter.run_batch()

        count = 0
        with MultiFast5File(os.path.join(self.save_path, 'batch0.fast5'), 'r') as output_f5:
            for input_file in os.listdir(input_path):
                with Fast5File(os.path.join(input_path, input_file), 'r') as input_f5:
                    read_id = input_f5.get_read_id()
                    if read_id in self.read_set:
                        read_in = input_f5.get_read(read_id)
                        read_out = output_f5.get_read(read_id)
                        self.assertTrue(numpy.array_equal(read_in.get_raw_data(), read_out.get_raw_data()))
                        count += 1
        self.assertEqual(len(self.read_set), count)
Example No. 27
    def test_read_interface(self):
        read_ids = generate_read_ids(6)
        f5_file = self.create_multi_file(read_ids)

        with MultiFast5File(f5_file, 'a') as multi_f5:
            # Check we have the read_ids we expect
            self.assertEqual(sorted(read_ids), sorted(multi_f5.get_read_ids()))

            # Try and add another read with the same read_id and expect error
            with self.assertRaises(ValueError):
                multi_f5.create_empty_read(read_ids[0], run_id)

            # Test we can get a read from the file and it has the interface we expect
            read_0 = multi_f5.get_read(read_ids[0])
            self.assertTrue(isinstance(read_0, Fast5Read))

            # Test we cannot get a read which doesn't exist
            with self.assertRaises(KeyError):
                multi_f5.get_read("0123")
Example No. 28
def try_multi_to_single_conversion(input_file, output_folder, subfolder):
    output_files = []
    with MultiFast5File(input_file, 'r') as multi_f5:
        file_type = check_file_type(multi_f5)
        if file_type != MULTI_READ:
            raise Fast5FileTypeError(
                "Could not convert Multi->Single for file type '{}' with path '{}'"
                "".format(file_type, input_file))
        for read in multi_f5.get_reads():
            try:
                output_file = os.path.join(output_folder, subfolder,
                                           "{}.fast5".format(read.read_id))
                create_single_f5(output_file, read)
                output_files.append(os.path.basename(output_file))
            except Exception as e:
                logger.error("{}\n\tFailed to copy read '{}' from {}"
                             "".format(str(e), read.read_id, input_file),
                             exc_info=exc_info)
    return output_files
Example No. 29
def convert_multi_to_single(input_file, output_folder, read_ids, subfolder):
    results = deque([os.path.basename(input_file)])
    try:
        with MultiFast5File(input_file, 'r') as multi_f5:
            for read_id in multi_f5.get_read_ids():
                if read_ids and read_id not in read_ids:
                    continue
                try:
                    read = multi_f5.get_read(read_id)
                    output_file = os.path.join(output_folder, subfolder, "{}.fast5".format(read_id))
                    create_single_f5(output_file, read)
                    results.append(os.path.basename(output_file))
                except Exception as e:
                    logger.error("{}\n\tFailed to copy read '{}' from {}"
                                 "".format(str(e), read_id, input_file), exc_info=exc_info)
    except Exception as e:
        logger.error("{}\n\tFailed to copy files from: {}"
                     "".format(e, input_file), exc_info=exc_info)
    finally:
        return results
Example No. 30
    def test_check_multi_read(self):
        input_folder = os.path.join(test_data, 'vbz_reads')
        ## expected results
        expected_results = []
        for input_file in os.listdir(input_folder):
            input_path = os.path.join(input_folder, input_file)
            with MultiFast5File(input_path, 'r') as f5:
                for read in f5.get_reads():
                    expected_results.append((VBZ, read.read_id, input_path))

        # Test check all reads True
        compression_results = list(check_compression(input_folder, recursive=False, follow_symlinks=False,
                                                     check_all_reads=True))
        self.assertTrue(numpy.array_equal(expected_results, compression_results))

        ## check one read only
        compression_results = list(check_compression(input_folder, recursive=False, follow_symlinks=False,
                                                     check_all_reads=False))
        self.assertTrue(len(compression_results) == len(os.listdir(input_folder)))
        self.assertTrue(compression_results[0] in expected_results)