def test_subset_from_multi(self, mock_log, mock_pbar):
    read_list = self._create_read_list_file(self.read_set)
    f5_filter = Fast5Filter(input_folder=os.path.dirname(self.input_multif5_path),
                            output_folder=self.save_path,
                            read_list_file=read_list)
    f5_filter.run_batch()
    with MultiFast5File(self.input_multif5_path, 'r') as input_f5, \
            MultiFast5File(os.path.join(self.save_path, 'batch0.fast5'),
                           'r') as output_f5:
        self.assertEqual(len(self.read_set), len(output_f5.get_read_ids()))
        for read_id in self.read_set:
            read_in = input_f5.get_read(read_id)
            read_out = output_f5.get_read(read_id)
            self.assertTrue(numpy.array_equal(read_in.get_raw_data(),
                                              read_out.get_raw_data()))
def compress_file(input_file, output_file, target_compression):
    try:
        makedirs(os.path.dirname(output_file), exist_ok=True)
        if is_multi_read(input_file):
            with MultiFast5File(input_file, 'r') as input_f5, \
                    MultiFast5File(output_file, 'a') as output_f5:
                for read in input_f5.get_reads():
                    compress_read_from_multi(output_f5, read,
                                             target_compression)
        else:
            with Fast5File(input_file, 'r') as input_f5, \
                    EmptyFast5(output_file, 'a') as output_f5:
                compress_read_from_single(output_f5, input_f5,
                                          target_compression)
    except Exception as e:
        # Errors raised in Pool.async are lost, so we log them explicitly.
        logging.exception(e)
        raise
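# Context sketch (hedged, not from the original source): compress_file is
# written to be fanned out over many files with multiprocessing, which is
# why exceptions are logged in the worker before being re-raised. The pool
# size and the file_pairs argument below are hypothetical.
from multiprocessing import Pool

def compress_many_example(file_pairs, target_compression):
    with Pool(processes=4) as pool:
        jobs = [pool.apply_async(compress_file,
                                 (src, dst, target_compression))
                for src, dst in file_pairs]
        for job in jobs:
            job.get()  # re-raises any worker exception after it was logged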
def is_multi_read(filepath):
    with MultiFast5File(filepath, mode='r') as fast5:
        if len(fast5.handle) == 0:
            # If there are no top-level groups we default to MultiRead
            return True
        if len(fast5.get_read_ids()) != 0:
            # If there are any read_0123 groups we're definitely MultiRead
            return True
        if "UniqueGlobalKey" in fast5.handle:
            # This group indicates a single read
            return False
    raise TypeError(
        "Fast5 file type could not be identified as single- or multi-read. "
        "It should contain either 'UniqueGlobalKey' or 'read_' groups."
        "\nFilepath: {}".format(filepath))
def extract_selected_reads(input_file, output_file, read_set, count):
    """
    Copy reads from input_file whose read_id is in read_set into
    output_file, stopping once at most `count` reads have been found.

    Returns a tuple (found_reads, output_file, input_file). If the input
    file was exhausted before `count` reads were found, the third item
    is None.

    :param input_file: path to the multi-read Fast5 file to read from
    :param output_file: path to the multi-read Fast5 file to append to
    :param read_set: set of read_ids to extract
    :param count: maximum number of reads to extract
    :return: tuple (found_reads, output_file, input_file or None)
    """
    found_reads = set()
    with MultiFast5File(str(output_file), 'a') as output_f5:
        reads_present = set(output_f5.get_read_ids())
        for read, group in read_generator(input_file, read_set):
            found_reads.add(read)
            if read in reads_present:
                continue
            output_f5.handle.copy(group, "read_" + read)
            reads_present.add(read)
            if len(found_reads) >= count:
                return found_reads, output_file, input_file
    return found_reads, output_file, None
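# Usage sketch (illustrative, not from the original source): drain a list of
# multi-read files until every wanted read has been found. The paths, ids
# and output filename here are hypothetical.
def subset_example(input_files, wanted_ids, output_file="subset.fast5"):
    remaining = set(wanted_ids)
    for input_file in input_files:
        found, _, _ = extract_selected_reads(
            input_file, output_file, remaining, count=len(remaining))
        remaining -= found
        if not remaining:
            break
    return remaining  # empty set if every wanted read was found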
def create_multi_read_file(input_files, output_file, target_compression):
    results = []
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    if os.path.exists(output_file):
        logger.info("FileExists - appending new reads to existing file: "
                    "{}".format(output_file))
    try:
        with MultiFast5File(output_file, 'a') as multi_f5:
            for filename in input_files:
                try:
                    with Fast5File(filename, 'r') as f5_input:
                        read = f5_input.get_read(f5_input.read_id)
                        multi_f5.add_existing_read(
                            read, target_compression=target_compression)
                        results.append(os.path.basename(filename))
                except Fast5FileTypeError as e:
                    logger.error(
                        "{}: Cannot input MultiRead files to "
                        "single_to_multi: '{}'".format(e, filename),
                        exc_info=exc_info)
                    raise
                except Exception as e:
                    logger.error(
                        "{}\n\tFailed to add single read file: '{}' to '{}'"
                        "".format(e, filename, output_file),
                        exc_info=exc_info)
    except Fast5FileTypeError:
        raise
    except Exception as e:
        logger.error("{}\n\tFailed to write to MultiRead file: {}"
                     "".format(e, output_file), exc_info=exc_info)
    return results, output_file
def create_multi_read_file(input_files, output_file):
    results = deque([os.path.basename(output_file)])
    if not os.path.exists(os.path.dirname(output_file)):
        os.makedirs(os.path.dirname(output_file))
    if os.path.exists(output_file):
        logger.info("FileExists - appending new reads to existing file: "
                    "{}".format(output_file))
    try:
        with MultiFast5File(output_file, 'a') as multi_f5:
            for filename in input_files:
                try:
                    with Fast5File(filename, 'r') as single_f5:
                        add_read_to_multi_fast5(multi_f5, single_f5)
                        results.append(os.path.basename(filename))
                except Exception as e:
                    logger.error(
                        "{}\n\tFailed to add single read file: '{}' to '{}'"
                        "".format(e, filename, output_file),
                        exc_info=exc_info)
    except Exception as e:
        logger.error("{}\n\tFailed to write to MultiRead file: {}"
                     "".format(e, output_file), exc_info=exc_info)
    finally:
        return results
def convert_multi_to_single(input_file, read_list, output_folder):
    '''
    Pull each requested read out of a multi-read file and write it to its
    own single-read file in output_folder.
    '''
    results = [os.path.basename(input_file)]
    try:
        with MultiFast5File(input_file, 'r') as multi_f5:
            read_ids = set(multi_f5.get_read_ids())
            for query_read in read_list:
                if query_read in read_ids:
                    try:
                        read = multi_f5.get_read(query_read)
                        output_file = os.path.join(
                            output_folder, "{}.fast5".format(query_read))
                        create_single_f5(output_file, read)
                        results.append(os.path.basename(output_file))
                    except Exception:
                        traceback.print_exc()
                        sys.stderr.write(
                            "{}\n\tFailed to copy read '{}' from {}\n"
                            "".format("convert_multi_to_single",
                                      query_read, input_file))
                else:
                    sys.stderr.write(
                        "{}\n\tFailed to find read '{}' in {}\n".format(
                            "convert_multi_to_single", query_read,
                            input_file))
    except Exception:
        traceback.print_exc()
        sys.stderr.write("{}\n\tFailed to copy files from: {}\n".format(
            "convert_multi_to_single", input_file))
    finally:
        return results
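# Usage sketch (hypothetical paths and read ids, not from the original
# source): extract two specific reads into per-read files.
copied = convert_multi_to_single("reads/batch_0.fast5",
                                 ["read-id-1", "read-id-2"],
                                 "single_reads")
# copied[0] is the input file's basename; the remaining entries are the
# .fast5 files actually written, so missing reads can be detected by
# comparing len(copied) - 1 against the number requested.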
def test_extract_selected_reads(self):
    test_read_set = set(self.read_set)  # local copy
    # Three tests, with count below, equal to and above the number of
    # reads in the input file
    for count in (1, 2, 3):
        temp_file_name = next(_get_candidate_names())
        found_reads, output_file, input_file = extract_selected_reads(
            input_file=self.multifast5,
            output_file=temp_file_name,
            count=count,
            read_set=self.read_set)
        if count < len(test_read_set):
            assert found_reads.issubset(test_read_set)
            assert input_file == self.multifast5
        elif count == len(test_read_set):
            assert found_reads == test_read_set
            assert input_file == self.multifast5
        else:  # count > len(test_read_set)
            assert found_reads == test_read_set
            assert input_file is None
        assert output_file == temp_file_name
        # Verify that the resulting output file is a valid MultiFast5
        # containing the desired reads
        with MultiFast5File(output_file) as multi_file:
            readlist = multi_file.get_read_ids()
            assert len(readlist) > 0
            for read in readlist:
                assert read in test_read_set
        unlink(temp_file_name)
def get_signal(
    self,
    start: int = None,
    end: int = None,
    scale: bool = False,
    window_size: int = None,
    window_step: int = None
) -> np.ndarray:
    """
    Return scaled pA values (float32) or raw signal values (int16) as a
    1D array, or a 2D array of windows if both window_size and
    window_step are given.
    """
    fast5: MultiFast5File = MultiFast5File(self.path)
    signal_read: Fast5Read = fast5.get_read(read_id=self.read_id)
    raw_signal: np.ndarray = signal_read.get_raw_data(start=start, end=end,
                                                      scale=scale)
    # view_as_windows only returns full-sized windows; incomplete windows
    # at the end of the read are not included. This is necessary for
    # complete tensors in training and prediction:
    if window_size and window_step:
        return np.array(
            view_as_windows(raw_signal, window_shape=window_size,
                            step=window_step))
    else:
        return raw_signal
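# Shape sketch (hedged, not from the original source): `reader` stands for
# an instance of the class defining get_signal, with .path and .read_id set.
windows = reader.get_signal(scale=True, window_size=300, window_step=150)
# For a signal of n samples this yields shape ((n - 300) // 150 + 1, 300);
# trailing samples that do not fill a complete window are dropped, and a
# signal shorter than window_size makes view_as_windows raise ValueError.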
def test_real_example_file(self):
    with MultiFast5File(
            os.path.join(test_data, 'rle_basecall_table',
                         'rle_example.fast5'), 'r') as mf5:
        for read in mf5.get_reads():
            actual_data = read.handle[
                'Analyses/Basecall_1D_000/BaseCalled_template/RunlengthBasecall']
            expected_dtypes = [
                ('base', '<U1'),  # After cleaning this is a unicode string
                ('scale', '<f4'),
                ('shape', '<f4'),
                ('weight', '<f4'),
                ('index', '<u4'),
                ('runlength', '<u4')
            ]
            for field, expected_type in expected_dtypes:
                if field != 'base':
                    self.assertEqual(dtype(expected_type),
                                     actual_data[field].dtype)
                else:
                    # Before cleaning the 'base' column is a byte-string
                    # of length 1
                    self.assertEqual(dtype('|S1'),
                                     actual_data[field].dtype)
            try:
                clean_data = _sanitize_data_for_reading(actual_data)
                self.assertEqual(dtype(expected_dtypes), clean_data.dtype)
            except UnicodeError:
                if parse_version(h5py.__version__) < parse_version("2.7"):
                    # h5py==2.6 often fails to decode these arrays correctly
                    pass
                else:
                    raise
def get_fast5_fiveprime(read_id, fast5_fns, signal_size,
                        include_internal=False):
    '''
    Open fast5 file and return the final signal_size measurements
    (corresponding to the 5' end of an RNA signal). Signals are MAD
    scaled before the end is cropped.
    '''
    for fast5_fn in fast5_fns:
        with MultiFast5File(fast5_fn) as f5:
            try:
                read = f5.get_read(read_id)
            except KeyError:
                continue
            end = read.handle['Raw'].attrs['duration']
            signal = read.get_raw_data(scale=True, start=0, end=end)
            break
    else:
        return read_id, np.nan, np.empty(signal_size)
    signal = mad_scaling(signal)
    sig_len = len(signal)
    if sig_len >= signal_size:
        fiveprime = signal[sig_len - signal_size:]
    else:
        fiveprime = np.zeros(signal_size)
        fiveprime[signal_size - sig_len:] = signal
    if include_internal:
        internal = get_internal(signal, signal_size)
        return read_id, sig_len, fiveprime, internal
    return read_id, sig_len, fiveprime
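# Usage sketch (hypothetical read id and path, not from the original source):
read_id, sig_len, fiveprime = get_fast5_fiveprime(
    "0a1b2c3d", ["reads/batch_0.fast5"], signal_size=1000)
# fiveprime always has length signal_size: shorter signals are left-padded
# with zeros, and if the read is not found in any file, sig_len is np.nan.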
def create_multi_file(self, read_ids):
    filename = self.generate_temp_filename()
    # driver=None is the default, but passing it here makes sure we
    # preserve the constructor argument.
    with MultiFast5File(filename, 'w', driver=None) as multi_f5:
        for read_id in read_ids:
            multi_f5.create_empty_read(read_id, run_id)
    return filename
def __copy_reads_to__(self, read_ids, output):
    if not os.path.exists(output):
        os.makedirs(output)
    batch_id_files = [
        tuple([id] + re.split(r'(\.fast5|\.tar)/', self.index_dict[id]))
        for id in read_ids if id in self.index_dict
    ]
    batch_id_files.sort(
        key=lambda x: (x[1], x[2]) if len(x) > 2 else x[1])
    for _, id_batch_paths in itertools.groupby(
            batch_id_files,
            key=lambda x: (x[1], x[2]) if len(x) > 2 else x[1]):
        fofns = list(id_batch_paths)
        if len(fofns) == 1 and len(fofns[0]) == 2:
            # single read fast5
            id, src_file = fofns[0]
            shutil.copy(
                os.path.join(os.path.dirname(self.index_file), src_file),
                output)
        else:
            _, batch_file, batch_ext, _ = fofns[0]
            tarFiles = set(x[3] for x in fofns)
            if batch_ext == '.tar':
                # single read fast5 batch in tar archive
                tar_file = os.path.join(os.path.dirname(self.index_file),
                                        batch_file + batch_ext)
                with tarfile.open(tar_file) as fp_tar:
                    for tar_member in fp_tar.getmembers():
                        if any(s in tar_member.name for s in tarFiles):
                            try:
                                tar_member.name = os.path.basename(
                                    tar_member.name)
                                fp_tar.extract(tar_member, path=output)
                            except Exception:
                                raise RuntimeError(
                                    '[ERROR] Could not extract {id} from '
                                    '{batch}.'.format(id=tar_member.name,
                                                      batch=tar_file))
            elif batch_ext == '.fast5':
                f5_file = os.path.join(os.path.dirname(self.index_file),
                                       batch_file + batch_ext)
                with MultiFast5File(f5_file, 'r') as multi_f5:
                    target_ids = set(x[0] for x in fofns)
                    for read_id in multi_f5.get_read_ids():
                        if read_id in target_ids:
                            try:
                                read = multi_f5.get_read(read_id)
                                output_file = os.path.join(
                                    output, "{}.fast5".format(read_id))
                                multi_to_single_fast5.create_single_f5(
                                    output_file, read)
                            except Exception:
                                raise RuntimeError(
                                    '[ERROR] Could not extract {id} from '
                                    '{batch}.'.format(id=read_id,
                                                      batch=f5_file))
def test_read_vbz_using_api(self):
    with MultiFast5File(
            os.path.join(test_data, 'vbz_reads', 'vbz_reads.fast5'),
            'r') as fast5:
        read_count = 0
        for read in fast5.get_reads():
            # This input file was created with 4 reads of 20 samples each
            read_count += 1
            raw_data = read.get_raw_data()
            self.assertEqual(20, len(raw_data))
        self.assertEqual(4, read_count)
def batch_reverter(input_path, output_folder, filename_base, batch_size,
                   threads, recursive,
                   keys=set(('Raw', 'channel_id', 'context_tags',
                             'tracking_id'))):
    # make sure the output dir doesn't exist yet
    if os.path.exists(output_folder):
        sys.stderr.write("Directory exists: %s\n" % output_folder)
        sys.exit(1)
    os.makedirs(output_folder)
    # get files to process - in reverse order, since fail typically comes
    # before pass
    file_list = get_fast5_file_list(input_path, recursive)
    file_list = file_list[::-1]
    print("%s files to process..." % len(file_list))
    fi, ri = 0, -1
    output_f5 = None
    for i, input_file in enumerate(file_list, 1):
        with MultiFast5File(input_file, 'r') as input_f5:
            for ri, read in enumerate(input_f5.get_read_ids(), ri + 1):
                if not ri % 100:
                    sys.stderr.write(" %s %s %s %s \r"
                                     % (fi, ri, read, input_file))
                if not ri % batch_size:
                    # close the previous batch before starting a new one
                    if output_f5 is not None:
                        output_f5.close()
                    output_f5 = MultiFast5File(
                        os.path.join(output_folder, "%s_%s.fast5"
                                     % (filename_base, fi)), 'w')
                    fi += 1
                # copy the group to the new file
                read_name = "read_" + read
                group = input_f5.handle[read_name]
                output_f5.handle.copy(group, read_name)
                # and remove additional info
                reverted_group = output_f5.handle[read_name]
                for k in list(reverted_group.keys()):
                    if k not in keys:
                        del reverted_group[k]
    if output_f5 is not None:
        output_f5.close()
def is_multi_read(filepath): """ Determine if a file is a MultiFast5File, True if it is, False if it is a single Fast5File error for other types """ with MultiFast5File(filepath, mode='r') as f5_file: file_type = check_file_type(f5_file) if file_type == MULTI_READ: return True elif file_type == SINGLE_READ: return False elif file_type == BULK_FAST5: raise NotImplementedError("ont_fast5_api does not support bulk fast files: {}".format(filepath)) raise Fast5FileTypeError("Unknown file type: '{}' for file: {}".format(file_type, filepath))
def compress_file(input_file, output_file, target_compression,
                  sanitize=False):
    try:
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        if is_multi_read(input_file):
            with MultiFast5File(input_file, 'r') as input_f5, \
                    MultiFast5File(output_file, 'a') as output_f5:
                for read in input_f5.get_reads():
                    output_f5.add_existing_read(read, target_compression,
                                                sanitize=sanitize)
        else:
            with Fast5File(input_file, 'r') as input_f5, \
                    EmptyFast5(output_file, 'a') as output_f5:
                compress_single_read(output_f5, input_f5,
                                     target_compression, sanitize=sanitize)
    except Exception as e:
        # Errors raised in Pool.async are lost, so we log them explicitly.
        logging.exception(e)
        raise
    return (input_file, output_file)
def test_compress_read_from_multi(self):
    target_compression = VBZ
    with get_fast5_file(os.path.join(test_data, "multi_read",
                                     "batch_0.fast5"), "r") as input_f5, \
            MultiFast5File(os.path.join(self.save_path,
                                        'compress_multi_out.fast5'),
                           'w') as output_f5:
        read_id = input_f5.get_read_ids()[0]
        input_read = input_f5.get_read(read_id)
        # Input read should be uncompressed on the way in:
        self.assertUncompressed(input_read)
        compress_read_from_multi(output_f5, input_read, target_compression)
        output_read = output_f5.get_read(read_id)
        self.assertCompressed(output_read)
def test_add_read_from_multi(self):
    target_compression = VBZ
    with get_fast5_file(os.path.join(test_data, "multi_read",
                                     "batch_0.fast5"), "r") as input_f5, \
            MultiFast5File(self.generate_temp_filename(),
                           'w') as output_f5:
        read_id = input_f5.get_read_ids()[0]
        input_read = input_f5.get_read(read_id)
        # Input read should use the default GZIP compression on the way in:
        self.assertEqual(check_read_compression(input_read), GZIP)
        output_f5.add_existing_read(input_read, target_compression)
        output_read = output_f5.get_read(read_id)
        self.assertEqual(check_read_compression(output_read), VBZ)
def test_write_vbz_using_api(self):
    input_data = list(range(5))
    read_id = "0a1b2c3d"
    with MultiFast5File(self.generate_temp_filename(), 'w') as fast5:
        fast5.create_empty_read(read_id, self.run_id)
        read = fast5.get_read(read_id)
        read.add_raw_data(input_data, attrs={}, compression=VBZ)
        raw = read.get_raw_data()
        # First check the data comes back in an appropriate form
        self.assertEqual(input_data, list(raw))
        # Then check the types are as they should be under the hood
        filters = read.raw_compression_filters
        self.assertTrue(str(VBZ.compression) in filters)
        self.assertEqual(VBZ.compression_opts,
                         filters[str(VBZ.compression)])
def test_add_analysis(self):
    f5_file = self.create_multi_file(generate_read_ids(4))
    group = "Test"
    component = "test_component"
    attrs = {"attribute": 1}
    # Fast5File.add_analysis includes the component name in the
    # analysis attributes
    expected_attributes = attrs.copy()
    expected_attributes['component'] = component
    with MultiFast5File(f5_file, 'a') as multi_f5:
        read0 = multi_f5.get_read(multi_f5.get_read_ids()[0])
        self.assertEqual(read0.list_analyses(), [])
        read0.add_analysis(component, group, attrs)
        self.assertEqual(read0.list_analyses(), [(component, group)])
        self.assertEqual(read0.get_analysis_attributes(group),
                         expected_attributes)
def test_raw_data(self):
    f5_file = self.create_multi_file(generate_read_ids(4))
    data = list(range(10))
    raw_attrs = {
        "duration": 1,
        "median_before": 2.5,
        "read_id": "abcd",
        "read_number": 8,
        "start_mux": 2,
        "start_time": 99
    }
    with MultiFast5File(f5_file, 'a') as multi_f5:
        read0 = multi_f5.get_read(multi_f5.get_read_ids()[0])
        read0.add_raw_data(data, attrs=raw_attrs)
        output_data = read0.get_raw_data()
        numpy.testing.assert_array_equal(output_data, data)
def test_channel_info(self):
    f5_file = self.create_multi_file(generate_read_ids(4))
    channel_info = {
        "digitisation": 2048,
        "offset": -119.5,
        "range": 74.2,
        "sampling_rate": 4000,
        "channel_number": "72"
    }
    # Fast5File explicitly casts the channel number on reading
    expected_out = channel_info.copy()
    expected_out['channel_number'] = int(channel_info['channel_number'])
    with MultiFast5File(f5_file, 'a') as multi_f5:
        read0 = multi_f5.get_read(multi_f5.get_read_ids()[0])
        read0.add_channel_info(channel_info)
        output_data = read0.get_channel_info()
        self.assertEqual(output_data, expected_out)
def read_generator(input_file, read_set):
    """
    Open input_file as a MultiFast5File and yield tuples (read_id, Group)
    for every read_id that is present in read_set.

    :param input_file: path to the multi-read Fast5 file to read from
    :param read_set: set of read_ids to look for
    :return: generator of (read_id, h5py Group) tuples
    """
    with MultiFast5File(str(input_file), 'r') as input_f5:
        read_ids = input_f5.get_read_ids()
        if len(read_ids) == 0:
            if not is_multi_read(input_file):
                raise TypeError(
                    "Filtering from single-read Fast5 not supported")
        for read in read_set.intersection(read_ids):
            group = input_f5.handle["read_" + read]
            yield read, group
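# Usage sketch (hypothetical paths and ids, not from the original source):
# read_generator yields the raw h5py Groups, so callers can copy reads
# wholesale without decoding the signal, as extract_selected_reads does.
with MultiFast5File("subset.fast5", 'a') as out_f5:
    for read_id, group in read_generator("reads/batch_0.fast5",
                                         {"read-id-1", "read-id-2"}):
        out_f5.handle.copy(group, "read_" + read_id)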
def test_multi_to_single(self):
    input_file = os.path.join(test_data, "multi_read", "batch_0.fast5")
    with MultiFast5File(input_file, 'r') as f5:
        read_count = len(f5.handle)
        expected_files = sorted([
            os.path.join(save_path, "{}", i + '.fast5')
            for i in f5.get_read_ids()
        ])
    subfolder = '0'
    convert_multi_to_single(input_file, save_path, subfolder)
    out_files = sorted(get_fast5_file_list(save_path, recursive=True))
    self.assertEqual(len(out_files), read_count)
    self.assertEqual(out_files,
                     [f.format(subfolder) for f in expected_files])
    shutil.rmtree(save_path)
def test_subset_from_single(self, mock_log, mock_pbar):
    input_path = os.path.join(test_data, "single_reads")
    read_list = self._create_read_list_file(self.read_set)
    f5_filter = Fast5Filter(input_folder=input_path,
                            output_folder=self.save_path,
                            read_list_file=read_list)
    f5_filter.run_batch()
    count = 0
    with MultiFast5File(os.path.join(self.save_path, 'batch0.fast5'),
                        'r') as output_f5:
        for input_file in os.listdir(input_path):
            with Fast5File(os.path.join(input_path, input_file),
                           'r') as input_f5:
                read_id = input_f5.get_read_id()
                if read_id in self.read_set:
                    read_in = input_f5.get_read(read_id)
                    read_out = output_f5.get_read(read_id)
                    self.assertTrue(numpy.array_equal(
                        read_in.get_raw_data(), read_out.get_raw_data()))
                    count += 1
    self.assertEqual(len(self.read_set), count)
def test_read_interface(self):
    read_ids = generate_read_ids(6)
    f5_file = self.create_multi_file(read_ids)
    with MultiFast5File(f5_file, 'a') as multi_f5:
        # Check we have the read_ids we expect
        self.assertEqual(sorted(read_ids), sorted(multi_f5.get_read_ids()))
        # Try to add another read with the same read_id and expect an error
        with self.assertRaises(ValueError):
            multi_f5.create_empty_read(read_ids[0], run_id)
        # Test we can get a read from the file and that it has the
        # interface we expect
        read_0 = multi_f5.get_read(read_ids[0])
        self.assertTrue(isinstance(read_0, Fast5Read))
        # Test we cannot get a read which doesn't exist
        with self.assertRaises(KeyError):
            multi_f5.get_read("0123")
def try_multi_to_single_conversion(input_file, output_folder, subfolder):
    output_files = []
    with MultiFast5File(input_file, 'r') as multi_f5:
        file_type = check_file_type(multi_f5)
        if file_type != MULTI_READ:
            raise Fast5FileTypeError(
                "Could not convert Multi->Single for file type '{}' with "
                "path '{}'".format(file_type, input_file))
        for read in multi_f5.get_reads():
            try:
                output_file = os.path.join(output_folder, subfolder,
                                           "{}.fast5".format(read.read_id))
                create_single_f5(output_file, read)
                output_files.append(os.path.basename(output_file))
            except Exception as e:
                logger.error("{}\n\tFailed to copy read '{}' from {}"
                             "".format(str(e), read.read_id, input_file),
                             exc_info=exc_info)
    return output_files
def convert_multi_to_single(input_file, output_folder, read_ids, subfolder):
    results = deque([os.path.basename(input_file)])
    try:
        with MultiFast5File(input_file, 'r') as multi_f5:
            for read_id in multi_f5.get_read_ids():
                if read_ids and read_id not in read_ids:
                    continue
                try:
                    read = multi_f5.get_read(read_id)
                    output_file = os.path.join(output_folder, subfolder,
                                               "{}.fast5".format(read_id))
                    create_single_f5(output_file, read)
                    results.append(os.path.basename(output_file))
                except Exception as e:
                    logger.error("{}\n\tFailed to copy read '{}' from {}"
                                 "".format(str(e), read_id, input_file),
                                 exc_info=exc_info)
    except Exception as e:
        logger.error("{}\n\tFailed to copy files from: {}"
                     "".format(e, input_file), exc_info=exc_info)
    finally:
        return results
def test_check_multi_read(self):
    input_folder = os.path.join(test_data, 'vbz_reads')
    # Build the expected results
    expected_results = []
    for input_file in os.listdir(input_folder):
        input_path = os.path.join(input_folder, input_file)
        with MultiFast5File(input_path, 'r') as f5:
            for read in f5.get_reads():
                expected_results.append((VBZ, read.read_id, input_path))
    # Test checking all reads
    compression_results = list(check_compression(input_folder,
                                                 recursive=False,
                                                 follow_symlinks=False,
                                                 check_all_reads=True))
    self.assertTrue(numpy.array_equal(expected_results,
                                      compression_results))
    # Check one read only
    compression_results = list(check_compression(input_folder,
                                                 recursive=False,
                                                 follow_symlinks=False,
                                                 check_all_reads=False))
    self.assertTrue(len(compression_results)
                    == len(os.listdir(input_folder)))
    self.assertTrue(compression_results[0] in expected_results)