def extract(self, input, output, format='single'):
    """Extract a raw fast5 batch into *output* in the requested layout.

    Supported batch types (decided by the extension of *input*):
      * ``.tar``   -- archive of single-read fast5 files
      * ``.fast5`` -- multi-read (bulk) fast5 file
      * ``.txt``   -- newline-separated read IDs, resolved via the index

    :param input: path to the batch file.
    :param output: destination directory (created if missing).
    :param format: 'single', 'bulk' or 'lazy' ('lazy' keeps whatever layout
        the batch already has).
    :raises RuntimeError: if *input* is a .txt file but no index was loaded,
        or if the batch extension is not supported.
    """
    if not os.path.exists(output):
        os.makedirs(output)
    batch_name, batch_ext = os.path.splitext(input)
    # packed single reads in tar archive
    if batch_ext == '.tar':
        if format in ['single', 'lazy']:
            # NOTE(review): extractall on an untrusted archive is unsafe
            # pre-3.12 (path traversal); consider filter='data' — TODO confirm
            # the archives are trusted.
            with tarfile.open(input) as fp_tar:
                fp_tar.extractall(path=output)
        else:
            # unpack to a scratch dir, then repack as one bulk file
            with tempfile.TemporaryDirectory(
                    prefix=self.tmp_prefix) as tmpdirname, tarfile.open(
                        input) as fp_tar:
                fp_tar.extractall(path=tmpdirname)
                self._singles_to_bulk(tmpdirname, output, batch_name)
    # bulk fast5
    elif batch_ext == '.fast5':
        if format in ['bulk', 'lazy']:
            shutil.copy(input, output)
        else:
            multi_to_single_fast5.convert_multi_to_single(
                input, output, '')
    # read IDs to be extracted
    elif batch_ext == '.txt':
        # load index and requested IDs
        if not self.index_dict:
            raise RuntimeError(
                "[Error] Extraction of reads from IDs without index file provided."
            )
        with open(input, 'r') as fp:
            # drop empty lines; strip surrounding whitespace from each ID
            batch_ids = [
                read_id.strip() for read_id in fp.read().split('\n')
                if read_id
            ]
        if format in ['single', 'lazy']:
            self.__copy_reads_to__(batch_ids, output)
        else:
            # copy singles to a scratch dir, then repack as one bulk file
            with tempfile.TemporaryDirectory(
                    prefix=self.tmp_prefix) as tmpdirname:
                self.__copy_reads_to__(batch_ids, tmpdirname)
                self._singles_to_bulk(tmpdirname, output, batch_name)
    else:
        raise RuntimeError(
            '[ERROR] Raw fast5 batch extension {} not supported.'.format(
                batch_ext))

def _singles_to_bulk(self, src_dir, output, batch_name):
    """Pack every single-read .fast5 under *src_dir* into one bulk file.

    The bulk file is written to *output* and named after *batch_name*.
    Shared by the .tar and .txt branches of :meth:`extract`.
    """
    f5files = [
        os.path.join(dirpath, f)
        for dirpath, _, files in os.walk(src_dir)
        for f in files if f.endswith('.fast5')
    ]
    output_bulk_file = os.path.join(
        output, os.path.basename(batch_name) + '.fast5')
    single_to_multi_fast5.create_multi_read_file(
        f5files, output_bulk_file)
def check_file_type(myfile):
    """Convert *myfile* to single-read fast5 files if it is multi-read.

    Bug fix: the original body referenced the undefined names ``root``,
    ``name`` and ``directory`` (copied from a directory-walking loop) and
    never used its own ``myfile`` argument, so every call raised NameError.

    :param myfile: path to a .fast5 file to inspect.
    """
    fobj = fast5_interface.get_fast5_file(myfile)
    try:
        is_multi = fast5_interface.check_file_type(fobj) == "multi-read"
    finally:
        # close the handle; the original leaked it
        fobj.close()
    if is_multi:
        # convert file to single fast5
        print("converting fast5 file****")
        # output next to the input file — TODO confirm the intended
        # destination directory with the caller
        multi_to_single_fast5.convert_multi_to_single(
            myfile, os.path.dirname(myfile), "single")
def convert_fast5_type(directory):
    """Walk *directory* and split every multi-read fast5 into single-read files.

    Improvements over the original: the file list is snapshotted before any
    conversion so files/folders created by the conversion are not re-visited
    by ``os.walk``, and each fast5 handle is closed (the original leaked one
    open handle per file).

    :param directory: root directory to scan; also the conversion output root.
    """
    # go through fast5 files and check if the files is multi or single fast5
    # file; we need a single fast5 file
    fast5_paths = [
        os.path.join(root, name)
        for root, _, files in os.walk(directory)
        for name in files if name.endswith(".fast5")
    ]
    for path in fast5_paths:
        fobj = fast5_interface.get_fast5_file(path)
        try:
            is_multi = fast5_interface.check_file_type(fobj) == "multi-read"
        finally:
            fobj.close()
        if is_multi:
            # convert file to single fast5
            print("converting fast5 file****")
            multi_to_single_fast5.convert_multi_to_single(
                path, directory, "single")
def test_multi_to_single(self):
    """Splitting a multi-read file yields one single-read file per read ID."""
    input_file = os.path.join(test_data, "multi_read", "batch_0.fast5")
    subfolder = '0'
    with MultiFast5File(input_file, 'r') as f5:
        read_count = len(f5.handle)
        expected_files = sorted(
            os.path.join(self.save_path, "{}", read_id + '.fast5')
            for read_id in f5.get_read_ids())
    convert_multi_to_single(input_file, self.save_path, subfolder)
    out_files = sorted(
        get_fast5_file_list(self.save_path,
                            recursive=True,
                            follow_symlinks=True))
    self.assertEqual(len(out_files), read_count)
    self.assertEqual(out_files,
                     [f.format(subfolder) for f in expected_files])
def test_multi_to_single(self):
    """Splitting a multi-read file yields one single-read file per read ID."""
    input_file = os.path.join(test_data, "multi_read", "batch_0.fast5")
    subfolder = '0'
    with MultiFast5File(input_file, 'r') as f5:
        read_count = len(f5.handle)
        expected_files = sorted(
            os.path.join(save_path, "{}", read_id + '.fast5')
            for read_id in f5.get_read_ids())
    convert_multi_to_single(input_file, save_path, subfolder)
    out_files = sorted(get_fast5_file_list(save_path, recursive=True))
    self.assertEqual(len(out_files), read_count)
    self.assertEqual(out_files,
                     [f.format(subfolder) for f in expected_files])
    # Small batch size should be split across multiple folders
    shutil.rmtree(save_path)