# Example 1
 def extract(self, input, output, format='single'):
     """Extract the raw fast5 reads of one batch into *output*.

     :param input: path to a batch. Supported forms: a ``.tar`` archive
         of single-read fast5 files, a bulk (multi-read) ``.fast5``
         file, or a ``.txt`` file listing read IDs (one per line).
     :param output: destination directory; created if it does not exist.
     :param format: ``'single'`` to emit single-read files, ``'bulk'``
         to emit one multi-read file, ``'lazy'`` to keep whichever
         layout the input already has.
     :raises RuntimeError: on an unsupported batch extension, or when a
         ``.txt`` ID list is given but no index file was loaded.
     """
     def pack_to_bulk(src_dir):
         # Collect every single-read fast5 below src_dir and pack them
         # into one multi-read file named after the batch. Shared by
         # the .tar and .txt branches (was duplicated inline).
         f5files = [
             os.path.join(dirpath, f)
             for dirpath, _, files in os.walk(src_dir)
             for f in files if f.endswith('.fast5')
         ]
         output_bulk_file = os.path.join(
             output,
             os.path.basename(batch_name) + '.fast5')
         single_to_multi_fast5.create_multi_read_file(
             f5files, output_bulk_file)

     if not os.path.exists(output):
         os.makedirs(output)
     batch_name, batch_ext = os.path.splitext(input)
     # packed single reads in tar archive
     if batch_ext == '.tar':
         if format in ['single', 'lazy']:
             with tarfile.open(input) as fp_tar:
                 fp_tar.extractall(path=output)
         else:
             # bulk requested: unpack to a scratch dir, then re-pack
             with tempfile.TemporaryDirectory(
                     prefix=self.tmp_prefix) as tmpdirname, tarfile.open(
                         input) as fp_tar:
                 fp_tar.extractall(path=tmpdirname)
                 pack_to_bulk(tmpdirname)
     # bulk fast5
     elif batch_ext == '.fast5':
         if format in ['bulk', 'lazy']:
             shutil.copy(input, output)
         else:
             multi_to_single_fast5.convert_multi_to_single(
                 input, output, '')
     # read IDs to be extracted
     elif batch_ext == '.txt':
         # load index and requested IDs
         if not self.index_dict:
             raise RuntimeError(
                 "[Error] Extraction of reads from IDs without index file provided."
             )
         with open(input, 'r') as fp:
             # was: `[id.strip() for id in ...]` — shadowed builtin `id`
             batch_ids = [
                 line.strip() for line in fp.read().splitlines() if line
             ]
         if format in ['single', 'lazy']:
             self.__copy_reads_to__(batch_ids, output)
         else:
             with tempfile.TemporaryDirectory(
                     prefix=self.tmp_prefix) as tmpdirname:
                 self.__copy_reads_to__(batch_ids, tmpdirname)
                 pack_to_bulk(tmpdirname)
     else:
         raise RuntimeError(
             '[ERROR] Raw fast5 batch extension {} not supported.'.format(
                 batch_ext))
# Example 2
def check_file_type(myfile):
    """If *myfile* is a multi-read fast5 file, split it into single-read files.

    Single-read files are written next to the input (output directory is
    the input file's parent, subfolder name "single").

    Bug fixes vs. the original: the *myfile* parameter was ignored in
    favour of undefined globals (``root``, ``name``, ``directory``), and
    the fast5 handle was never closed.

    :param myfile: path to a ``.fast5`` file.
    """
    fobj = fast5_interface.get_fast5_file(myfile)
    try:
        is_multi = fast5_interface.check_file_type(fobj) == "multi-read"
    finally:
        # Close before converting so the converter gets exclusive access.
        fobj.close()
    if is_multi:
        # convert file to single fast5
        print("converting fast5 file****")
        multi_to_single_fast5.convert_multi_to_single(
            myfile, os.path.dirname(myfile), "single")
# Example 3
def convert_fast5_type(directory):
    """Split every multi-read fast5 found under *directory* into single reads.

    Walks *directory* recursively; each multi-read ``.fast5`` file is
    converted into single-read files under *directory* (subfolder
    "single"). Single-read files are left untouched.

    Bug fix vs. the original: the handle returned by
    ``fast5_interface.get_fast5_file`` was leaked for every file inspected.

    :param directory: root directory to scan and to write output into.
    """
    for root, _, files in os.walk(directory):
        for name in files:
            if not name.endswith(".fast5"):
                continue
            path = os.path.join(root, name)
            fobj = fast5_interface.get_fast5_file(path)
            try:
                is_multi = (
                    fast5_interface.check_file_type(fobj) == "multi-read")
            finally:
                # Release the handle before converting the file.
                fobj.close()
            if is_multi:
                print("converting fast5 file****")
                multi_to_single_fast5.convert_multi_to_single(
                    path, directory, "single")
# Example 4
    def test_multi_to_single(self):
        """Splitting a multi-read file yields exactly one fast5 per read ID."""
        source = os.path.join(test_data, "multi_read", "batch_0.fast5")
        # Derive the ground truth from the multi-read file itself:
        # how many reads it holds and which per-read filenames to expect.
        with MultiFast5File(source, 'r') as multi:
            n_reads = len(multi.handle)
            wanted = [
                os.path.join(self.save_path, "{}", rid + '.fast5')
                for rid in multi.get_read_ids()
            ]
        wanted.sort()

        folder = '0'
        convert_multi_to_single(source, self.save_path, folder)

        produced = get_fast5_file_list(self.save_path,
                                       recursive=True,
                                       follow_symlinks=True)
        produced = sorted(produced)
        self.assertEqual(len(produced), n_reads)
        self.assertEqual(produced,
                         [p.format(folder) for p in wanted])
    def test_multi_to_single(self):
        input_file = os.path.join(test_data, "multi_read", "batch_0.fast5")
        with MultiFast5File(input_file, 'r') as f5:
            read_count = len(f5.handle)
            expected_files = sorted([
                os.path.join(save_path, "{}", i + '.fast5')
                for i in f5.get_read_ids()
            ])

        subfolder = '0'
        convert_multi_to_single(input_file, save_path, subfolder)

        out_files = sorted(get_fast5_file_list(save_path, recursive=True))
        self.assertEqual(len(out_files), read_count)
        self.assertEqual(out_files,
                         [f.format(subfolder) for f in expected_files])

        # Small batch size should be split across multiple folders
        shutil.rmtree(save_path)