import glob
import logging
import os
import sys
import tarfile
import tempfile

# MultiFast5File, Fast5File, EmptyFast5, is_multi_read and the compress_* / fast5_Index
# helpers used below are provided by ont_fast5_api and the surrounding project; exact
# module paths depend on the installed version.


def compress_file(input_file, output_file, target_compression):
    try:
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        if is_multi_read(input_file):
            with MultiFast5File(input_file, 'r') as input_f5, \
                    MultiFast5File(output_file, 'a') as output_f5:
                for read in input_f5.get_reads():
                    compress_read_from_multi(output_f5, read, target_compression)
        else:
            with Fast5File(input_file, 'r') as input_f5, \
                    EmptyFast5(output_file, 'a') as output_f5:
                compress_read_from_single(output_f5, input_f5, target_compression)
    except Exception as e:
        # Errors raised in Pool.async will be lost, so we explicitly log them.
        logging.exception(e)
        raise
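
# Usage sketch: compress a single- or multi-read Fast5 into a VBZ-compressed copy.
# Assumes VBZ is importable from ont_fast5_api.compression_settings; the file paths
# are placeholders.
def example_compress_one_file():
    from ont_fast5_api.compression_settings import VBZ
    compress_file("raw/batch_0.fast5", "compressed/batch_0.fast5", VBZ)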
def read_generator(input_file, read_set):
    """
    Open input_file as a multi-read Fast5 and yield a tuple (read_id, Group)
    for every read_id that is present in read_set.

    :param input_file: path to a multi-read Fast5 file
    :param read_set: set of read ids to extract
    :return: generator of (read_id, h5py Group) tuples
    """
    with MultiFast5File(str(input_file), 'r') as input_f5:
        read_ids = input_f5.get_read_ids()
        if len(read_ids) == 0:
            if not is_multi_read(input_file):
                raise TypeError("Filtering from single-read Fast5 not supported")
        for read in read_set.intersection(read_ids):
            group = input_f5.handle["read_" + read]
            yield read, group
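
# Usage sketch: copy the read groups selected by read_generator() into a new
# multi-read Fast5 by copying each "read_<id>" group through the underlying h5py
# handle. This is a raw h5py Group copy; file-level attributes of the output are
# whatever MultiFast5File sets on creation. Paths and read ids are placeholders.
def example_filter_reads(input_file, output_file, wanted_ids):
    with MultiFast5File(output_file, 'a') as output_f5:
        for read_id, group in read_generator(input_file, set(wanted_ids)):
            # copies the whole group, datasets and attributes included
            output_f5.handle.copy(group, "read_" + read_id)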
def compress_file(input_file, output_file, target_compression, sanitize=False):
    try:
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        if is_multi_read(input_file):
            with MultiFast5File(input_file, 'r') as input_f5, \
                    MultiFast5File(output_file, 'a') as output_f5:
                for read in input_f5.get_reads():
                    output_f5.add_existing_read(read, target_compression, sanitize=sanitize)
        else:
            with Fast5File(input_file, 'r') as input_f5, \
                    EmptyFast5(output_file, 'a') as output_f5:
                compress_single_read(output_f5, input_f5, target_compression, sanitize=sanitize)
    except Exception as e:
        # Errors raised in Pool.async will be lost, so we explicitly log them.
        logging.exception(e)
        raise
    return input_file, output_file
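
# Usage sketch: fan compress_file out over a multiprocessing.Pool, which is why the
# except block above logs before re-raising. The (input_file, output_file) return
# value is collected from each AsyncResult. VBZ from ont_fast5_api.compression_settings
# and the (src, dst) pairs are assumptions.
def example_compress_in_parallel(file_pairs, threads=4):
    import multiprocessing
    from ont_fast5_api.compression_settings import VBZ
    with multiprocessing.Pool(threads) as pool:
        results = [pool.apply_async(compress_file, (src, dst, VBZ))
                   for src, dst in file_pairs]
        # get() re-raises any worker exception and returns the (input, output) pair
        return [r.get() for r in results]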
def index(input, recursive=False, output_prefix="", tmp_prefix=None):
    if tmp_prefix and not os.path.exists(tmp_prefix):
        os.makedirs(tmp_prefix)
    input_files = []
    # scan input
    if os.path.isfile(input):
        input_files.append(input)
    else:
        if recursive:
            input_files.extend([os.path.join(dirpath, f)
                                for dirpath, _, files in os.walk(input)
                                for f in files
                                if f.endswith('.fast5') or f.endswith('.tar')])
        else:
            input_files.extend(glob.glob(os.path.join(input, '*.fast5')))
            input_files.extend(glob.glob(os.path.join(input, '*.tar')))
    # index all provided files
    for input_file in input_files:
        input_relative = os.path.normpath(os.path.join(
            output_prefix,
            os.path.dirname(os.path.relpath(input_file, start=input)),
            os.path.basename(input_file)))
        # extract reads from packed tar archive and retrieve read IDs
        if input_file.endswith('.tar'):
            with tempfile.TemporaryDirectory(prefix=tmp_prefix) as tmpdirname, \
                    tarfile.open(input_file) as fp_tar:
                fp_tar.extractall(path=tmpdirname)
                f5files = [os.path.join(dirpath, f)
                           for dirpath, _, files in os.walk(tmpdirname)
                           for f in files if f.endswith('.fast5')]
                for f5file in f5files:
                    try:
                        ID = fast5_Index.__get_ID_single__(f5file)
                    except Exception:
                        print("[ERROR] Failed to open {f5}, skip file for indexing".format(f5=f5file),
                              file=sys.stderr)
                        continue
                    yield '\t'.join([
                        os.path.normpath(os.path.join(
                            input_relative,
                            os.path.relpath(f5file, start=tmpdirname))),
                        ID])
        # bulk and single read fast5
        else:
            if is_multi_read(input_file):
                reads = fast5_Index.__get_ID_multi__(input_file)
                for group, ID in reads:
                    yield '\t'.join((os.path.join(input_relative, group), ID))
            else:
                try:
                    ID = fast5_Index.__get_ID_single__(input_file)
                except Exception:
                    print("[ERROR] Failed to open {f5}, skip file for indexing".format(f5=input_file),
                          file=sys.stderr)
                    continue
                yield '\t'.join([input_relative, ID])
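
# Usage sketch: drive the index() generator over a directory tree and write one
# "<relative path>\t<read id>" record per line to a TSV index file. Assumes index()
# is callable as a free function as defined above; the paths are placeholders.
def example_write_index(input_dir="runs/", index_file="reads.index"):
    with open(index_file, 'w') as fp:
        for record in index(input_dir, recursive=True):
            print(record, file=fp)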