import os
import shutil
import sys
from site import addsitedir

import support  # shared pipeline helpers: error/warning reporting, sys_call, which


def compress_dataset_files(dataset_data, ext_python_modules_home, max_threads, log):
    log.info("\n== Compressing corrected reads (with gzip)")
    to_compress = []
    for reads_library in dataset_data:
        for key, value in reads_library.items():
            if key.endswith('reads'):
                compressed_reads_filenames = []
                for reads_file in value:
                    if not os.path.isfile(reads_file):
                        support.error('something went wrong and file with corrected reads (' +
                                      reads_file + ') is missing!', log)
                    to_compress.append(reads_file)
                    compressed_reads_filenames.append(reads_file + '.gz')
                reads_library[key] = compressed_reads_filenames
    if len(to_compress):
        # Prefer pigz (parallel gzip) when it is available on PATH.
        pigz_path = support.which('pigz')
        if pigz_path:
            for reads_file in to_compress:
                support.sys_call([pigz_path, '-f', '-7', '-p', str(max_threads), reads_file], log)
        else:
            # Fall back to plain gzip, parallelized across files with the bundled joblib.
            addsitedir(ext_python_modules_home)
            if sys.version.startswith('2.'):
                from joblib2 import Parallel, delayed
            elif sys.version.startswith('3.'):
                from joblib3 import Parallel, delayed
            n_jobs = min(len(to_compress), max_threads)
            outputs = Parallel(n_jobs=n_jobs)(delayed(support.sys_call)(['gzip', '-f', '-7', reads_file])
                                              for reads_file in to_compress)
            for output in outputs:
                if output:
                    log.info(output)
def move_dataset_files(dataset_data, dst, ext_python_modules_home, max_threads, log, gzip=False):
    to_compress = []
    for reads_library in dataset_data:
        for key, value in reads_library.items():
            if key.endswith('reads'):
                moved_reads_files = []
                for reads_file in value:
                    dst_filename = os.path.join(dst, os.path.basename(reads_file))
                    # TODO: fix problem with files with the same basenames in Hammer binary!
                    if not os.path.isfile(reads_file):
                        # The source file was already moved: either it is shared
                        # between libraries (warn) or it was never produced (error).
                        if ((not gzip and os.path.isfile(dst_filename)) or
                                (gzip and os.path.isfile(dst_filename + '.gz'))):
                            support.warning('file with corrected reads (' + reads_file +
                                            ') is the same in several libraries', log)
                            if gzip:
                                dst_filename += '.gz'
                        else:
                            support.error('something went wrong and file with corrected reads (' +
                                          reads_file + ') is missing!', log)
                    else:
                        shutil.move(reads_file, dst_filename)
                        if gzip:
                            to_compress.append(dst_filename)
                            dst_filename += '.gz'
                    moved_reads_files.append(dst_filename)
                reads_library[key] = moved_reads_files
    if len(to_compress):
        # Same compression strategy as above: pigz if present, otherwise gzip via joblib.
        pigz_path = support.which('pigz')
        if pigz_path:
            for reads_file in to_compress:
                support.sys_call([pigz_path, '-f', '-7', '-p', str(max_threads), reads_file], log)
        else:
            addsitedir(ext_python_modules_home)
            if sys.version.startswith('2.'):
                from joblib2 import Parallel, delayed
            elif sys.version.startswith('3.'):
                from joblib3 import Parallel, delayed
            n_jobs = min(len(to_compress), max_threads)
            outputs = Parallel(n_jobs=n_jobs)(delayed(support.sys_call)(['gzip', '-f', '-7', reads_file])
                                              for reads_file in to_compress)
            for output in outputs:
                if output:
                    log.info(output)
def compress_dataset_files(input_file, ext_python_modules_home, max_threads, log,
                           not_used_yaml_file, output_dir, gzip_output):
    addsitedir(ext_python_modules_home)
    if sys.version.startswith("2."):
        import pyyaml2 as pyyaml
        from joblib2 import Parallel, delayed
    elif sys.version.startswith("3."):
        import pyyaml3 as pyyaml
        from joblib3 import Parallel, delayed

    dataset_data = pyyaml.load(open(input_file))
    remove_not_corrected_reads(output_dir)
    is_changed = False
    if gzip_output:
        is_changed = True
        pigz_path = support.which("pigz")
        if pigz_path:
            compressor = "pigz"
        else:
            compressor = "gzip"
        log.info("\n== Compressing corrected reads (with %s)" % compressor)
        to_compress = []
        for reads_library in dataset_data:
            for key, value in reads_library.items():
                if key.endswith("reads"):
                    compressed_reads_filenames = []
                    for reads_file in value:
                        compressed_reads_filenames.append(reads_file + ".gz")
                        to_compress.append(reads_file)
                    reads_library[key] = compressed_reads_filenames
        if len(to_compress):
            # Verify all corrected-read files exist before launching the compressor.
            for reads_file in to_compress:
                if not os.path.isfile(reads_file):
                    support.error("something went wrong and file with corrected reads (%s) is missing!"
                                  % reads_file, log)
            if pigz_path:
                for reads_file in to_compress:
                    support.sys_call([pigz_path, "-f", "-7", "-p", str(max_threads), reads_file], log)
            else:
                n_jobs = min(len(to_compress), max_threads)
                outputs = Parallel(n_jobs=n_jobs)(delayed(support.sys_call)(["gzip", "-f", "-7", reads_file])
                                                  for reads_file in to_compress)
                for output in outputs:
                    if output:
                        log.info(output)

    if not_used_yaml_file != "":
        # Merge libraries that were skipped by read correction back into the dataset.
        is_changed = True
        not_used_dataset_data = pyyaml.load(open(not_used_yaml_file))
        dataset_data += not_used_dataset_data
    if is_changed:
        # Rewrite the dataset YAML so downstream stages see the .gz filenames.
        with open(input_file, 'w') as f:
            pyyaml.dump(dataset_data, f,
                        default_flow_style=False, default_style='"', width=float("inf"))
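
# Minimal usage sketch (hypothetical, not part of the original pipeline): it
# drives the YAML-based compress_dataset_files() above. All paths and the
# ext_python_modules_home value below are placeholders; only the `logging`
# setup uses real stdlib calls.
if __name__ == '__main__':
    import logging

    log = logging.getLogger('spades')
    log.addHandler(logging.StreamHandler())
    log.setLevel(logging.INFO)
    compress_dataset_files(input_file='corrected/corrected.yaml',      # placeholder path
                           ext_python_modules_home='ext/python_libs',  # placeholder path
                           max_threads=4,
                           log=log,
                           not_used_yaml_file='',   # no skipped libraries to merge back
                           output_dir='corrected',  # placeholder path
                           gzip_output=True)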