def check_space_needed_and_free(run_id, type_file, run_factor, conf):
    """Compute free and needed space for a type of file and send a warning mail if there is not enough space.

    Arguments:
        run_id: the run id
        type_file: the type of file concerned
        run_factor: factor used to estimate the space needed by the current run
        conf: configuration dictionary
    """

    space_unit = int(conf[type_file + '.space.factor'])
    data_paths = conf[type_file + '.data.path']

    # The space needed does not depend on the data path
    space_needed = space_unit * run_factor

    for data_path in data_paths.split(':'):
        data_path = data_path.strip()
        space_free = common.df(data_path)

        # Check if the space remaining after the run would fall below 5% of the
        # volume capacity (File is java.io.File; this module runs under Jython)
        space_remaining_not_enough = \
            (space_free - space_needed) < (long(File(data_path).getTotalSpace()) * 0.05)

        # Check if enough free space is available
        if (space_needed > space_free) or space_remaining_not_enough:
            error(run_id, type_file + ' files', space_needed, space_free, data_path, conf)
        else:
            log_message(run_id, type_file + ' files', space_needed, space_free, conf)
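# ---------------------------------------------------------------------------
# A minimal usage sketch for check_space_needed_and_free(). The key names
# follow the type_file + '.space.factor' / type_file + '.data.path' pattern
# read above; the 'bcl' prefix, the paths and the values are hypothetical:

conf_example = {
    'bcl.space.factor': str(1024 * 1024 * 1024),       # 1 GB per run-factor unit
    'bcl.data.path': '/storage/bcl01:/storage/bcl02',  # colon-separated paths
}
# check_space_needed_and_free('RUN0001', 'bcl', 2, conf_example)
# -> warns unless 2 GB fit on each path while keeping >= 5% of the volume free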
def __init__(self, name, vol_root):
    self.vol_root = os.path.join(vol_root, '.volume')
    self.name = name
    self.volumes = {}

    # Collect the volumes that contain a directory for this name
    vol = map(lambda b: os.path.join(self.vol_root, b, name), os.listdir(self.vol_root))
    vol = filter(os.path.exists, vol)

    # Record the free space of each volume
    for v in vol:
        v = os.path.realpath(v)
        self.volumes[v] = common.df(v)
def mv(self, src):
    size = common.getsize(src)

    # Sort the volumes by free space, smallest first (best fit)
    ksort = collections.OrderedDict(sorted(self.volumes.items(), key=lambda t: t[1]))

    for k in ksort:
        vsize = self.volumes.get(k, 0)
        if vsize > size:
            common.mv(src, k)
            self.volumes[k] = common.df(k)
            return
    # If no volume has enough free space, the source file is left in place
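# ---------------------------------------------------------------------------
# The two methods above read like members of a volume-balancing helper that
# spreads run directories across several mounted volumes. A hedged sketch of
# how they could be assembled and used; the class name and the '/data' layout
# are assumptions, not confirmed by the source:

class VolumeBalancer(object):
    pass

# Attach the functions defined above as methods (in the original source they
# presumably already live inside such a class)
VolumeBalancer.__init__ = __init__
VolumeBalancer.mv = mv

# balancer = VolumeBalancer('runs', '/data')  # scans /data/.volume/*/runs
# balancer.mv('/tmp/run_180101_XXXX')         # moves to the fullest volume
#                                             # that still has enough room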
def send_mail_if_critical_free_space_available(conf):
    """Check if disk free space is critical. If true, send a mail.

    Arguments:
        conf: configuration dictionary
    """

    for path in get_hiseq_data_paths(conf):
        if os.path.exists(path):
            df = common.df(path)
            free_space_threshold = long(conf[HISEQ_CRITICAL_MIN_SPACE_KEY])
            if df < free_space_threshold:
                common.send_msg(
                    '[Aozan] Critical: Not enough disk space on sequencer storage for current run',
                    'There is only %.2f' % (df / (1024 * 1024 * 1024)) + ' GB left for run storage in ' +
                    path + '. ' +
                    'The current warning threshold is set to %.2f' % (
                        free_space_threshold / (1024 * 1024 * 1024)) + ' GB.',
                    False, conf)
def send_failed_run_message(run_id, secs, conf):
    """Send a mail to inform about a failed run.

    Arguments:
        run_id: the run id
        secs: maximum delay since the end of the run
        conf: configuration dictionary
    """

    run_path = hiseq_run.find_hiseq_run_path(run_id, conf)
    file_to_test = run_path + '/' + run_id + '/RTAComplete.txt'
    last = os.stat(file_to_test).st_mtime

    df = common.df(run_path) / (1024 * 1024 * 1024)
    du = common.du(run_path + '/' + run_id) / (1024 * 1024 * 1024)

    common.send_msg('[Aozan] Failed run ' + run_id + ' on ' + common.get_instrument_name(run_id, conf),
                    'A run (' + run_id + ') has failed on ' + common.get_instrument_name(run_id, conf) +
                    ' at ' + common.time_to_human_readable(last) + '.\n' +
                    'Data for this run can be found at: ' + run_path +
                    '\n\nFor this task %.2f GB has been used and %.2f GB is still free.' % (du, df),
                    False, conf)
def send_mail_if_recent_run(run_id, secs, conf):
    """Send an email to inform that a new run is finished.

    Arguments:
        run_id: the run id
        secs: maximum delay since the end of the run
        conf: configuration dictionary
    """

    run_path = hiseq_run.find_hiseq_run_path(run_id, conf)
    if run_path is False:
        return

    last = hiseq_run.check_end_run_since(run_id, secs, conf)

    if last > 0:
        df = common.df(run_path) / (1024 * 1024 * 1024)
        du = common.du(run_path + '/' + run_id) / (1024 * 1024 * 1024)
        common.send_msg('[Aozan] Ending run ' + run_id + ' on ' + common.get_instrument_name(run_id, conf),
                        'A new run (' + run_id + ') is finished on ' + common.get_instrument_name(run_id, conf) +
                        ' at ' + common.time_to_human_readable(last) + '.\n' +
                        'Data for this run can be found at: ' + run_path +
                        '\n\nFor this task %.2f GB has been used and %.2f GB is still free.' % (du, df),
                        False, conf)
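# ---------------------------------------------------------------------------
# A side note on the byte-to-GB conversions used by the mail functions above:
# under Python 2, df / (1024 * 1024 * 1024) is integer division when df is an
# int or long, so the fractional part is lost before '%.2f' formats it. A
# minimal helper (hypothetical, not part of the original module) that keeps
# the precision:

def _to_gb(size_in_bytes):
    """Convert a size in bytes to gigabytes as a float."""
    return size_in_bytes / (1024.0 * 1024.0 * 1024.0)

# '%.2f GB' % _to_gb(8000000000)                    -> '7.45 GB'
# '%.2f GB' % (8000000000 / (1024 * 1024 * 1024))   -> '7.00 GB' (truncated)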
def demux(run_id, conf):
    """Run the demultiplexing of a run.

    Arguments:
        run_id: the run id
        conf: configuration dictionary
    """

    start_time = time.time()
    common.log('INFO', 'Demux step: Starting', conf)

    reports_data_base_path = conf[REPORTS_DATA_PATH_KEY]
    reports_data_path = common.get_report_run_data_path(run_id, conf)
    samplesheet_filename = build_samplesheet_filename(run_id, conf)
    bcl2fastq_samplesheet_path = conf[TMP_PATH_KEY] + '/' + samplesheet_filename + '.csv'

    input_run_data_path = common.get_input_run_data_path(run_id, conf)
    if input_run_data_path is None:
        return False

    fastq_output_dir = conf[FASTQ_DATA_PATH_KEY] + '/' + run_id
    basecall_stats_prefix = 'basecall_stats_'
    basecall_stats_file = basecall_stats_prefix + run_id + '.tar.bz2'
    run_id_msg = ' for run ' + run_id

    # Check if the root input BCL data directory exists
    if not os.path.exists(input_run_data_path):
        error("Basecalling data directory does not exist",
              "Basecalling data directory does not exist: " + str(input_run_data_path), conf)
        return False

    # Check if the root output FASTQ data directory exists
    if not common.is_dir_exists(FASTQ_DATA_PATH_KEY, conf):
        error("FASTQ data directory does not exist",
              "FASTQ data directory does not exist: " + conf[FASTQ_DATA_PATH_KEY], conf)
        return False

    # Check if the bcl2fastq samplesheets directory exists
    if not common.is_dir_exists(BCL2FASTQ_SAMPLESHEETS_PATH_KEY, conf):
        error("Bcl2fastq samplesheet directory does not exist",
              "Bcl2fastq samplesheet directory does not exist: " + conf[BCL2FASTQ_SAMPLESHEETS_PATH_KEY], conf)
        return False

    # Check if the bcl2fastq base directory exists (only in standalone mode)
    if not common.is_conf_value_equals_true(BCL2FASTQ_USE_DOCKER_KEY, conf):
        if not common.is_dir_exists(BCL2FASTQ_PATH_KEY, conf):
            error("Bcl2fastq directory does not exist",
                  "Bcl2fastq directory does not exist: " + conf[BCL2FASTQ_PATH_KEY], conf)
            return False

    # Check if the temporary directory exists
    if not common.is_dir_exists(TMP_PATH_KEY, conf):
        error("Temporary directory does not exist",
              "Temporary directory does not exist: " + conf[TMP_PATH_KEY], conf)
        return False

    # Check if reports_data_base_path exists
    if not os.path.exists(reports_data_base_path):
        error("Report directory does not exist",
              "Report directory does not exist: " + reports_data_base_path, conf)
        return False

    # Create the report directory for the run if it does not exist
    if not os.path.exists(reports_data_path):
        os.mkdir(reports_data_path)

    # Check if the basecall stats archive already exists
    if os.path.exists(reports_data_path + '/' + basecall_stats_file):
        error('Basecall stats archive already exists for run ' + run_id,
              'Basecall stats archive already exists for run ' + run_id + ': ' + basecall_stats_file, conf)
        return False

    # Check if the output directory already exists
    if os.path.exists(fastq_output_dir):
        error("FASTQ output directory already exists for run " + run_id,
              'FASTQ output directory already exists for run ' + run_id + ': ' + fastq_output_dir, conf)
        return False

    # Compute disk usage and disk free to check if enough disk space is available
    input_path_du = common.du(input_run_data_path)
    output_df = common.df(conf[FASTQ_DATA_PATH_KEY])
    du_factor = float(conf[DEMUX_SPACE_FACTOR_KEY])
    space_needed = input_path_du * du_factor

    common.log("WARNING", "Demux step: input disk usage: " + str(input_path_du), conf)
    common.log("WARNING", "Demux step: output disk free: " + str(output_df), conf)
    common.log("WARNING", "Demux step: space needed: " + str(space_needed), conf)
    common.log("CONFIG", "Bcl2fastq Docker mode: " +
               str(common.is_conf_value_equals_true(Settings.BCL2FASTQ_USE_DOCKER_KEY, conf)), conf)

    # Check if enough free space is available
    if output_df < space_needed:
        error("Not enough disk space to perform demultiplexing for run " + run_id,
              "Not enough disk space to perform demultiplexing for run " + run_id +
              '.\n%.2f GB' % (space_needed / 1024 / 1024 / 1024) + ' is needed (factor x' + str(du_factor) +
              ') on ' + fastq_output_dir + '.', conf)
        return False

    # Load the RunInfo object
    run_info = RunInfo.parse(input_run_data_path + '/RunInfo.xml')

    # Load the samplesheet
    samplesheet, original_samplesheet_path = load_samplesheet(run_id, input_run_data_path,
                                                              samplesheet_filename, conf)
    if samplesheet is None:
        return False

    # Update the samplesheet
    if not update_samplesheet(samplesheet, run_id, run_info.getFlowCellLaneCount(), conf):
        return False

    # Check the samplesheet
    check_result, samplesheet_warnings = check_samplesheet(samplesheet, run_id, run_info.getFlowCell(), conf)
    if not check_result:
        return False

    # Get the number of mismatches
    nb_mismatch = get_bcl2fastq_mismatches(samplesheet, conf[BCL2FASTQ_MISMATCHES_KEY])

    # Write the final samplesheet
    if not write_bcl2fastq_samplesheet(samplesheet, bcl2fastq_samplesheet_path, conf):
        return False

    # Run demultiplexing
    if common.is_conf_value_equals_true(Settings.BCL2FASTQ_USE_DOCKER_KEY, conf):
        # With a Docker image
        if not demux_run_with_docker(run_id, input_run_data_path, fastq_output_dir,
                                     bcl2fastq_samplesheet_path, nb_mismatch, conf):
            return False
    else:
        if not demux_run_standalone(run_id, input_run_data_path, fastq_output_dir,
                                    bcl2fastq_samplesheet_path, nb_mismatch, conf):
            return False

    # Check that the output directory has been created
    if not os.path.exists(fastq_output_dir):
        error("Error while demultiplexing run " + run_id + ' on ' + common.get_instrument_name(run_id, conf),
              'Error while demultiplexing run ' + run_id + '.\n' +
              'The output directory of bcl2fastq has not been created: ' + fastq_output_dir, conf)
        return False

    # Check that the output directory is not a file
    if os.path.isfile(fastq_output_dir):
        error("Error while demultiplexing run " + run_id + ' on ' + common.get_instrument_name(run_id, conf),
              'Error while demultiplexing run ' + run_id + '.\n' +
              'The output directory of bcl2fastq is a file instead of a directory: ' + fastq_output_dir, conf)
        return False

    # Copy the bcl2fastq log to the output directory
    cmd = 'cp ' + quote(conf[TMP_PATH_KEY]) + '/bcl2fastq_output_' + run_id + '.* ' + quote(fastq_output_dir)
    common.log("INFO", "exec: " + cmd, conf)
    if os.system(cmd) != 0:
        error("Error while copying bcl2fastq log to the output FASTQ directory" + run_id_msg,
              'Error while copying bcl2fastq log to the output FASTQ directory.\nCommand line:\n' + cmd, conf)
        return False

    # The output directory must be read only
    if not common.chmod_files_in_dir(fastq_output_dir, ".fastq", conf):
        error("Error while setting the output FASTQ directory to read only" + run_id_msg,
              'Error while setting the output FASTQ directory to read only: ' + fastq_output_dir, conf)
        return False

    if not check_if_output_fastq_files_exists(fastq_output_dir):
        error("Error with bcl2fastq execution for run " + run_id,
              "Error with bcl2fastq execution for run " + run_id + ": no FASTQ file found in " +
              fastq_output_dir, conf)
        return False

    # Copy the samplesheet to the output directory
    cmd = 'cp -p ' + quote(bcl2fastq_samplesheet_path) + ' ' + quote(fastq_output_dir + '/SampleSheet.csv')
    common.log("INFO", "exec: " + cmd, conf)
    if os.system(cmd) != 0:
        error("Error while copying samplesheet file to FASTQ directory for run " + run_id,
              'Error while copying samplesheet file to FASTQ directory.\nCommand line:\n' + cmd, conf)
        return False

    # Create archives of the demultiplexing statistics
    if not archive_demux_stat(run_id, fastq_output_dir, reports_data_path, basecall_stats_file,
                              basecall_stats_prefix, bcl2fastq_samplesheet_path, conf):
        return False

    # Archive the samplesheet
    if not archive_samplesheet(run_id, original_samplesheet_path, bcl2fastq_samplesheet_path, conf):
        return False

    # Remove the temporary samplesheet file
    if os.path.exists(bcl2fastq_samplesheet_path):
        os.remove(bcl2fastq_samplesheet_path)

    # Create the index.html file
    common.create_html_index_file(conf, run_id, [Settings.HISEQ_STEP_KEY, Settings.DEMUX_STEP_KEY])

    df_in_bytes = common.df(fastq_output_dir)
    du_in_bytes = common.du(fastq_output_dir)
    df = df_in_bytes / (1024 * 1024 * 1024)
    du = du_in_bytes / (1024 * 1024 * 1024)
    common.log("WARNING", "Demux step: output disk free after demux: " + str(df_in_bytes), conf)
    common.log("WARNING", "Demux step: space used by demux: " + str(du_in_bytes), conf)

    duration = time.time() - start_time

    msg = 'Ending demultiplexing with ' + str(nb_mismatch) + ' mismatch(es) for run ' + run_id + '.' + \
          '\nJob finished at ' + common.time_to_human_readable(time.time()) + \
          ' without error in ' + common.duration_to_human_readable(duration) + '.\n\n' + \
          'FASTQ files for this run ' + \
          'can be found in the following directory:\n ' + fastq_output_dir

    if samplesheet_warnings.size() > 0:
        msg += '\n\nSamplesheet warnings:'
        for warn in samplesheet_warnings:
            msg += "\n - " + warn

    # Add the path to the report if reports.url exists
    if common.is_conf_key_exists(REPORTS_URL_KEY, conf):
        msg += '\n\nRun reports can be found at the following location:\n ' + conf[REPORTS_URL_KEY] + '/' + run_id

    msg += '\n\nFor this task %.2f GB has been used and %.2f GB is still free.' % (du, df)

    common.send_msg('[Aozan] Ending demultiplexing for run ' + run_id + ' on ' +
                    common.get_instrument_name(run_id, conf), msg, False, conf)
    common.log('INFO', 'Demux step: successful in ' + common.duration_to_human_readable(duration), conf)
    return True
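# ---------------------------------------------------------------------------
# The demux step above and the sync step below gate on the same admission
# test: estimate the space needed as du(input) * step factor and compare it
# with df(output). A self-contained sketch of that test (the function name is
# illustrative, not part of the original module):

def enough_space_for_step(input_path, output_dir, du_factor):
    """Return (ok, needed, free) for a step with the given disk-usage factor."""
    needed = common.du(input_path) * du_factor
    free = common.df(output_dir)
    return free >= needed, needed, free

# ok, needed, free = enough_space_for_step(input_run_data_path,
#                                          conf[FASTQ_DATA_PATH_KEY],
#                                          float(conf[DEMUX_SPACE_FACTOR_KEY]))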
def sync(run_id, conf):
    """Synchronize a run.

    Arguments:
        run_id: the run id
        conf: configuration dictionary
    """

    start_time = time.time()
    common.log('INFO', 'Sync step: Starting', conf)

    bcl_data_path = conf[BCL_DATA_PATH_KEY]
    reports_data_base_path = conf[REPORTS_DATA_PATH_KEY]
    output_path = bcl_data_path + '/' + run_id

    # Check if rsync exists in PATH
    if not common.exists_in_path("rsync"):
        error("Can't find all needed commands in PATH env var",
              "Can't find all needed commands in PATH env var. Unable to find: rsync command.", conf)
        return False

    # Check if reports_data_base_path exists
    if not os.path.exists(reports_data_base_path):
        error("Report directory does not exist",
              "Report directory does not exist: " + reports_data_base_path, conf)
        return False

    # Check if there is enough space to store the reports
    if common.df(reports_data_base_path) < 10 * 1024 * 1024 * 1024:
        error("Not enough disk space to store aozan reports for run " + run_id,
              "Not enough disk space to store aozan reports for run " + run_id +
              '.\nNeed more than 10 GB on ' + reports_data_base_path + '.', conf)
        return False

    # Do the synchronization
    if not partial_sync(run_id, True, conf):
        return False

    # Rename the partial sync directory to the final run BCL directory
    if os.path.exists(output_path + '.tmp'):
        os.rename(output_path + '.tmp', output_path)

    # Check used and free space
    df_in_bytes = common.df(bcl_data_path)
    du_in_bytes = common.du(output_path)
    df = df_in_bytes / (1024 * 1024 * 1024)
    du = du_in_bytes / (1024 * 1024 * 1024)
    common.log("WARNING", "Sync step: output disk free after sync: " + str(df_in_bytes), conf)
    common.log("WARNING", "Sync step: space used by sync: " + str(du_in_bytes), conf)

    duration = time.time() - start_time

    msg = 'Ending synchronization for run ' + run_id + '.\n' + \
          'Job finished at ' + common.time_to_human_readable(time.time()) + \
          ' without error in ' + common.duration_to_human_readable(duration) + '.\n\n' + \
          'Run output files (without .cif files) can be found in the following directory:\n ' + output_path

    # Add the path to the report if reports.url exists
    if common.is_conf_key_exists(REPORTS_URL_KEY, conf):
        msg += '\n\nRun reports can be found at the following location:\n ' + conf[REPORTS_URL_KEY] + '/' + run_id

    msg += '\n\nFor this task %.2f GB has been used and %.2f GB is still free.' % (du, df)

    common.send_msg('[Aozan] Ending synchronization for run ' + run_id + ' on ' +
                    common.get_instrument_name(run_id, conf), msg, False, conf)
    common.log('INFO', 'Sync step: successful in ' + common.duration_to_human_readable(duration), conf)
    return True
def partial_sync(run_id, last_sync, conf):
    """Partially synchronize a run.

    Arguments:
        run_id: the run id
        last_sync: True if this is the last synchronization of the run
        conf: configuration dictionary
    """

    hiseq_data_path = hiseq_run.find_hiseq_run_path(run_id, conf)
    bcl_data_path = conf[BCL_DATA_PATH_KEY]
    final_output_path = bcl_data_path + '/' + run_id

    # Check if hiseq_data_path has been found
    if hiseq_data_path is False:
        error('Sequencer run data not found',
              'Sequencer data for run ' + run_id + ' not found in sequencer directories (' +
              conf[HISEQ_DATA_PATH_KEY] + ')', conf)
        return False

    # Check if hiseq_data_path exists
    if not os.path.exists(hiseq_data_path):
        error("Sequencer directory does not exist",
              "Sequencer directory does not exist: " + hiseq_data_path, conf)
        return False

    # Check if bcl_data_path exists
    if not os.path.exists(bcl_data_path):
        error("Basecalling directory does not exist",
              "Basecalling directory does not exist: " + bcl_data_path, conf)
        return False

    # Check if the final output path already exists
    if os.path.exists(final_output_path):
        error("Basecalling directory for run " + run_id + " already exists",
              "Basecalling directory for run " + run_id + " already exists: " + final_output_path, conf)
        return False

    input_path = hiseq_data_path + '/' + run_id
    output_path = bcl_data_path + '/' + run_id + '.tmp'

    # Create the output path for the run if it does not exist
    if not os.path.exists(output_path):
        os.mkdir(output_path)

    input_path_du = common.du(input_path)
    output_path_du = common.du(output_path)
    output_path_df = common.df(bcl_data_path)
    du_factor = float(conf[SYNC_SPACE_FACTOR_KEY])
    space_needed = input_path_du * du_factor - output_path_du

    common.log("WARNING", "Sync step: input disk usage: " + str(input_path_du), conf)
    common.log("WARNING", "Sync step: output disk free: " + str(output_path_df), conf)
    common.log("WARNING", "Sync step: space needed: " + str(space_needed), conf)

    # Check if enough free space is available
    if output_path_df < space_needed:
        error("Not enough disk space to perform synchronization for run " + run_id,
              "Not enough disk space to perform synchronization for run " + run_id +
              '.\n%.2f GB' % (space_needed / 1024 / 1024 / 1024) + ' is needed (factor x' + str(du_factor) +
              ') on ' + bcl_data_path + '.', conf)
        return False

    # Extract the exclude file list from the sequencer type and the configuration
    exclude_files = get_exclude_files_list(run_id, conf)

    rsync_manifest_path = conf[TMP_PATH_KEY] + '/' + run_id + '.rsync.manifest'
    rsync_params = ''

    if last_sync:
        for exclude_file in exclude_files:
            rsync_params += " --exclude '" + exclude_file + "' "
    else:
        # Exclude files that will be rewritten several times during the run
        exclude_files.extend(['*.bin', '*.txt', '*.xml'])

        # Build a manifest of the files that are old enough to be stable
        cmd = 'cd ' + quote(input_path) + \
              ' && find . -type f -mmin +' + conf[SYNC_CONTINUOUS_SYNC_MIN_AGE_FILES_KEY]
        for exclude_file in exclude_files:
            cmd += " -not -name '" + exclude_file + "' "
        cmd += ' > ' + quote(rsync_manifest_path)
        common.log("INFO", "exec: " + cmd, conf)
        if os.system(cmd) != 0:
            error("Error while executing find for run " + run_id,
                  'Error while executing find.\nCommand line:\n' + cmd, conf)
            return False
        rsync_params = '--files-from=' + quote(rsync_manifest_path)

    # Copy data from the sequencer path to the BCL path
    cmd = 'rsync -a --no-owner --no-group ' + rsync_params + ' ' + \
          quote(input_path + '/') + ' ' + quote(output_path)
    common.log("INFO", "exec: " + cmd, conf)
    if os.system(cmd) != 0:
        error("Error while executing rsync for run " + run_id,
              'Error while executing rsync.\nCommand line:\n' + cmd, conf)
        return False

    if not last_sync:
        os.remove(rsync_manifest_path)

    return True
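# ---------------------------------------------------------------------------
# partial_sync() is called repeatedly while a run is in progress (last_sync
# False) and once more at the end (last_sync True). A condensed sketch of the
# two rsync command shapes it builds; the helper name is illustrative, not
# part of the original module:

def build_rsync_command(input_path, output_path, exclude_files, manifest=None):
    if manifest is not None:
        # Continuous sync: copy only the stable files listed in the manifest
        params = '--files-from=' + quote(manifest)
    else:
        # Final sync: copy everything except the excluded patterns
        params = ' '.join("--exclude '" + e + "'" for e in exclude_files)
    return 'rsync -a --no-owner --no-group ' + params + ' ' + \
           quote(input_path + '/') + ' ' + quote(output_path)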
def qc(run_id, conf):
    """Proceed to the quality control of a run.

    Arguments:
        run_id: the run id
        conf: configuration dictionary
    """

    start_time = time.time()

    # Check if the input run data exists
    input_run_data_path = common.get_input_run_data_path(run_id, conf)
    if input_run_data_path is None:
        error("Basecalling data directory does not exist",
              "Basecalling data directory does not exist.", conf)
        return False

    fastq_input_dir = conf[FASTQ_DATA_PATH_KEY] + '/' + run_id
    reports_data_base_path = conf[REPORTS_DATA_PATH_KEY]
    reports_data_path = reports_data_base_path + '/' + run_id
    qc_output_dir = reports_data_path + '/qc_' + run_id
    tmp_extension = '.tmp'

    common.log('INFO', 'QC step: Starting', conf)

    # Check if the root FASTQ data directory exists
    if not common.is_dir_exists(FASTQ_DATA_PATH_KEY, conf):
        error("FASTQ data directory does not exist",
              "FASTQ data directory does not exist: " + conf[FASTQ_DATA_PATH_KEY], conf)
        return False

    # Create the report directory for the run if it does not exist
    if not os.path.exists(reports_data_path):
        os.mkdir(reports_data_path)

    # Check if the temporary directory exists
    if not common.is_dir_exists(TMP_PATH_KEY, conf):
        error("Temporary directory does not exist",
              "Temporary directory does not exist: " + conf[TMP_PATH_KEY], conf)
        return False

    # Check if the output directory already exists
    if os.path.exists(qc_output_dir):
        error("The quality control report directory already exists for run " + run_id,
              'The quality control report directory already exists for run ' + run_id + ': ' +
              qc_output_dir, conf)
        return False

    # Check if the output archive already exists
    if os.path.exists(reports_data_path + '/qc_' + run_id + '.tar.bz2'):
        error("The quality control report archive already exists for run " + run_id,
              'The quality control report archive already exists for run ' + run_id + ': ' +
              reports_data_path + '/qc_' + run_id + '.tar.bz2', conf)
        return False

    # Check if enough free space is available
    if common.df(conf[REPORTS_DATA_PATH_KEY]) < 1 * 1024 * 1024 * 1024:
        error("Not enough disk space to store aozan quality control for run " + run_id,
              "Not enough disk space to store aozan reports for run " + run_id +
              '.\nNeed more than 1 GB on ' + conf[REPORTS_DATA_PATH_KEY] + '.', conf)
        return False

    # Create a temporary output directory
    qc_output_dir += tmp_extension
    if not os.path.exists(qc_output_dir):
        os.mkdir(qc_output_dir)

    try:
        # Initialize the QC object
        qc = QC(Settings(conf), input_run_data_path, fastq_input_dir, qc_output_dir,
                conf[TMP_PATH_KEY], run_id)

        # Compute the report
        report = qc.computeReport()
    except AozanException, exp:
        error("Error while computing QC report for run " + run_id + ".",
              common.exception_msg(exp, conf), conf)
        return False
    # Check if the report has been generated
    # NOTE: the original snippet uses html_report_file without defining it;
    # the path below is an assumption based on the QC output layout
    html_report_file = qc_output_dir + '/index.html'
    if not os.path.exists(html_report_file):
        error("Error while computing QC report for run " + run_id + ".",
              "No HTML report generated", conf)
        return False

    # The output directory must be read only
    if not common.chmod_files_in_dir(qc_output_dir, None, conf):
        error("Error while setting the output QC directory to read only for run " + run_id,
              'Error while setting the output QC directory to read only: ' + qc_output_dir, conf)
        return False

    # Create the index.html file
    sections = [Settings.HISEQ_STEP_KEY, Settings.DEMUX_STEP_KEY, Settings.QC_STEP_KEY]
    common.create_html_index_file(conf, run_id, sections)

    df_in_bytes = common.df(qc_output_dir)
    du_in_bytes = common.du(qc_output_dir)
    df = df_in_bytes / (1024 * 1024 * 1024)
    du = du_in_bytes / (1024 * 1024 * 1024)
    common.log("WARNING", "QC step: output disk free after QC: " + str(df_in_bytes), conf)
    common.log("WARNING", "QC step: space used by QC: " + str(du_in_bytes), conf)

    duration = time.time() - start_time

    msg = 'Ending quality control for run ' + run_id + '.' + \
          '\nJob finished at ' + common.time_to_human_readable(time.time()) + \
          ' without error in ' + common.duration_to_human_readable(duration) + '. ' + \
          'You will find attached to this message the quality control report.\n\n' + \
          'QC files for this run ' + \
          'can be found in the following directory:\n ' + qc_output_dir

    msg += '\n\nFor this task %.2f GB has been used and %.2f GB is still free.' % (du, df)

    # NOTE: closing sequence assumed, mirroring the sync and demux steps
    common.send_msg('[Aozan] Ending quality control for run ' + run_id + ' on ' +
                    common.get_instrument_name(run_id, conf), msg, False, conf)
    common.log('INFO', 'QC step: successful in ' + common.duration_to_human_readable(duration), conf)
    return True
def recompress(run_id, conf):
    """Proceed to the recompression of a run.

    Arguments:
        run_id: the run id
        conf: configuration dictionary
    """

    common.log('INFO', 'Recompress step: Starting', conf)

    # Check if the root FASTQ data directory exists
    if not common.is_dir_exists(FASTQ_DATA_PATH_KEY, conf):
        error("FASTQ data directory does not exist",
              "FASTQ data directory does not exist: " + conf[FASTQ_DATA_PATH_KEY], conf)
        return False

    start_time = time.time()
    fastq_input_dir = conf[FASTQ_DATA_PATH_KEY] + '/' + run_id

    # Initial disk usage, for comparison with the disk usage at the end
    previous_du_in_bytes = common.du(fastq_input_dir)

    # Get information about the compression type
    compression_type = conf[RECOMPRESS_COMPRESSION_KEY]
    compression_level = conf[RECOMPRESS_COMPRESSION_LEVEL_KEY]
    compression_info_tuple = get_info_from_file_type(compression_type, compression_level)

    if compression_info_tuple is None:
        error("Unknown compression type",
              "Unknown compression type: " + compression_type, conf)
        return False

    (compression_type_result, output_file_extension, output_compression_command,
     output_decompression_command, compression_level_argument) = compression_info_tuple

    # The types of files to recompress
    types_to_recompress = ["fastq.gz", "fastq"]

    # List of programs to check for in PATH before execution
    program_set = {"bash", "tee", "touch", "chmod", "md5sum",
                   output_compression_command, output_decompression_command}

    # Get the list of files to process
    input_files = []
    for extension in types_to_recompress:
        input_files.extend(list_files(fastq_input_dir, extension))
        simple_extension = os.path.splitext(extension)[-1][1:]
        extension_info_tuple = get_info_from_file_type(simple_extension)
        if extension_info_tuple is None:
            error("Unknown extension type",
                  "Unknown extension type: " + extension, conf)
            return False
        program_set.add(extension_info_tuple[3])

    # Check that every needed program exists in PATH
    for program in program_set:
        if not common.exists_in_path(program):
            error("Can't find all needed commands in PATH env var",
                  "Can't find all needed commands in PATH env var. Unable to find: " + program + " command.",
                  conf)
            return False

    # Create an executor for parallel processing
    executor = Executors.newFixedThreadPool(int(conf[RECOMPRESS_THREADS_KEY]))
    workers = []

    # Process each fastq and fastq.gz file found recursively in the FASTQ directory
    for input_file in input_files:
        simple_extension = os.path.splitext(input_file)[-1][1:]

        # Get information about the type of the input file
        extension_info_tuple = get_info_from_file_type(simple_extension)
        if extension_info_tuple is None:
            error("Unknown extension type",
                  "Unknown extension type: " + simple_extension, conf)
            return False

        input_decompression_command = extension_info_tuple[3]

        # Get the file base name and build the output file name
        # (a plain .fastq file is already its own base name)
        base_input_file = input_file[0:input_file.index(".fastq") + 6]
        output_file = base_input_file + "." + output_file_extension

        # Skip if the output file already exists
        if not os.path.exists(output_file):
            # Create a worker, then execute it in a thread of the pool
            worker = Worker(input_file, output_file, input_decompression_command,
                            output_compression_command, output_decompression_command,
                            compression_level_argument,
                            common.is_conf_value_equals_true(RECOMPRESS_DELETE_ORIGINAL_FASTQ_KEY, conf))
            workers.append(worker)
            executor.execute(worker)
        else:
            common.log("WARNING", "Recompress step: Omitting processing file " + input_file +
                       ". The associated output file " + output_file + " already exists.", conf)

    # Wait for all threads to finish
    executor.shutdown()
    while not executor.isTerminated():
        time.sleep(1)

    # Check if any worker ended in error
    for worker in workers:
        if not worker.is_successful():
            error(worker.get_error_message(), worker.get_long_error_message(), conf)
            return False

    # Check the new disk usage
    df_in_bytes = common.df(fastq_input_dir)
    du_in_bytes = common.du(fastq_input_dir)
    previous_du = previous_du_in_bytes / (1024 * 1024)
    df = df_in_bytes / (1024 * 1024 * 1024)
    du = du_in_bytes / (1024 * 1024)
    common.log("WARNING", "Recompress step: output disk free after step: " + str(df_in_bytes), conf)
    common.log("WARNING", "Recompress step: space previously used: " + str(previous_du_in_bytes), conf)
    common.log("WARNING", "Recompress step: space now used by step: " + str(du_in_bytes), conf)

    duration = time.time() - start_time

    msg = 'Ending recompression for run ' + run_id + '.' + \
          '\nJob finished at ' + common.time_to_human_readable(time.time()) + \
          ' without error in ' + common.duration_to_human_readable(duration) + '. '
    msg += '\n\nAfter the recompress step the FASTQ folder is now %.2f MB (previously %.2f MB) ' \
           'and %.2f GB is still free.' % (du, previous_du, df)

    common.send_msg('[Aozan] Ending recompress for run ' + run_id + ' on ' +
                    common.get_instrument_name(run_id, conf), msg, False, conf)
    common.log('INFO', 'Recompress step: successful in ' + common.duration_to_human_readable(duration), conf)
    return True
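# ---------------------------------------------------------------------------
# The recompress step submits Worker objects to a java.util.concurrent thread
# pool, so Worker must implement java.lang.Runnable (this module runs under
# Jython). A minimal hedged skeleton of such a worker; the real Worker also
# carries the compression commands and error messages used above:

from java.lang import Runnable

class ExampleWorker(Runnable):  # hypothetical, for illustration only
    def __init__(self, input_file, output_file):
        self.input_file = input_file
        self.output_file = output_file
        self.success = False

    def run(self):
        # The real worker decompresses, recompresses and checksums here
        self.success = True

    def is_successful(self):
        return self.success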