def discover_finished_runs(denied_runs, conf):
    """Discover newly ended runs and send the matching notifications.

    The already processed run ids are loaded first. When the step identified
    by HISEQ_STEP_KEY is enabled in the configuration, every available
    finished run that is neither processed nor denied is handled: a run with
    a read count of 0 is reported as failed and added to the denied run ids,
    any other run is recorded as processed once its summary reports have been
    created.

    Arguments:
        denied_runs: set of run ids that must not be processed
        conf: configuration object

    Returns:
        the set of processed run ids, including the runs handled by this call
    """

    run_ids_done = load_processed_run_ids(conf)

    if not common.is_conf_value_equals_true(HISEQ_STEP_KEY, conf):
        return run_ids_done

    for run_id in (get_available_finished_run_ids(conf) - run_ids_done - denied_runs):

        if not run_id:
            # Skip an empty run id instead of aborting the whole discovery:
            # the other runs still have to be handled and the function must
            # always return the set of processed run ids
            continue

        aozan.welcome(conf)
        common.log('INFO', 'Ending run detection ' + str(run_id) + ' on ' + common.get_instrument_name(run_id, conf),
                   conf)

        if hiseq_run.get_read_count(run_id, conf) == 0:
            # No read at all: the run is considered as failed and will not be processed again
            send_failed_run_message(run_id, common.MAX_DELAY_TO_SEND_TERMINATED_RUN_EMAIL, conf)
            add_run_id_to_denied_run_ids(run_id, conf)
            create_run_summary_reports(run_id, conf)
        elif create_run_summary_reports(run_id, conf):
            # The run is recorded as processed only if its reports have been created
            send_mail_if_recent_run(run_id, common.MAX_DELAY_TO_SEND_TERMINATED_RUN_EMAIL, conf)
            add_run_id_to_processed_run_ids(run_id, conf)
            run_ids_done.add(run_id)

    return run_ids_done
def discover_new_runs(denied_runs, conf):
    """Detect the new runs and send their first base report.

    Arguments:
        denied_runs: run ids that must be ignored
        conf: configuration object
    """

    known_run_ids = load_processed_run_ids(conf)

    # Nothing to do when the first base report step is not enabled
    if not common.is_conf_value_equals_true(FIRST_BASE_REPORT_STEP_KEY, conf):
        return

    candidate_run_ids = get_available_new_run_ids(conf) - known_run_ids - denied_runs

    for run_id in candidate_run_ids:
        aozan.welcome(conf)

        log_text = 'First base report ' + run_id + ' on sequencer '
        log_text += common.get_instrument_name(run_id, conf)
        common.log('INFO', log_text, conf)

        # A run is recorded as processed only once its report has been sent
        report_sent = send_report(run_id, conf)
        if report_sent:
            add_run_id_to_processed_run_ids(run_id, conf)
            known_run_ids.add(run_id)

        # The space needed by the run is estimated whether the report was sent or not
        estimate_space_needed.estimate(run_id, conf)
def send_failed_run_message(run_id, secs, conf):
    """Send a mail to inform about a failed run.

    The mail contains the modification time of the RTAComplete.txt file of
    the run and the disk space used by the run and still free on its storage.
    Nothing is sent when the path of the run cannot be found.

    Arguments:
        run_id: the run id
        secs: not used by this function
        conf: configuration dictionary
    """

    run_path = hiseq_run.find_hiseq_run_path(run_id, conf)
    if run_path is False:
        # Same guard as in send_mail_if_recent_run(): without a run path the
        # string concatenations below would raise an error
        return

    run_dir = run_path + '/' + run_id
    last = os.stat(run_dir + '/RTAComplete.txt').st_mtime

    df = common.df(run_path) / (1024 * 1024 * 1024)
    du = common.du(run_dir) / (1024 * 1024 * 1024)

    instrument_name = common.get_instrument_name(run_id, conf)

    common.send_msg('[Aozan] Failed run ' + run_id + ' on ' + instrument_name,
                    'A run (' + run_id + ') has failed on ' + instrument_name +
                    ' at ' + common.time_to_human_readable(last) + '.\n' + 'Data for this run can be found at: ' +
                    run_path + '\n\nFor this task %.2f GB has been used and %.2f GB still free.' % (du, df),
                    False, conf)
# Example #4
# 0
def demux_run_standalone(run_id, input_run_data_path, fastq_output_dir, samplesheet_csv_path, nb_mismatch, conf):
    """Demultiplex a run by launching bcl2fastq directly on this host.

    Arguments:
        run_id: The run id
        input_run_data_path: input run data path to demultiplex
        fastq_output_dir: directory where the FASTQ files must be written
        samplesheet_csv_path: samplesheet path in csv format, version used by bcl2fastq
        nb_mismatch: mismatch setting forwarded to the command line builder
        conf: configuration dictionary

    Returns:
        True if bcl2fastq exits with code 0, False otherwise
    """

    executable = conf[BCL2FASTQ_PATH_KEY]
    tmp_dir = conf[TMP_PATH_KEY]

    run_suffix = " for run " + run_id + ' on ' + common.get_instrument_name(run_id, conf)
    stderr_log = tmp_dir + "/bcl2fastq_output_" + run_id + ".err"

    # The configured path can be either the directory containing bcl2fastq or the executable itself
    if os.path.isdir(executable):
        executable += '/bcl2fastq'
    elif not os.path.isfile(executable):
        invalid_path_text = "Error while setting executable command file bcl2fastq" + run_suffix + \
                            ", invalid bcl2fastq path: " + executable
        error(invalid_path_text, invalid_path_text, conf)
        return False

    cmd = create_bcl2fastq_command_line(run_id, executable, input_run_data_path, fastq_output_dir,
                                        samplesheet_csv_path, tmp_dir, nb_mismatch, conf)

    common.log('INFO', 'Demultiplexing in standalone mode using the following command line: ' + str(cmd), conf)

    exit_code = os.system(cmd)
    if exit_code == 0:
        return True

    # bcl2fastq failed: report the error, then send a mail with the log file if there is one
    failure_details = ' (exit code: ' + str(exit_code) + ').\nCommand line:\n' + cmd
    error("Error while executing bcl2fastq " + run_suffix,
          'Error while executing bcl2fastq' + failure_details, conf)

    mail_subject = '[Aozan] Failed demultiplexing ' + run_suffix
    mail_body = 'Error while executing bcl2fastq ' + run_suffix + failure_details

    if os.path.exists(stderr_log):
        mail_body += "\n\nPlease check the attached bcl2fastq output error file."
        common.send_msg_with_attachment(mail_subject, mail_body, stderr_log, True, conf)
    else:
        error("Error with bcl2fastq log for run " + run_id + ".", "No bcl2fastq log available", conf)
        common.send_msg(mail_subject, mail_body, True, conf)

    return False
def send_mail_if_recent_run(run_id, secs, conf):
    """Notify by mail that a run has just ended.

    No mail is sent when the run path cannot be found or when
    hiseq_run.check_end_run_since() does not return a positive value.

    Arguments:
        run_id: run id
        secs: maximum delay since the end of the run
        conf: configuration object
    """

    run_path = hiseq_run.find_hiseq_run_path(run_id, conf)
    if run_path is False:
        return

    end_time = hiseq_run.check_end_run_since(run_id, secs, conf)
    if not end_time > 0:
        return

    bytes_per_gb = 1024 * 1024 * 1024
    free_space = common.df(run_path) / bytes_per_gb
    used_space = common.du(run_path + '/' + run_id) / bytes_per_gb

    subject = '[Aozan] Ending run ' + run_id + ' on ' + common.get_instrument_name(run_id, conf)

    body = 'A new run (' + run_id + ') is finished on ' + common.get_instrument_name(run_id, conf)
    body += ' at ' + common.time_to_human_readable(end_time) + '.\n'
    body += 'Data for this run can be found at: ' + run_path
    body += '\n\nFor this task %.2f GB has been used and %.2f GB still free.' % (used_space, free_space)

    common.send_msg(subject, body, False, conf)
def error(run_id, type_file, space_needed, space_free, dir_path, conf):
    """Report that there is not enough disk space to store data of a run.

    Builds a short and a detailed message and forwards them to common.error()
    together with the path of the 'space_estimated.lasterr' file located in
    the Aozan var directory.

    Arguments:
        run_id: the run id
        type_file: type file concerned
        space_needed: space needed for the run for a type of data, in bytes
        space_free: space free for the run for a type of data, in bytes
        dir_path: directory path
        conf: configuration dictionary
    """

    # Float divisor: with integer operands the Python 2 division would
    # truncate the values and make the two decimals of the report meaningless
    bytes_per_gb = 1024.0 * 1024 * 1024

    short_message = "not enough disk space to store " + type_file + " for run " + run_id + ' on ' + \
                    common.get_instrument_name(run_id, conf)

    message = type_file + ": not enough disk space to store files for run " + run_id + ' on ' + dir_path + '.'
    message += ' %.2f GB is needed by Aozan.' % (space_needed / bytes_per_gb)
    message += ' However only %.2f GB of free space is currently available on this storage.' % (
        space_free / bytes_per_gb)

    # send warning mail
    common.error('[Aozan] Estimation of space needed : ' + short_message, message,
                 conf[AOZAN_VAR_PATH_KEY] + '/space_estimated.lasterr', conf)
def send_report(run_id, conf):
    """Send a mail with the first base report.

    The mail describes the run (sequencer, lanes, reads, indexes, cycles and
    estimated run type) from the run information returned by
    hiseq_run.get_run_info(). The first base report file is attached to the
    mail when it is readable.

    Arguments:
        run_id: the run id
        conf: configuration dictionary

    Returns:
        False if the run information cannot be loaded, True once the mail
        has been sent
    """

    #
    # Retrieve features the current run in RunInfos.xml file
    #

    run_info = hiseq_run.get_run_info(run_id, conf)

    if run_info is None:
        return False

    # TODO ?? add check sample-sheet if demux step enable
    # add warning in report if useful

    cycles_mismatch = False
    reads_indexed_count = 0
    reads_not_indexed_count = 0
    cycles_count = 0
    cycles_per_read_not_indexed = 0

    for read in run_info.getReads():
        read_cycles = read.getNumberCycles()
        cycles_count += read_cycles

        if read.isIndexedRead():
            reads_indexed_count += 1
            continue

        reads_not_indexed_count += 1
        if cycles_per_read_not_indexed == 0:
            cycles_per_read_not_indexed = read_cycles

        # Check same cycles count for each reads not indexed. The flag is
        # never reset, so a mismatch on any read is kept until the end
        if cycles_per_read_not_indexed != read_cycles:
            cycles_mismatch = True

    # "N index" or "N indexes", shared by the run type and the description
    index_label = str(reads_indexed_count) + " index"
    if reads_indexed_count > 1:
        index_label += "es"

    # Identification type run according to data in RunInfos.xml : SR or PE
    if reads_not_indexed_count == 1:
        type_run_estimated = "SR-" + str(cycles_per_read_not_indexed) + " with " + index_label
    elif reads_not_indexed_count == 2:
        type_run_estimated = "PE-" + str(cycles_per_read_not_indexed) + " with " + index_label
    else:
        type_run_estimated = "Undetermined run type (" + str(reads_not_indexed_count) + " reads with " + \
                             index_label + ")"

    instrument_name = common.get_instrument_name(run_id, conf)

    description_run = "Informations about this run:\n"
    description_run += "\t- Sequencer: " + instrument_name + ".\n"
    description_run += "\t- " + str(run_info.getFlowCellLaneCount()) + " lanes with " + str(
        run_info.alignToPhix.size()) + " aligned to Phix.\n"
    description_run += "\t- " + str(reads_not_indexed_count) + " read"
    if reads_not_indexed_count > 1:
        description_run += "s"
    description_run += " and " + index_label + ".\n"

    if cycles_mismatch or cycles_per_read_not_indexed == 0:
        description_run += "\t- ERROR : cycles count per read different between reads (" + str(
            cycles_count) + " total cycles).\n"
    else:
        description_run += "\t- " + str(cycles_per_read_not_indexed) + " cycles per read (" + str(
            cycles_count) + " total cycles).\n"

    description_run += "\t- Estimated run type: " + type_run_estimated + ".\n"

    attachment_file = str(hiseq_run.find_hiseq_run_path(run_id, conf)) + '/' + run_id + '/' + \
        common.FIRST_BASE_REPORT_FILE

    # If the First base report file exists, send it by email
    if common.is_file_readable(attachment_file):

        message = 'You will find attached to this message the first base report for the run ' + \
                  run_id + '.\n\n' + description_run
        common.send_msg_with_attachment('[Aozan] First base report for the run ' + type_run_estimated + '  ' + run_id +
                                        ' on ' + instrument_name,
                                        message, attachment_file, False, conf)
    else:
        # Otherwise send the description of the run without attachment
        message = 'You will find below the parameters of the run ' + run_id + '.\n\n' + description_run
        common.send_msg('[Aozan] New run ' + type_run_estimated + ' ' + run_id + ' on ' +
                        instrument_name, message,
                        False, conf)

    return True
# Example #8
# 0
def demux(run_id, conf):
    """Run the demultiplexing step for a run.

    Checks the directories and the free disk space required by the step,
    loads, updates, checks and writes the samplesheet, runs bcl2fastq (with
    Docker or in standalone mode according to the configuration), copies the
    bcl2fastq log and the samplesheet in the FASTQ output directory, archives
    the demultiplexing statistics and the samplesheet, then sends a mail
    about the end of the step.

    Arguments:
        run_id: The run id
        conf: configuration dictionary

    Returns:
        True if the step is successful, False otherwise
    """

    start_time = time.time()
    common.log('INFO', 'Demux step: Starting', conf)

    reports_data_base_path = conf[REPORTS_DATA_PATH_KEY]
    reports_data_path = common.get_report_run_data_path(run_id, conf)

    samplesheet_filename = build_samplesheet_filename(run_id, conf)
    bcl2fastq_samplesheet_path = conf[TMP_PATH_KEY] + '/' + samplesheet_filename + '.csv'

    input_run_data_path = common.get_input_run_data_path(run_id, conf)

    if input_run_data_path is None:
        return False

    fastq_output_dir = conf[FASTQ_DATA_PATH_KEY] + '/' + run_id

    basecall_stats_prefix = 'basecall_stats_'
    basecall_stats_file = basecall_stats_prefix + run_id + '.tar.bz2'

    # Check if root input bcl data directory exists
    if not os.path.exists(input_run_data_path):
        error("Basecalling data directory does not exist",
              "Basecalling data directory does not exist: " + str(input_run_data_path), conf)
        # NOTE(review): the step is not aborted here (the original "return False"
        # was commented out) - confirm that continuing is intended

    # Check if root input fastq data directory exists
    if not common.is_dir_exists(FASTQ_DATA_PATH_KEY, conf):
        error("FASTQ data directory does not exist",
              "FASTQ data directory does not exist: " + conf[FASTQ_DATA_PATH_KEY], conf)
        return False

    # Check if bcl2fastq samplesheets path exists
    if not common.is_dir_exists(BCL2FASTQ_SAMPLESHEETS_PATH_KEY, conf):
        error("Bcl2fastq samplesheet directory does not exist",
              "Bcl2fastq samplesheet directory does not exist: " + conf[BCL2FASTQ_SAMPLESHEETS_PATH_KEY], conf)
        return False

    # Check if bcl2fastq basedir path exists (not needed when Docker is used)
    if not common.is_conf_value_equals_true(BCL2FASTQ_USE_DOCKER_KEY, conf):
        if not common.is_dir_exists(BCL2FASTQ_PATH_KEY, conf):
            error("Bcl2fastq directory does not exist",
                  "Bcl2fastq directory does not exist: " + conf[BCL2FASTQ_PATH_KEY], conf)
            return False

    # Check if temporary directory exists
    if not common.is_dir_exists(TMP_PATH_KEY, conf):
        error("Temporary directory does not exist",
              "Temporary directory does not exist: " + conf[TMP_PATH_KEY], conf)
        return False

    # Check if reports_data_path exists
    if not os.path.exists(reports_data_base_path):
        error("Report directory does not exist",
              "Report directory does not exist: " + reports_data_base_path, conf)
        return False

    # Create if not exist report directory for the run
    if not os.path.exists(reports_data_path):
        os.mkdir(reports_data_path)

    # Check if basecall stats archive exists
    if os.path.exists(reports_data_path + '/' + basecall_stats_file):
        error('Basecall stats archive already exists for run ' + run_id,
              'Basecall stats archive already exists for run ' + run_id + ': ' + basecall_stats_file, conf)
        return False

    # Check if the output directory already exists
    if os.path.exists(fastq_output_dir):
        error("FASTQ output directory already exists for run " + run_id,
              'FASTQ output directory already exists for run ' + run_id + ': ' + fastq_output_dir, conf)
        return False

    # Compute disk usage and disk free to check if enough disk space is available
    input_path_du = common.du(input_run_data_path)
    output_df = common.df(conf[FASTQ_DATA_PATH_KEY])
    du_factor = float(conf[DEMUX_SPACE_FACTOR_KEY])
    space_needed = input_path_du * du_factor

    common.log("WARNING", "Demux step: input disk usage: " + str(input_path_du), conf)
    common.log("WARNING", "Demux step: output disk free: " + str(output_df), conf)
    common.log("WARNING", "Demux step: space needed: " + str(space_needed), conf)

    common.log("CONFIG", "Bcl2fastq Docker mode: " + str(
        common.is_conf_value_equals_true(Settings.BCL2FASTQ_USE_DOCKER_KEY, conf)), conf)

    # Check if free space is available
    if output_df < space_needed:
        error("Not enough disk space to perform demultiplexing for run " + run_id,
              "Not enough disk space to perform demultiplexing for run " + run_id +
              '.\n%.2f Gb' % (space_needed / 1024 / 1024 / 1024) + ' is needed (factor x' + str(
                  du_factor) + ') on ' + fastq_output_dir + '.', conf)
        return False

    # Load RunInfo object
    run_info = RunInfo.parse(input_run_data_path + '/RunInfo.xml')

    # Load samplesheet
    samplesheet, original_samplesheet_path = load_samplesheet(run_id, input_run_data_path, samplesheet_filename, conf)

    if samplesheet is None:
        return False

    # Update samplesheet
    if not update_samplesheet(samplesheet, run_id, run_info.getFlowCellLaneCount(), conf):
        return False

    # Check samplesheet
    check_result, samplesheet_warnings = check_samplesheet(samplesheet, run_id, run_info.getFlowCell(), conf)
    if not check_result:
        return False

    # Get the number of mismatches
    nb_mismatch = get_bcl2fastq_mismatches(samplesheet, conf[BCL2FASTQ_MISMATCHES_KEY])

    # Write final samplesheet
    if not write_bcl2fastq_samplesheet(samplesheet, bcl2fastq_samplesheet_path, conf):
        return False

    # Run demultiplexing
    if common.is_conf_value_equals_true(Settings.BCL2FASTQ_USE_DOCKER_KEY, conf):
        # With image docker
        if not demux_run_with_docker(run_id, input_run_data_path, fastq_output_dir, bcl2fastq_samplesheet_path,
                                     nb_mismatch, conf):
            return False
    else:
        if not demux_run_standalone(run_id, input_run_data_path, fastq_output_dir, bcl2fastq_samplesheet_path,
                                    nb_mismatch, conf):
            return False

    # Check if the output directory has been created
    if not os.path.exists(fastq_output_dir):
        error("Error while demultiplexing run " + run_id + ' on ' + common.get_instrument_name(run_id, conf),
              'Error while demultiplexing run ' + run_id + '.\n' +
              'The output directory of bcl2fastq has not been created: ' + fastq_output_dir, conf)
        return False

    # Check if the output directory is really a directory
    if os.path.isfile(fastq_output_dir):
        error("Error while demultiplexing run " + run_id + ' on ' + common.get_instrument_name(run_id, conf),
              'Error while demultiplexing run ' + run_id + '.\n' +
              'The output directory of bcl2fastq is a file instead of a directory: ' + fastq_output_dir, conf)
        return False

    # Suffix of the error subjects below, same format as in demux_run_standalone()
    run_id_msg = " for run " + run_id + ' on ' + common.get_instrument_name(run_id, conf)

    # Copy bcl2fastq log to output directory
    cmd = 'cp ' + quote(conf[TMP_PATH_KEY]) + '/bcl2fastq_output_' + run_id + '.* ' + quote(fastq_output_dir)
    common.log("INFO", "exec: " + cmd, conf)
    if os.system(cmd) != 0:
        error("Error while copying bcl2fastq log to the output FASTQ directory" + run_id_msg,
              'Error while copying bcl2fastq log to the output FASTQ directory.\nCommand line:\n' + cmd, conf)
        return False

    # The output directory must be read only
    if not common.chmod_files_in_dir(fastq_output_dir, ".fastq", conf):
        # No command line is involved here, so the directory is reported instead
        error("Error while setting the output FASTQ directory to read only" + run_id_msg,
              'Error while setting the output FASTQ directory to read only: ' + fastq_output_dir, conf)
        return False

    # Check if FASTQ files have been generated
    if not check_if_output_fastq_files_exists(fastq_output_dir):
        error("Error with bcl2fastq execution for run " + run_id,
              "Error with bcl2fastq execution for run " + run_id + " no FASTQ file found in " + fastq_output_dir,
              conf)
        return False

    # Copy samplesheet to output directory
    cmd = 'cp -p ' + quote(bcl2fastq_samplesheet_path) + ' ' + quote(fastq_output_dir + '/SampleSheet.csv')
    common.log("INFO", "exec: " + cmd, conf)
    if os.system(cmd) != 0:
        error("Error while copying samplesheet file to FASTQ directory for run " + run_id,
              'Error while copying samplesheet file to FASTQ directory.\nCommand line:\n' + cmd, conf)
        return False

    # Create archives on demultiplexing statistics
    if not archive_demux_stat(run_id, fastq_output_dir, reports_data_path, basecall_stats_file,
                              basecall_stats_prefix, bcl2fastq_samplesheet_path, conf):
        return False

    # Archive samplesheet
    if not archive_samplesheet(run_id, original_samplesheet_path, bcl2fastq_samplesheet_path, conf):
        return False

    # Remove temporary samplesheet files
    if os.path.exists(bcl2fastq_samplesheet_path):
        os.remove(bcl2fastq_samplesheet_path)

    # Create index.html file
    common.create_html_index_file(conf, run_id, [Settings.HISEQ_STEP_KEY, Settings.DEMUX_STEP_KEY])

    df_in_bytes = common.df(fastq_output_dir)
    du_in_bytes = common.du(fastq_output_dir)
    df = df_in_bytes / (1024 * 1024 * 1024)
    du = du_in_bytes / (1024 * 1024 * 1024)

    common.log("WARNING", "Demux step: output disk free after demux: " + str(df_in_bytes), conf)
    common.log("WARNING", "Demux step: space used by demux: " + str(du_in_bytes), conf)

    duration = time.time() - start_time

    # str() keeps the concatenation valid whatever the type of the mismatch value
    msg = 'Ending demultiplexing with ' + str(nb_mismatch) + ' mismatch(es) for run ' + run_id + '.' + \
          '\nJob finished at ' + common.time_to_human_readable(time.time()) + \
          ' without error in ' + common.duration_to_human_readable(duration) + '.\n\n' + \
          'FASTQ files for this run ' + \
          'can be found in the following directory:\n  ' + fastq_output_dir

    if samplesheet_warnings.size() > 0:
        msg += '\n\nSamplesheet warnings:'
        for warn in samplesheet_warnings:
            msg += "\n  - " + warn

    # Add path to report if reports.url exists
    if common.is_conf_key_exists(REPORTS_URL_KEY, conf):
        msg += '\n\nRun reports can be found at following location:\n  ' + conf[REPORTS_URL_KEY] + '/' + run_id

    msg += '\n\nFor this task %.2f GB has been used and %.2f GB still free.' % (du, df)

    common.send_msg('[Aozan] Ending demultiplexing for run ' + run_id + ' on ' +
                    common.get_instrument_name(run_id, conf), msg, False, conf)
    common.log('INFO', 'Demux step: successful in ' + common.duration_to_human_readable(duration), conf)

    return True
# Example #9
# 0
def demux_run_with_docker(run_id, input_run_data_path, fastq_output_dir, samplesheet_csv_path, nb_mismatch, conf):
    """Demultiplex a run with bcl2fastq executed in a Docker container.

    The input run directory, the parent directory of the FASTQ output
    directory and the temporary directory are mounted in the container as
    /data/input, /data/output and /tmp.

    Arguments:
        run_id: The run id
        input_run_data_path: input run data path to demultiplexing
        fastq_output_dir: fastq directory to save result on demultiplexing
        samplesheet_csv_path: samplesheet path in csv format, version used by bcl2fastq
        nb_mismatch: mismatch setting forwarded to the command line builder
        conf: configuration dictionary

    Returns:
        True if the container exits with code 0, False otherwise
    """

    # In docker mount with input_run_data_path
    input_docker = '/data/input'
    input_run_data_path_in_docker = input_docker
    run_id_msg = " for run " + run_id + ' on ' + common.get_instrument_name(run_id, conf)

    # In docker mount with fastq_output_dir
    output_docker = '/data/output'
    fastq_data_path_in_docker = output_docker + '/' + os.path.basename(fastq_output_dir)

    tmp = conf[TMP_PATH_KEY]
    tmp_docker = '/tmp'

    bcl2fastq_log_file = tmp + "/bcl2fastq_output_" + run_id + ".err"
    samplesheet_csv_docker = tmp_docker + '/' + os.path.basename(samplesheet_csv_path)

    cmd = create_bcl2fastq_command_line(run_id, None, input_run_data_path_in_docker, fastq_data_path_in_docker,
                                        samplesheet_csv_docker, tmp_docker, nb_mismatch, conf)

    try:
        # Set working in docker on parent demultiplexing run directory.
        # Demultiplexing run directory will be created by bcl2fastq
        docker = DockerCommand(conf[Settings.DOCKER_URI_KEY], ['/bin/bash', '-c', cmd], 'bcl2fastq2',
                               common.BCL2FASTQ2_VERSION)

        common.log("CONFIG", "Demultiplexing using docker image from " + docker.getImageDockerName() +
                   " with command line " + cmd, conf)

        common.log("CONFIG", "Bcl2fastq docker mount: " +
                   str(os.path.dirname(fastq_output_dir)) + ":" + str(output_docker) + "; " +
                   input_run_data_path + ":" + input_docker + "; " + tmp + ":" + tmp_docker, conf)

        # Mount input, output and temporary directories
        docker.addMountDirectory(input_run_data_path, input_docker)
        docker.addMountDirectory(os.path.dirname(fastq_output_dir), output_docker)
        docker.addMountDirectory(tmp, tmp_docker)

        docker.run()
        exit_code = docker.getExitValue()

        if exit_code != 0:
            error("Error while demultiplexing run " + run_id, 'Error while demultiplexing run (exit code: ' +
                  str(exit_code) + ').\nCommand line:\n' + cmd, conf)

            msg = 'Error while executing bcl2fastq ' + run_id_msg + ' (exit code: ' + str(
                  exit_code) + ').\nCommand line:\n' + cmd

            # Check if the log file has been generated
            if not os.path.exists(bcl2fastq_log_file):
                error("Error with bcl2fastq log for run " + run_id + ".",
                      "No bcl2fastq log available " + bcl2fastq_log_file, conf)
                common.send_msg('[Aozan] Failed demultiplexing ' + run_id_msg, msg, True, conf)
            else:
                msg += "\n\nPlease check the attached bcl2fastq output error file."
                common.send_msg_with_attachment('[Aozan] Failed demultiplexing ' + run_id_msg, msg,
                                                bcl2fastq_log_file, True, conf)

            return False

    except Throwable as exp:
        error("Error while running Docker image", common.exception_msg(exp, conf), conf)
        return False

    # Success must be reported explicitly: the caller tests the result with "if not ..."
    return True
# Example #10
# 0
def sync(run_id, conf):
    """Run the synchronization step for a run and send a mail when it ends.

    Arguments:
        run_id: the run id
        conf: configuration dictionary

    Returns:
        True if the step is successful, False otherwise
    """

    started_at = time.time()
    common.log('INFO', 'Sync step: Starting', conf)

    bcl_root = conf[BCL_DATA_PATH_KEY]
    reports_root = conf[REPORTS_DATA_PATH_KEY]
    run_output_dir = bcl_root + '/' + run_id
    partial_output_dir = run_output_dir + '.tmp'

    bytes_per_gb = 1024 * 1024 * 1024

    # rsync must be available in PATH
    rsync_found = common.exists_in_path("rsync")
    if not rsync_found:
        error("Can't find all needed commands in PATH env var",
              "Can't find all needed commands in PATH env var. Unable to find: rsync command.", conf)
        return False

    # The base directory of the reports must exist
    if not os.path.exists(reports_root):
        error("Report directory does not exist", "Report directory does not exist: " + reports_root, conf)
        return False

    # At least 10 GB must be free to store the reports
    if common.df(reports_root) < 10 * bytes_per_gb:
        not_enough_space = "Not enough disk space to store aozan reports for run " + run_id
        error(not_enough_space,
              not_enough_space + '.\nNeed more than 10 Gb on ' + reports_root + '.', conf)
        return False

    # Do the synchronization
    if not partial_sync(run_id, True, conf):
        return False

    # The partial sync directory becomes the final run BCL directory
    if os.path.exists(partial_output_dir):
        os.rename(partial_output_dir, run_output_dir)

    # Used and free space after the synchronization
    free_in_bytes = common.df(bcl_root)
    used_in_bytes = common.du(run_output_dir)
    free_gb = free_in_bytes / bytes_per_gb
    used_gb = used_in_bytes / bytes_per_gb

    common.log("WARNING", "Sync step: output disk free after sync: " + str(free_in_bytes), conf)
    common.log("WARNING", "Sync step: space used by sync: " + str(used_in_bytes), conf)

    duration = time.time() - started_at

    mail_body = 'Ending synchronization for run ' + run_id + '.\n'
    mail_body += 'Job finished at ' + common.time_to_human_readable(time.time())
    mail_body += ' without error in ' + common.duration_to_human_readable(duration) + '.\n\n'
    mail_body += 'Run output files (without .cif files) can be found in the following directory:\n  '
    mail_body += run_output_dir

    # Add path to report if reports.url exists
    if common.is_conf_key_exists(REPORTS_URL_KEY, conf):
        mail_body += '\n\nRun reports can be found at following location:\n  ' + conf[REPORTS_URL_KEY] + '/' + run_id

    mail_body += '\n\nFor this task %.2f GB has been used and %.2f GB is still free.' % (used_gb, free_gb)

    mail_subject = '[Aozan] Ending synchronization for run ' + run_id + ' on ' + \
                   common.get_instrument_name(run_id, conf)
    common.send_msg(mail_subject, mail_body, False, conf)

    common.log('INFO', 'sync step: successful in ' + common.duration_to_human_readable(duration), conf)
    return True
# Example #11
# 0
    sessions = [Settings.HISEQ_STEP_KEY, Settings.DEMUX_STEP_KEY, Settings.QC_STEP_KEY]
    common.create_html_index_file(conf, run_id, sessions)

    df_in_bytes = common.df(qc_output_dir)
    du_in_bytes = common.du(qc_output_dir)
    df = df_in_bytes / (1024 * 1024 * 1024)
    du = du_in_bytes / (1024 * 1024)

    common.log("WARNING", "QC step: output disk free after QC: " + str(df_in_bytes), conf)
    common.log("WARNING", "QC step: space used by QC: " + str(du_in_bytes), conf)

    duration = time.time() - start_time

    msg = 'Ending quality control for run ' + run_id + '.' + \
          '\nJob finished at ' + common.time_to_human_readable(time.time()) + \
          ' without error in ' + common.duration_to_human_readable(duration) + '. ' + \
          'You will find attached to this message the quality control report.\n\n' + \
          'QC files for this run ' + \
          'can be found in the following directory:\n  ' + qc_output_dir

    # Add path to report if reports.url exists
    if common.is_conf_key_exists(REPORTS_URL_KEY, conf):
        msg += '\n\nRun reports can be found at following location:\n  ' + conf[REPORTS_URL_KEY] + '/' + run_id

    msg += '\n\nFor this task %.2f MB has been used and %.2f GB still free.' % (du, df)

    common.send_msg_with_attachment('[Aozan] Ending quality control for run ' + run_id + ' on ' +
                                    common.get_instrument_name(run_id, conf), msg, html_report_file, False, conf)
    common.log('INFO', 'QC step: successful in ' + common.duration_to_human_readable(duration), conf)
    return True
# Example #12
# 0
def recompress(run_id, conf):
    """Recompress the FASTQ files of a run with the configured compression.

    Every "fastq.gz" and "fastq" file listed in the FASTQ directory of the
    run is handed to a Worker executed in a fixed-size thread pool, unless
    its output file already exists. A mail with the disk usage before and
    after the step is sent at the end.

    Arguments:
        run_id: The run id
        conf: configuration dictionary

    Returns:
        True if the step is successful, False otherwise
    """

    common.log('INFO', 'Recompress step: Starting', conf)

    # The root directory of the FASTQ data must exist
    if not common.is_dir_exists(FASTQ_DATA_PATH_KEY, conf):
        error("FASTQ data directory does not exist",
              "FASTQ data directory does not exist: " + conf[FASTQ_DATA_PATH_KEY], conf)
        return False

    started_at = time.time()
    run_fastq_dir = conf[FASTQ_DATA_PATH_KEY] + '/' + run_id

    # Disk usage before the step, compared with the final disk usage in the mail
    du_before_in_bytes = common.du(run_fastq_dir)

    # Information about the requested output compression
    target_type = conf[RECOMPRESS_COMPRESSION_KEY]
    target_level = conf[RECOMPRESS_COMPRESSION_LEVEL_KEY]
    target_info = get_info_from_file_type(target_type, target_level)

    if target_info is None:
        error("Unknown compression type",
              "Unknown compression type: " + target_type, conf)
        return False

    (unused_type_result, target_extension, compress_cmd, decompress_cmd, level_argument) = target_info

    # Programs that must be found in PATH before starting
    required_programs = {"bash", "tee", "touch", "chmod", "md5sum", compress_cmd, decompress_cmd}

    # Collect the files to process and the decompression program of each input type
    files_to_process = []
    for extension in ("fastq.gz", "fastq"):

        files_to_process.extend(list_files(run_fastq_dir, extension))
        type_info = get_info_from_file_type(os.path.splitext(extension)[-1][1:])

        if type_info is None:
            error("Unknown extension type",
                  "Unknown extension type: " + extension, conf)
            return False

        required_programs.add(type_info[3])

    # Stop at the first program that cannot be found
    for program in required_programs:
        if common.exists_in_path(program):
            continue
        error("Can't find all needed commands in PATH env var",
              "Can't find all needed commands in PATH env var. Unable to find: " + program + " command.", conf)
        return False

    # Thread pool used to process the files in parallel
    executor = Executors.newFixedThreadPool(int(conf[RECOMPRESS_THREADS_KEY]))
    workers = []

    fastq_suffix = ".fastq"

    for input_file in files_to_process:

        file_extension = os.path.splitext(input_file)[-1][1:]

        # Information about the type of the input file
        file_info = get_info_from_file_type(file_extension)
        if file_info is None:
            error("Unknown extension type",
                  "Unknown extension type: " + file_extension, conf)
            return False

        input_decompress_cmd = file_info[3]

        # The output name is the input name cut after ".fastq" followed by the new extension
        base_name = input_file[:input_file.index(fastq_suffix) + len(fastq_suffix)]
        output_file = base_name + "." + target_extension

        # A file whose output already exists is not processed again
        if os.path.exists(output_file):
            common.log("WARNING", "Recompress step: Omitting processing file " + input_file + ". The associated output file " + output_file + " already exists.", conf)
            continue

        # Create the worker, then execute it in a thread of the pool
        delete_original = common.is_conf_value_equals_true(RECOMPRESS_DELETE_ORIGINAL_FASTQ_KEY, conf)
        worker = Worker(input_file, output_file, input_decompress_cmd, compress_cmd, decompress_cmd,
                        level_argument, delete_original)
        workers.append(worker)
        executor.execute(worker)

    # Wait until every submitted worker has ended
    executor.shutdown()
    while not executor.isTerminated():
        time.sleep(1)

    # The step fails on the first worker in error
    for worker in workers:
        if worker.is_successful():
            continue
        error(worker.get_error_message(),
              worker.get_long_error_message(), conf)
        return False

    # Disk usage after the step
    free_in_bytes = common.df(run_fastq_dir)
    used_in_bytes = common.du(run_fastq_dir)

    bytes_per_mb = 1024 * 1024
    used_before_mb = du_before_in_bytes / bytes_per_mb
    free_gb = free_in_bytes / (1024 * 1024 * 1024)
    used_mb = used_in_bytes / bytes_per_mb

    common.log("WARNING", "Recompress step: output disk free after step: " + str(free_in_bytes), conf)
    common.log("WARNING", "Recompress step: space previously used: " + str(du_before_in_bytes), conf)
    common.log("WARNING", "Recompress step: space now used by step: " + str(used_in_bytes), conf)

    duration = time.time() - started_at

    mail_body = 'Ending recompression for run ' + run_id + '.'
    mail_body += '\nJob finished at ' + common.time_to_human_readable(time.time())
    mail_body += ' without error in ' + common.duration_to_human_readable(duration) + '. '
    mail_body += '\n\nAfter recompress step FASTQ folder is now %.2f MB (previously %.2f MB) and %.2f GB still free.' % (
        used_mb, used_before_mb, free_gb)

    mail_subject = '[Aozan] Ending recompress for run ' + run_id + ' on ' + \
                   common.get_instrument_name(run_id, conf)
    common.send_msg(mail_subject, mail_body, False, conf)

    common.log('INFO', 'Recompress step: successful in ' + common.duration_to_human_readable(duration), conf)
    return True