Пример #1
0
def _get_input_files(samples, base_dir, tx_out_dir):
    """Retrieve input files, keyed by sample and QC method name.

    Stages files into the work directory to ensure correct names for
    MultiQC sample assessment when running with CWL.

    Args:
        samples: sample data dictionaries; QC outputs are read from each
            one's ``summary -> qc`` entry.
        base_dir: directory under which the ``inputs/<sample>/<program>``
            staging directories are created and where
            ``report/metrics/target_info.yaml`` is looked up.
        tx_out_dir: not used by this function; kept in the signature so
            existing callers keep working.

    Returns:
        Sorted list of unique input paths (and, for non-CWL runs, the
        back-compatible ``trimmed`` and ``htseq-count/*summary`` entries).

    Raises:
        ValueError: if a sample's ``summary -> qc`` value is not None,
            "None", a string or a dict.
    """
    in_files = collections.defaultdict(list)
    for data in samples:
        sum_qc = tz.get_in(["summary", "qc"], data, {})
        if sum_qc in [None, "None"]:
            sum_qc = {}
        elif isinstance(sum_qc, six.string_types):
            # A bare string is filed under the first entry returned by
            # dd.get_algorithm_qc for this sample.
            sum_qc = {dd.get_algorithm_qc(data)[0]: sum_qc}
        elif not isinstance(sum_qc, dict):
            raise ValueError("Unexpected summary qc: %s" % sum_qc)
        for program, pfiles in sum_qc.items():
            if isinstance(pfiles, dict):
                # A "secondary" key that is missing or set to None means
                # there are no secondary files.
                pfiles = [pfiles["base"]] + (pfiles.get("secondary") or [])
            # CWL: presents output files as single file plus associated secondary files
            elif isinstance(pfiles, six.string_types):
                if os.path.exists(pfiles):
                    # Collect every file below the directory that holds
                    # the reported file.
                    pfiles = [
                        os.path.join(basedir, fname)
                        for basedir, _, filenames in os.walk(
                            os.path.dirname(pfiles))
                        for fname in filenames
                    ]
                else:
                    pfiles = []
            in_files[(dd.get_sample_name(data), program)].extend(pfiles)

    # The CWL check depends only on ``samples``: evaluate it once instead
    # of once per candidate file.
    is_cwl = any(cwlutils.is_cwl_run(d) for d in samples)

    staged_files = []
    for (sample, program), files in in_files.items():
        cur_dir = utils.safe_makedir(
            os.path.join(base_dir, "inputs", sample, program))
        for f in files:
            if not (_check_multiqc_input(f) and _is_good_file_for_multiqc(f)):
                continue
            if _in_temp_directory(f) or is_cwl:
                # Copy into the per-sample, per-program staging directory.
                staged_f = os.path.join(cur_dir, os.path.basename(f))
                shutil.copy(f, staged_f)
                staged_files.append(staged_f)
            else:
                staged_files.append(f)
    staged_files.extend(get_qsig_multiqc_files(samples))
    # Back compatible -- to migrate to explicit specifications in input YAML
    if not is_cwl:
        staged_files += ["trimmed", "htseq-count/*summary"]
        # Add in created target_info file
        target_info = os.path.join(
            base_dir, "report", "metrics", "target_info.yaml")
        if os.path.isfile(target_info):
            staged_files.append(target_info)
    return sorted(set(staged_files))
Пример #2
0
def _get_input_files(samples, base_dir, tx_out_dir):
    """Retrieve input files, keyed by sample and QC method name.

    Stages files into the work directory to ensure correct names for
    MultiQC sample assessment when running with CWL.

    Args:
        samples: sample data dictionaries; QC outputs are read from each one's ``summary -> qc`` entry.
        base_dir: directory under which the ``inputs/<sample>/<program>`` staging directories are
            created and where ``report/metrics/target_info.yaml`` is looked up.
        tx_out_dir: not used by this function; kept in the signature so existing callers keep working.

    Returns:
        Sorted list of unique input paths (and, for non-CWL runs, the back-compatible
        ``trimmed`` and ``htseq-count/*summary`` entries).

    Raises:
        ValueError: if a sample's ``summary -> qc`` value is not None, "None", a string or a dict.
    """
    in_files = collections.defaultdict(list)
    for data in samples:
        sum_qc = tz.get_in(["summary", "qc"], data, {})
        if sum_qc in [None, "None"]:
            sum_qc = {}
        elif isinstance(sum_qc, six.string_types):
            # A bare string is filed under the first entry returned by dd.get_algorithm_qc for this sample.
            sum_qc = {dd.get_algorithm_qc(data)[0]: sum_qc}
        elif not isinstance(sum_qc, dict):
            raise ValueError("Unexpected summary qc: %s" % sum_qc)
        for program, pfiles in sum_qc.items():
            if isinstance(pfiles, dict):
                # A "secondary" key that is missing or set to None means there are no secondary files.
                pfiles = [pfiles["base"]] + (pfiles.get("secondary") or [])
            # CWL: presents output files as single file plus associated secondary files
            elif isinstance(pfiles, six.string_types):
                if os.path.exists(pfiles):
                    # Collect every file below the directory that holds the reported file.
                    qc_dir = os.path.dirname(pfiles)
                    pfiles = [os.path.join(basedir, fname) for basedir, _, filenames in os.walk(qc_dir) for fname in filenames]
                else:
                    pfiles = []
            in_files[(dd.get_sample_name(data), program)].extend(pfiles)

    # The CWL check depends only on ``samples``: evaluate it once instead of once per candidate file.
    is_cwl = any(cwlutils.is_cwl_run(d) for d in samples)

    staged_files = []
    for (sample, program), files in in_files.items():
        cur_dir = utils.safe_makedir(os.path.join(base_dir, "inputs", sample, program))
        for f in files:
            if not (_check_multiqc_input(f) and _is_good_file_for_multiqc(f)):
                continue
            if _in_temp_directory(f) or is_cwl:
                # Copy into the per-sample, per-program staging directory.
                staged_f = os.path.join(cur_dir, os.path.basename(f))
                shutil.copy(f, staged_f)
                staged_files.append(staged_f)
            else:
                staged_files.append(f)
    staged_files.extend(get_qsig_multiqc_files(samples))
    # Back compatible -- to migrate to explicit specifications in input YAML
    if not is_cwl:
        staged_files += ["trimmed", "htseq-count/*summary"]
        # Add in created target_info file
        target_info = os.path.join(base_dir, "report", "metrics", "target_info.yaml")
        if os.path.isfile(target_info):
            staged_files.append(target_info)
    return sorted(set(staged_files))