Example #1
def samples_to_records(samples, default_keys=None):
    """Convert samples into output CWL records.
    """
    from bcbio.pipeline import run_info
    RECORD_CONVERT_TO_LIST = set(["config__algorithm__tools_on", "config__algorithm__tools_off",
                                  "reference__genome_context"])
    all_keys = _get_all_cwlkeys(samples, default_keys)
    out = []
    for data in samples:
        for raw_key in sorted(list(all_keys)):
            key = raw_key.split("__")
            if tz.get_in(key, data) is None:
                data = tz.update_in(data, key, lambda x: None)
            if raw_key not in data["cwl_keys"]:
                data["cwl_keys"].append(raw_key)
            if raw_key in RECORD_CONVERT_TO_LIST:
                val = tz.get_in(key, data)
                if not val: val = []
                elif not isinstance(val, (list, tuple)): val = [val]
                data = tz.update_in(data, key, lambda x: val)
            # Booleans are problematic for CWL serialization, convert into string representation
            if isinstance(tz.get_in(key, data), bool):
                data = tz.update_in(data, key, lambda x: str(tz.get_in(key, data)))
        data["metadata"] = run_info.add_metadata_defaults(data.get("metadata", {}))
        out.append(data)
    return out
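
A note on the core primitive, since every example on this page leans on it: toolz.update_in(d, keys, func, default=None) returns an updated copy of d with func applied eagerly to the value at the nested path keys, creating intermediate dictionaries for missing paths, while toolz.get_in(keys, coll) reads the same paths. A minimal standalone sketch (dictionary contents are hypothetical):

import toolz as tz

data = {"config": {"algorithm": {"tools_on": "svplots"}}}

# update_in returns an updated copy; the input dictionary is left unmodified
wrapped = tz.update_in(data, ["config", "algorithm", "tools_on"],
                       lambda v: v if isinstance(v, (list, tuple)) else [v])
assert wrapped["config"]["algorithm"]["tools_on"] == ["svplots"]
assert data["config"]["algorithm"]["tools_on"] == "svplots"

# missing paths are created, with the function receiving `default` (None here)
filled = tz.update_in(data, ["reference", "genome_context"], lambda v: v)
assert filled["reference"]["genome_context"] is None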
Example #2
def _merge_hla_fastq_inputs(data):
    """Merge HLA inputs from a split initial alignment.
    """
    hla_key = ["hla", "fastq"]
    hla_sample_files = [x for x in tz.get_in(hla_key, data, []) if x and x != "None"]
    if hla_sample_files:
        out_files = collections.defaultdict(list)
        for hla_files in hla_sample_files:
            for hla_file in hla_files:
                rehla = re.search(r".hla.(?P<hlatype>[\w-]+).fq", hla_file)
                if rehla:
                    hlatype = rehla.group("hlatype")
                    out_files[hlatype].append(hla_file)
        merged_hlas = None
        if len(out_files) > 0:
            hla_outdir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                         dd.get_sample_name(data), "hla"))
            merged_hlas = []
            for hlatype, files in out_files.items():
                out_file = os.path.join(hla_outdir, "%s-%s.fq" % (dd.get_sample_name(data), hlatype))
                optitype.combine_hla_fqs([(hlatype, f) for f in files], out_file, data)
                merged_hlas.append(out_file)
        # initialized above so merged_hlas is bound even when no HLA types matched
        data = tz.update_in(data, hla_key, lambda x: merged_hlas)
    else:
        data = tz.update_in(data, hla_key, lambda x: None)
    return data
Example #3
def batch_for_variantcall(samples):
    """Prepare a set of samples for parallel variant calling.

    CWL input target that groups samples into batches and variant callers
    for parallel processing.
    """
    convert_to_list = set(["config__algorithm__tools_on", "config__algorithm__tools_off"])
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    batch_groups = collections.defaultdict(list)
    to_process = [utils.to_single_data(x) for x in to_process]
    all_keys = set([])
    for data in to_process:
        all_keys.update(set(data["cwl_keys"]))
    for data in to_process:
        for raw_key in sorted(list(all_keys)):
            key = raw_key.split("__")
            if tz.get_in(key, data) is None:
                data = tz.update_in(data, key, lambda x: None)
                data["cwl_keys"].append(raw_key)
            if raw_key in convert_to_list:
                val = tz.get_in(key, data)
                if not val: val = []
                elif not isinstance(val, (list, tuple)): val = [val]
                data = tz.update_in(data, key, lambda x: val)
        vc = get_variantcaller(data, require_bam=False)
        batches = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(batches, (list, tuple)):
            batches = [batches]
        for b in batches:
            batch_groups[(b, vc)].append(utils.deepish_copy(data))
    return list(batch_groups.values()) + extras
Example #4
def samples_to_records(samples):
    """Convert samples into output CWL records.
    """
    from bcbio.pipeline import run_info
    RECORD_CONVERT_TO_LIST = set([
        "config__algorithm__tools_on", "config__algorithm__tools_off",
        "config__algorithm__svcaller"
    ])
    all_keys = _get_all_cwlkeys(samples)
    out = []
    for data in samples:
        for raw_key in sorted(list(all_keys)):
            key = raw_key.split("__")
            if tz.get_in(key, data) is None:
                data = tz.update_in(data, key, lambda x: None)
                data["cwl_keys"].append(raw_key)
            if raw_key in RECORD_CONVERT_TO_LIST:
                val = tz.get_in(key, data)
                if not val: val = []
                elif not isinstance(val, (list, tuple)): val = [val]
                data = tz.update_in(data, key, lambda x: val)
        data["metadata"] = run_info.add_metadata_defaults(
            data.get("metadata", {}))
        out.append(data)
    return out
Example #5
def _place_secondary_files(inp_tool, inp_binding):
    """Put secondaryFiles at the level of the File item to ensure indexes get passed.

    This involves using a second input binding to get the secondaryFiles, which
    we ignore downstream. Ideally we could use `valueFrom: null` but that doesn't
    seem to work right now.
    """
    secondary_files = inp_tool.pop("secondaryFiles", None)
    if secondary_files:
        key = []
        while tz.get_in(key + ["type"], inp_tool) != "File" and tz.get_in(
                key + ["items"], inp_tool) != "File":
            key.append("type")
        secondary_key = key + ["inputBinding"]
        if tz.get_in(secondary_key, inp_tool):
            inp_tool = tz.update_in(inp_tool,
                                    secondary_key + ["secondaryFiles"],
                                    lambda x: secondary_files)
        else:
            nested_inp_binding = copy.deepcopy(inp_binding)
            nested_inp_binding["prefix"] = "ignore="
            nested_inp_binding["secondaryFiles"] = secondary_files
            inp_tool = tz.update_in(inp_tool, secondary_key,
                                    lambda x: nested_inp_binding)
    return inp_tool
Example #6
def _merge_align_bams(data):
    """Merge multiple alignment BAMs, including split and discordant reads.
    """
    for key in (["work_bam"], ["work_bam_plus",
                               "disc"], ["work_bam_plus", "sr"], ["umi_bam"]):
        in_files = tz.get_in(key, data, [])
        if not isinstance(in_files, (list, tuple)):
            in_files = [in_files]
        in_files = [x for x in in_files if x and x != "None"]
        if in_files:
            ext = "-%s" % key[-1] if len(key) > 1 else ""
            out_file = os.path.join(
                dd.get_work_dir(data), "align", dd.get_sample_name(data),
                "%s-sort%s.bam" % (dd.get_sample_name(data), ext))
            merged_file = merge_bam_files(in_files,
                                          utils.safe_makedir(
                                              os.path.dirname(out_file)),
                                          data,
                                          out_file=out_file)
            data = tz.update_in(data, key, lambda x: merged_file)
        else:
            data = tz.update_in(data, key, lambda x: None)
    if "align_bam" in data and "work_bam" in data:
        data["align_bam"] = data["work_bam"]
    return data
Example #7
def run_and_save(data):
    """Run QC, saving file outputs in data dictionary.
    """
    run(None, data)
    stats_file, idxstats_file = _get_stats_files(data)
    data = tz.update_in(data, ["depth", "samtools", "stats"], lambda x: stats_file)
    data = tz.update_in(data, ["depth", "samtools", "idxstats"], lambda x: idxstats_file)
    return data
Example #8
    def _compare_dicts(self, orig, new, ns):
        out = {}
        for key, val in new.items():
            nskey = ns + [key]
            orig_val = tz.get_in([key], orig)
            if isinstance(val, dict) and isinstance(orig_val, dict):
                for nkey, nval in self._compare_dicts(orig_val or {}, val or {}, nskey).items():
                    out = tz.update_in(out, [nkey], lambda x: nval)
            elif val != orig_val:
                print(nskey, val, orig_val)
                out = tz.update_in(out, nskey, lambda x: val)
        return out
Example #9
def assign_complex_to_samples(items):
    """Assign complex inputs like variants and align outputs to samples.

    Handles list inputs to record conversion where we have inputs from multiple
    locations and need to ensure they are properly assigned to samples in many
    environments.
    """
    extract_fns = {
        ("variants", "samples"): _get_vcf_samples,
        ("align_bam", ): _get_bam_samples
    }
    complex = {k: {} for k in extract_fns.keys()}
    for data in items:
        for k in complex:
            v = tz.get_in(k, data)
            if v is not None:
                for s in extract_fns[k](v):
                    if s:
                        complex[k][s] = v
    out = []
    for data in items:
        for k in complex:
            newv = tz.get_in([k, dd.get_sample_name(data)], complex)
            if newv:
                data = tz.update_in(data, k, lambda x: newv)
        out.append(data)
    return out
Example #10
def _place_secondary_files(inp_tool, inp_binding):
    """Put secondaryFiles at the level of the File item to ensure indexes get passed.
    """
    secondary_files = inp_tool.pop("secondaryFiles", None)
    if secondary_files:
        key = []
        while tz.get_in(key + ["type"], inp_tool) != "File" and tz.get_in(key + ["items"], inp_tool) != "File":
            key.append("type")
        if tz.get_in(key, inp_tool):
            inp_tool = tz.update_in(inp_tool, key + ["secondaryFiles"], lambda x: secondary_files)
        else:
            nested_inp_binding = copy.deepcopy(inp_binding)
            nested_inp_binding["prefix"] = "ignore="
            nested_inp_binding["secondaryFiles"] = secondary_files
            inp_tool = tz.update_in(inp_tool, key, lambda x: nested_inp_binding)
    return inp_tool
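
The while loop above descends into nested CWL type definitions by growing the key path one "type" level at a time until a File entry appears. A small sketch of that descent with a hypothetical array input:

import toolz as tz

inp_tool = {"id": "input_bam", "type": {"type": "array", "items": "File"}}
key = []
while tz.get_in(key + ["type"], inp_tool) != "File" and tz.get_in(key + ["items"], inp_tool) != "File":
    key.append("type")
# one descent step: the File item sits at inp_tool["type"]["items"]
assert key == ["type"]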
Example #11
def vc_output_record(samples):
    """Prepare output record from variant calling to feed into downstream analysis.

    Prep work handles reformatting so we return generated dictionaries.

    For any shared keys that are calculated only once for a batch, like variant calls
    for the batch, we assign to every sample.
    """
    shared_keys = [["vrn_file"], ["validate", "summary"],
                   ["validate", "tp"], ["validate", "fp"], ["validate", "fn"]]
    raw = cwlutils.samples_to_records([utils.to_single_data(x) for x in samples])
    shared = {}
    for key in shared_keys:
        cur = list(set([x for x in [tz.get_in(key, d) for d in raw] if x]))
        if len(cur) > 0:
            assert len(cur) == 1, (key, cur)
            shared[tuple(key)] = cur[0]
        else:
            shared[tuple(key)] = None
    out = []
    for d in raw:
        for key, val in shared.items():
            d = tz.update_in(d, key, lambda x: val)
        out.append([d])
    return out
Example #12
def _fill_capture_regions(data):
    """Fill short-hand specification of BED capture regions.
    """
    special_targets = {"sv_regions": ("exons", "transcripts")}
    ref_file = dd.get_ref_file(data)
    for target in ["variant_regions", "sv_regions", "coverage"]:
        val = tz.get_in(["config", "algorithm", target], data)
        if val and not os.path.exists(val) and not objectstore.is_remote(val):
            installed_vals = []
            # Check prioritize directory
            for ext in [".bed", ".bed.gz"]:
                installed_vals += glob.glob(
                    os.path.normpath(
                        os.path.join(os.path.dirname(ref_file), os.pardir,
                                     "coverage", val + ext)))
            if len(installed_vals) == 0:
                if target not in special_targets or not val.startswith(
                        special_targets[target]):
                    raise ValueError(
                        "Configuration problem. BED file not found for %s: %s"
                        % (target, val))
            else:
                assert len(installed_vals) == 1, installed_vals
                data = tz.update_in(data, ["config", "algorithm", target],
                                    lambda x: installed_vals[0])
    return data
Example #13
def _fill_prioritization_targets(data):
    """Fill in globally installed files for prioritization.
    """
    ref_file = dd.get_ref_file(data)
    for target in [["svprioritize"]]:
        val = tz.get_in(["config", "algorithm"] + target, data)
        if val and not os.path.exists(val):
            installed_vals = glob.glob(
                os.path.normpath(
                    os.path.join(os.path.dirname(ref_file), os.pardir,
                                 "coverage", "prioritize", val + "*.bed.gz")))
            if len(installed_vals) == 0:
                raise ValueError(
                    "Configuration problem. Prioritization file not found for %s: %s"
                    % (target, val))
            elif len(installed_vals) == 1:
                installed_val = installed_vals[0]
            else:
                # check for partial matches
                installed_val = None
                for v in installed_vals:
                    if v.endswith(val + ".bed.gz"):
                        installed_val = v
                        break
                # handle date-stamped inputs
                if not installed_val:
                    installed_val = sorted(installed_vals, reverse=True)[0]
            data = tz.update_in(data, ["config", "algorithm"] + target,
                                lambda x: installed_val)
    return data
Example #14
def assign_complex_to_samples(items):
    """Assign complex inputs like variants and align outputs to samples.

    Handles list inputs to record conversion where we have inputs from multiple
    locations and need to ensure they are properly assigned to samples in many
    environments.

    The unpleasant approach here is to use standard file naming to match
    with samples so this can work in environments where we don't download/stream
    the input files (for space/time savings).
    """
    extract_fns = {
        ("variants", "samples"): _get_vcf_samples,
        ("align_bam", ): _get_bam_samples
    }
    complex = {k: {} for k in extract_fns.keys()}
    for data in items:
        for k in complex:
            v = tz.get_in(k, data)
            if v is not None:
                for s in extract_fns[k](v, data):
                    if s:
                        complex[k][s] = v
    out = []
    for data in items:
        for k in complex:
            newv = tz.get_in([k, dd.get_sample_name(data)], complex)
            if newv:
                data = tz.update_in(data, k, lambda x: newv)
        out.append(data)
    return out
Example #15
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.

    Creates corrected cnr files with log2 ratios and depths.
    """
    calcfns = {"cnvkit": _calculate_sv_coverage_cnvkit, "gatk-cnv": _calculate_sv_coverage_gatk}
    from bcbio.structural import cnvkit
    data = utils.to_single_data(data)
    if not cnvkit.use_general_sv_bins(data):
        out_target_file, out_anti_file = (None, None)
    else:
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                   dd.get_sample_name(data), "bins"))
        out_target_file, out_anti_file = calcfns[cnvkit.bin_approach(data)](data, work_dir)
        if not os.path.exists(out_target_file):
            out_target_file, out_anti_file = (None, None)
    if "seq2c" in dd.get_svcaller(data):
        from bcbio.structural import seq2c
        seq2c_target = seq2c.precall(data)
    else:
        seq2c_target = None

    if not tz.get_in(["depth", "bins"], data):
        data = tz.update_in(data, ["depth", "bins"], lambda x: {})
    data["depth"]["bins"] = {"target": out_target_file, "antitarget": out_anti_file, "seq2c": seq2c_target}
    return [[data]]
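
The two lines before the final assignment show an initialize-then-assign pattern: update_in seeds the nested dictionary so the plain assignment that follows cannot raise a KeyError. A minimal sketch:

import toolz as tz

data = {}
if not tz.get_in(["depth", "bins"], data):
    data = tz.update_in(data, ["depth", "bins"], lambda x: {})
# safe now that data["depth"] exists
data["depth"]["bins"] = {"target": None, "antitarget": None, "seq2c": None}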
Example #16
def _place_secondary_files(inp_tool, inp_binding=None):
    """Put secondaryFiles at the level of the File item to ensure indexes get passed.
    """
    def _is_file(val):
        return (val == "File" or
                (isinstance(val, (list, tuple)) and
                 ("File" in val or any(isinstance(x, dict) and _is_file(x)
                                       for x in val))))

    secondary_files = inp_tool.pop("secondaryFiles", None)
    if secondary_files:
        key = []
        while (not _is_file(tz.get_in(key + ["type"], inp_tool))
               and not _is_file(tz.get_in(key + ["items"], inp_tool)) and
               not _is_file(tz.get_in(key + ["items", "items"], inp_tool))):
            key.append("type")
        if tz.get_in(key, inp_tool):
            inp_tool["secondaryFiles"] = secondary_files
        elif inp_binding:
            nested_inp_binding = copy.deepcopy(inp_binding)
            nested_inp_binding["prefix"] = "ignore="
            nested_inp_binding["secondaryFiles"] = secondary_files
            inp_tool = tz.update_in(inp_tool, key,
                                    lambda x: nested_inp_binding)
    return inp_tool
Example #17
def assign_complex_to_samples(items):
    """Assign complex inputs like variants and align outputs to samples.

    Handles list inputs to record conversion where we have inputs from multiple
    locations and need to ensure they are properly assigned to samples in many
    environments.

    The unpleasant approach here is to use standard file naming to match
    with samples so this can work in environments where we don't download/stream
    the input files (for space/time savings).
    """
    extract_fns = {("variants", "samples"): _get_vcf_samples,
                   ("align_bam",): _get_bam_samples}
    complex = {k: {} for k in extract_fns.keys()}
    for data in items:
        for k in complex:
            v = tz.get_in(k, data)
            if v is not None:
                for s in extract_fns[k](v, items):
                    if s:
                        complex[k][s] = v
    out = []
    for data in items:
        for k in complex:
            newv = tz.get_in([k, dd.get_sample_name(data)], complex)
            if newv:
                data = tz.update_in(data, k, lambda x: newv)
        out.append(data)
    return out
Example #18
def _fill_prioritization_targets(data):
    """Fill in globally installed files for prioritization.
    """
    ref_file = dd.get_ref_file(data)
    for target in [["svprioritize"], ["coverage"]]:
        val = tz.get_in(["config", "algorithm"] + target, data)
        if val and not os.path.exists(val):
            installed_vals = []
            # Check prioritize directory
            for ext in [".bed", ".bed.gz"]:
                installed_vals += glob.glob(os.path.normpath(os.path.join(os.path.dirname(ref_file), os.pardir,
                                                                          "coverage", "prioritize",
                                                                          val + "*%s" % ext)))
            # Check sv-annotation directory for prioritize gene name lists
            if target[-1] == "svprioritize":
                installed_vals += glob.glob(os.path.join(
                    os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py"))),
                    "%s*" % os.path.basename(val)))
            if len(installed_vals) == 0:
                raise ValueError("Configuration problem. BED file not found for %s: %s" %
                                 (target, val))
            elif len(installed_vals) == 1:
                installed_val = installed_vals[0]
            else:
                # check for partial matches
                installed_val = None
                for v in installed_vals:
                    if v.endswith(val + ".bed.gz") or v.endswith(val + ".bed"):
                        installed_val = v
                        break
                # handle date-stamped inputs
                if not installed_val:
                    installed_val = sorted(installed_vals, reverse=True)[0]
            data = tz.update_in(data, ["config", "algorithm"] + target, lambda x: installed_val)
    return data
Example #19
def add_required_resources(resources):
    """Add empty values for required resources referenced in CWL
    """
    required = [["variation", "cosmic"], ["variation", "dbsnp"]]
    for key in required:
        if not tz.get_in(key, resources):
            resources = tz.update_in(resources, key, lambda x: None)
    return resources
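
Usage is idempotent: keys that already hold a value are left alone and only missing required entries are padded with None. A quick sketch, assuming tz refers to toolz as in the surrounding examples (the path is hypothetical):

resources = {"variation": {"dbsnp": "/refs/dbsnp.vcf.gz"}}
resources = add_required_resources(resources)
assert resources["variation"]["dbsnp"] == "/refs/dbsnp.vcf.gz"
assert resources["variation"]["cosmic"] is None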
Example #20
def post_process(field_funcs_map: List[Tuple[str, Optional[Callable]]],
                 to_clean: Dict,
                 default=None) -> Mapping:
    out = to_clean
    for key, func in field_funcs_map:
        f = func if func else identity
        out = update_in(out, [key], f, default=default)
    return out
Example #21
def json_expand(json_op, key_name='json'):
    """ Convert a string json object to Python dict in an op. """
    if type(json_op) == dict and key_name in json_op and json_op[key_name]:
        try:
            return update_in(json_op, [key_name], json.loads)
        except JSONDecodeError:
            return assoc(json_op, key_name, {})

    return json_op
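
A hedged usage sketch, with the imports the excerpt appears to rely on (update_in and assoc from toolz, plus the json module's JSONDecodeError):

import json
from json import JSONDecodeError
from toolz import assoc, update_in

assert json_expand({"id": 1, "json": '{"a": 1}'}) == {"id": 1, "json": {"a": 1}}
# invalid JSON falls back to an empty dict via assoc
assert json_expand({"id": 2, "json": "oops"}) == {"id": 2, "json": {}}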
Example #22
def _update_nested(key, val, data):
    """Update the data object, avoiding over-writing with nested dictionaries.
    """
    if isinstance(val, dict):
        for sub_key, sub_val in val.items():
            data = _update_nested(key + [sub_key], sub_val, data)
    else:
        data = tz.update_in(data, key, lambda x: val)
    return data
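
The recursion splits dictionary values into per-leaf updates, so nested settings merge instead of clobbering each other. A sketch, assuming tz is toolz as elsewhere on this page:

data = _update_nested(["config", "algorithm"], {"aligner": "bwa", "min_depth": 4}, {})
data = _update_nested(["config", "algorithm"], {"caller": "gatk"}, data)
assert data == {"config": {"algorithm": {"aligner": "bwa", "min_depth": 4,
                                         "caller": "gatk"}}}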
Example #23
def _merge_dreams(first, second):
    new_dask = second.dask
    for i in range(second.npartitions):
        # Populate root of first dask into empty of second
        empty_path = dfs_first_empty_path(new_dask)
        new_dask = toolz.update_in(new_dask, empty_path,
                                   lambda _: (first.name, i))
    new_dask = toolz.merge(new_dask, first.dask)
    return type(second)(new_dask, second.name, second.npartitions)
Example #24
def _symlink_to_workdir(data, key):
    """For CWL support, symlink files into a working directory if in read-only imports.
    """
    orig_file = tz.get_in(key, data)
    if orig_file and not orig_file.startswith(dd.get_work_dir(data)):
        variantcaller = genotype.get_variantcaller(data)
        out_file = os.path.join(dd.get_work_dir(data), variantcaller, os.path.basename(orig_file))
        utils.safe_makedir(os.path.dirname(out_file))
        utils.symlink_plus(orig_file, out_file)
        data = tz.update_in(data, key, lambda x: out_file)
    return data
Example #25
    def transform_network(self, network):
        # FIXME make this a transform

        def update_fn(override_hyperparameters):
            return toolz.merge(override_hyperparameters,
                               self.hyperparameters)

        kwargs = toolz.update_in(transforms.fns.network_to_kwargs(network),
                                 ["override_hyperparameters"],
                                 update_fn)
        return treeano.Network(**kwargs)
Example #26
def add_required_resources(resources):
    """Add empty values for required resources referenced in CWL
    """
    required = [["variation", "cosmic"], ["variation", "dbsnp"],
                ["variation", "lcr"], ["variation", "polyx"],
                ["variation", "encode_blacklist"],
                ["variation", "train_hapmap"], ["variation", "train_indels"]]
    for key in required:
        if not tz.get_in(key, resources):
            resources = tz.update_in(resources, key, lambda x: None)
    return resources
Example #27
def _update_nested(key, val, data):
    """Update the data object, avoiding over-writing with nested dictionaries.
    """
    if isinstance(val, dict):
        for sub_key, sub_val in val.items():
            data = _update_nested(key + [sub_key], sub_val, data)
    else:
        if tz.get_in(key, data) is not None:
            raise ValueError("Duplicated key %s" % key)
        data = tz.update_in(data, key, lambda x: val)
    return data
Example #28
def _merge_align_bams(data):
    """Merge multiple alignment BAMs, including split and discordant reads.
    """
    for key in (["work_bam"], ["work_bam_plus", "disc"], ["work_bam_plus", "sr"]):
        in_files = tz.get_in(key, data, [])
        if not isinstance(in_files, (list, tuple)):
            in_files = [in_files]
        in_files = [x for x in in_files if x and x != "None"]
        if in_files:
            ext = "-%s" % key[-1] if len(key) > 1 else ""
            out_file = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data),
                                    "%s-sort%s.bam" % (dd.get_sample_name(data), ext))
            merged_file = merge_bam_files(in_files, utils.safe_makedir(os.path.dirname(out_file)),
                                          data, out_file=out_file)
            data = tz.update_in(data, key, lambda x: merged_file)
        else:
            data = tz.update_in(data, key, lambda x: None)
    if "align_bam" in data and "work_bam" in data:
        data["align_bam"] = data["work_bam"]
    return data
Example #29
    def _compare_dicts(self, orig, new, ns):
        out = {}
        for key, val in new.items():
            nskey = ns + [key]
            orig_val = tz.get_in([key], orig)
            if isinstance(val, dict) and isinstance(orig_val, dict):
                for nkey, nval in self._compare_dicts(orig_val or {}, val or {}, nskey).items():
                    out = self._merge(out, {nkey: nval})
            elif val != orig_val:
                out = tz.update_in(out, nskey, lambda x: copy.deepcopy(val))
        return out
Example #30
def _samples_to_records(samples):
    """Convert samples into output CWL records.
    """
    from bcbio.pipeline import run_info
    RECORD_CONVERT_TO_LIST = set(["config__algorithm__tools_on", "config__algorithm__tools_off"])
    all_keys = _get_all_cwlkeys(samples)
    out = []
    for data in samples:
        for raw_key in sorted(list(all_keys)):
            key = raw_key.split("__")
            if tz.get_in(key, data) is None:
                data = tz.update_in(data, key, lambda x: None)
                data["cwl_keys"].append(raw_key)
            if raw_key in RECORD_CONVERT_TO_LIST:
                val = tz.get_in(key, data)
                if not val: val = []
                elif not isinstance(val, (list, tuple)): val = [val]
                data = tz.update_in(data, key, lambda x: val)
        data["metadata"] = run_info.add_metadata_defaults(data.get("metadata", {}))
        out.append(data)
    return out
Example #31
def get_hla_truthset(data):
    """Retrieve expected truth calls for annotating HLA called output.
    """
    val_csv = tz.get_in(["config", "algorithm", "hlavalidate"], data)
    out = {}
    if val_csv and utils.file_exists(val_csv):
        with open(val_csv) as in_handle:
            reader = csv.reader(in_handle)
            next(reader)  # header
            for sample, locus, alleles in (l for l in reader if l):
                out = tz.update_in(out, [sample, locus], lambda x: [x.strip() for x in alleles.split(";")])
    return out
Example #32
def _get_purecn_dx_files(paired, out):
    """Retrieve files generated by PureCN_Dx
    """
    out_base = "%s-dx" % utils.splitext_plus(out["rds"])[0]
    all_files = []
    for key, ext in [[("mutation_burden", ), "_mutation_burden.csv"],
                     [("plot", "signatures"), "_signatures.pdf"],
                     [("signatures", ), "_signatures.csv"]]:
        cur_file = "%s%s" % (out_base, ext)
        out = tz.update_in(out, key, lambda x: cur_file)
        all_files.append(os.path.basename(cur_file))
    return out_base, out, all_files
Example #33
def _update_nested(key, val, data):
    """Update the data object, avoiding over-writing with nested dictionaries.
    """
    if isinstance(val, dict):
        for sub_key, sub_val in val.items():
            data = _update_nested(key + [sub_key], sub_val, data)
    else:
        already_there = tz.get_in(key, data) is not None
        if already_there and val:
            raise ValueError("Duplicated key %s: %s and %s" % (key, val, tz.get_in(key, data)))
        if val or not already_there:
            data = tz.update_in(data, key, lambda x: val)
    return data
Example #34
def remove_index_entry(index: Index, entry: str, id: Union[int, str]) -> Index:
    if entry is None:
        return index
    try:
        if len(index[entry]) == 1:
            return t.dissoc(index, entry)
        else:
            return t.update_in(index, [entry], lambda x: x - {id})
    except KeyError:
        error('''
            It seems the Index is corrupt. Please run 
            `mdn regenerate` and try again''')
        assert False  # just for mypy <3
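
Here update_in performs a persistent set difference: the returned index copy has the id removed, while dissoc drops the entry entirely once its last id disappears. A sketch with a hypothetical tag index (t is toolz, as in the excerpt):

idx = {"python": {1, 3}, "toolz": {3}}
assert remove_index_entry(idx, "python", 3) == {"python": {1}, "toolz": {3}}
assert remove_index_entry(idx, "toolz", 3) == {"python": {1, 3}}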
Example #35
def _get_purecn_dx_files(paired, out, require_exist=False):
    """Retrieve files generated by PureCN_Dx"""
    out_base = utils.splitext_plus(out["rds"])[0]
    all_files = []
    for key, ext in [[("mutation_burden", ), "_mutation_burden.csv"],
                     [("plot", "signatures"), "_signatures.pdf"],
                     [("signatures", ), "_signatures.csv"],
                     [("chrom_instability", ), "_cin.csv"]]:
        cur_file = f"{out_base}{ext}"
        if not require_exist or os.path.exists(cur_file):
            out = tz.update_in(out, key, lambda x: cur_file)
            all_files.append(os.path.basename(cur_file))
    return out_base, out, all_files
Example #36
def _get_support(parts):
    """Retrieve supporting information for potentially multiple samples.

    Convert speedseq's numbering scheme back into sample and support information.
    sample_ids look like 20 or 21, where the leading digits give the sample number
    and the final digit encodes the type of supporting evidence.
    """
    out = {}
    for sample_id, read_count in (x.split(",") for x in parts[11].split(":")[-1].split(";")):
        support_type = SUPPORT_NUMS[sample_id[-1]]
        sample_id = int(sample_id[:-1]) - 1
        out = tz.update_in(out, [sample_id, support_type], lambda x: x + int(read_count), 0)
    return out
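
This is the accumulator form of update_in: the fourth argument seeds missing paths, so the first occurrence of a (sample, support type) pair starts from 0 before read counts are added. A standalone sketch with made-up counts:

import toolz as tz

out = {}
for sample_id, support_type, count in [(0, "PE", 2), (0, "PE", 3), (1, "SR", 1)]:
    out = tz.update_in(out, [sample_id, support_type], lambda x: x + count, 0)
assert out == {0: {"PE": 5}, 1: {"SR": 1}}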
Example #37
def _merge_align_bams(data):
    """Merge multiple alignment BAMs, including split and discordant reads.
    """
    for keys in (["work_bam"], ["work_bam-plus", "disc"], ["work_bam-plus", "sr"]):
        in_files = tz.get_in(keys, data)
        if in_files:
            ext = "-%s" % keys[-1] if len(keys) > 1 else ""
            out_file = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data),
                                    "%s-sort%s.bam" % (dd.get_sample_name(data), ext))
            merged_file = merge_bam_files(in_files, os.path.dirname(out_file), data["config"], out_file=out_file)
            data = tz.update_in(data, keys, lambda x: merged_file)
    if "align_bam" in data and "work_bam" in data:
        data["align_bam"] = data["work_bam"]
    return data
Example #38
def add_required_resources(resources):
    """Add default or empty values for required resources referenced in CWL
    """
    required = [["variation", "cosmic"], ["variation", "clinvar"], ["variation", "dbsnp"],
                ["variation", "lcr"], ["variation", "polyx"],
                ["variation", "encode_blacklist"], ["variation", "gc_profile"],
                ["variation", "germline_het_pon"],
                ["variation", "train_hapmap"], ["variation", "train_indels"],
                ["variation", "editing"], ["variation", "exac"], ["variation", "esp"],
                ["variation", "gnomad_exome"],
                ["variation", "1000g"], ["aliases", "human"]]
    for key in required:
        if not tz.get_in(key, resources):
            resources = tz.update_in(resources, key, lambda x: None)
    return resources
Example #39
def _fill_validation_targets(data):
    """Fill validation targets pointing to globally installed truth sets.
    """
    ref_file = dd.get_ref_file(data)
    sv_targets = list(zip(itertools.repeat("svvalidate"),
                          tz.get_in(["config", "algorithm", "svvalidate"], data, {}).keys()))
    for vtarget in [list(xs) for xs in [["validate"], ["validate_regions"]] + sv_targets]:
        val = tz.get_in(["config", "algorithm"] + vtarget, data)
        if val and not os.path.exists(val):
            installed_val = os.path.normpath(os.path.join(os.path.dirname(ref_file), os.pardir, "validation", val))
            if os.path.exists(installed_val):
                data = tz.update_in(data, ["config", "algorithm"] + vtarget, lambda x: installed_val)
            else:
                raise ValueError("Configuration problem. Validation file not found for %s: %s" %
                                 (vtarget, val))
    return data
Example #40
def _world_from_cwl(fnargs, work_dir):
    """Reconstitute a bcbio world data object from flattened CWL-compatible inputs.

    Converts the flat CWL representation into a nested bcbio world dictionary.
    """
    data = {}
    for fnarg in fnargs:
        key, val = fnarg.split("=")
        key = key.split("__")
        if val.startswith(("{", "[")):
            val = json.loads(val)
        data = tz.update_in(data, key, lambda x: val)
    data["dirs"] = {"work": work_dir}
    # XXX Determine cores and other resources from CWL
    data["config"]["resources"] = {}
    data = run_info.normalize_world(data)
    return [data]
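
The key reconstitution is the inverse of CWL flattening: double underscores mark nesting levels, and update_in rebuilds the tree one argument at a time. A minimal sketch of just that step, with hypothetical arguments:

import toolz as tz

data = {}
for fnarg in ["config__algorithm__aligner=bwa", "description=Test1"]:
    key, val = fnarg.split("=")
    data = tz.update_in(data, key.split("__"), lambda x: val)
assert data == {"config": {"algorithm": {"aligner": "bwa"}},
                "description": "Test1"}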
Example #41
def _world_from_cwl(fnargs):
    """Reconstitute a bcbio world data object from flattened CWL-compatible inputs.

    Converts the flat CWL object into nested, potentially multi-sample bcbio world
    objects.
    """
    samples = {}
    for fnarg in fnargs:
        key, val = fnarg.split("=")
        key = key.split("__")
        if val.startswith(("{", "[")):
            val = json.loads(val)
        samples = tz.update_in(samples, key, lambda x: val)
    out = []
    for sample in sorted(samples.keys()):
        out.append(samples[sample])
    return [out]
Example #42
def _place_secondary_files(inp_tool, inp_binding=None):
    """Put secondaryFiles at the level of the File item to ensure indexes get passed.
    """
    def _is_file(val):
        return (val == "File" or (isinstance(val, (list, tuple)) and "File" in val))
    secondary_files = inp_tool.pop("secondaryFiles", None)
    if secondary_files:
        key = []
        while (not _is_file(tz.get_in(key + ["type"], inp_tool))
               and not _is_file(tz.get_in(key + ["items"], inp_tool))):
            key.append("type")
        if tz.get_in(key, inp_tool):
            inp_tool["secondaryFiles"] = secondary_files
        elif inp_binding:
            nested_inp_binding = copy.deepcopy(inp_binding)
            nested_inp_binding["prefix"] = "ignore="
            nested_inp_binding["secondaryFiles"] = secondary_files
            inp_tool = tz.update_in(inp_tool, key, lambda x: nested_inp_binding)
    return inp_tool