Пример #1
0
def merge_sample(data):
    """Combine the fastq and BAM inputs of one sample into single files.

    The sample configuration is first updated from ``data["info"]`` through
    ``config_utils.update_w_custom``. Fastq files are combined only when the
    ``upload_fastq`` algorithm option is enabled; otherwise both fastq
    entries of the result are ``None``. The BAM files are merged into
    ``<sample>.bam`` in the work directory, with the sample name taken from
    ``data["info"]["rgnames"]["sample"]``.

    Returns a list holding a one-element list with the new sample dictionary.
    """
    logger.debug("Combining fastq and BAM files %s" % str(data["name"]))
    config = config_utils.update_w_custom(data["config"], data["info"])

    # Fastq outputs stay unset unless uploading them was requested.
    fastq1 = fastq2 = None
    if config["algorithm"].get("upload_fastq", False):
        fastq1, fastq2 = combine_fastq_files(
            data["fastq_files"], data["dirs"]["work"], config)

    work_dir = data["dirs"]["work"]
    out_file = os.path.join(work_dir,
                            data["info"]["rgnames"]["sample"] + ".bam")
    sort_bam = merge_bam_files(data["bam_files"], work_dir, config,
                               out_file=out_file)

    sample = {
        "name": data["name"],
        "metadata": data["info"].get("metadata", {}),
        "info": data["info"],
        "genome_build": data["genome_build"],
        "sam_ref": data["sam_ref"],
        "work_bam": sort_bam,
        "fastq1": fastq1,
        "fastq2": fastq2,
        "dirs": data["dirs"],
        "config": config,
        "config_file": data["config_file"],
    }
    return [[sample]]
Пример #2
0
def process_sample(sample_name, fastq_files, info, bam_files, dirs,
                   config, config_file):
    """Run the final processing steps for one, possibly multiplexed, sample.

    Steps, in order: combine the fastq files, merge the BAM files,
    optionally recalibrate with GATK (and, only when recalibrating,
    optionally call SNPs and compute variation effects), optionally
    assemble transcripts, generate alignment summaries when a reference is
    available, and convert the merged BAM to wig.

    Returns a list of: sample name, input fastq files, info, merged BAM,
    recalibrated BAM, variant file and effects file. The last three are
    empty strings for the steps that did not run.
    """
    config = _update_config_w_custom(config, info)

    genome_build = info.get("genome_build")
    _, sam_ref = get_genome_ref(
        genome_build, config["algorithm"]["aligner"], dirs["galaxy"])
    fastq1, fastq2 = combine_fastq_files(fastq_files, dirs["work"])

    name_str = str(sample_name)
    log.info("Combining and preparing wig file %s" % name_str)
    sort_bam = merge_bam_files(bam_files, dirs["work"], config)

    # Outputs of the optional GATK steps default to empty strings.
    gatk_bam = ""
    vrn_file = ""
    effects_file = ""
    if config["algorithm"]["recalibrate"]:
        log.info("Recalibrating %s with GATK" % name_str)
        gatk_bam = recalibrate_quality(
            sort_bam, fastq1, fastq2, sam_ref, dirs, config)
        # SNP calling works on the recalibrated BAM, so it is nested here.
        if config["algorithm"]["snpcall"]:
            log.info("SNP genotyping %s with GATK" % name_str)
            vrn_file = run_genotyper(gatk_bam, sam_ref, config)
            log.info("Calculating variation effects for %s" % name_str)
            effects_file = variation_effects(vrn_file, genome_build, config)

    if config["algorithm"].get("transcript_assemble", False):
        # The assembly result is not used further in this function.
        assemble_transcripts(sort_bam, sam_ref, config)

    if sam_ref is not None:
        log.info("Generating summary files: %s" % name_str)
        is_paired = fastq2 is not None
        generate_align_summary(sort_bam, is_paired, sam_ref,
                               sample_name, config, dirs)

    bam_to_wig(sort_bam, config, config_file)
    return [sample_name, fastq_files, info, sort_bam, gatk_bam, vrn_file,
            effects_file]
Пример #3
0
def _merge_align_bams(data):
    """Merge the alignment BAMs of a sample, one merged file per category.

    Handles the main ``work_bam``, the discordant (``disc``) and split read
    (``sr``) entries under ``work_bam_plus``, and ``umi_bam``. Each entry may
    be a single value or a list; empty values and the literal string
    ``"None"`` are discarded. Entries left without input files are set to
    ``None``. When both ``align_bam`` and ``work_bam`` are present afterwards,
    ``align_bam`` is pointed at the merged ``work_bam``.

    Returns the updated ``data``.
    """
    key_paths = (["work_bam"], ["work_bam_plus", "disc"],
                 ["work_bam_plus", "sr"], ["umi_bam"])
    for key_path in key_paths:
        bams = tz.get_in(key_path, data, [])
        if not isinstance(bams, (list, tuple)):
            bams = [bams]
        bams = [bam for bam in bams if bam and bam != "None"]

        merged = None
        if bams:
            # Nested entries get their final key as file name suffix.
            suffix = "-%s" % key_path[-1] if len(key_path) > 1 else ""
            work_dir = dd.get_work_dir(data)
            sample_name = dd.get_sample_name(data)
            out_file = os.path.join(
                work_dir, "align", sample_name,
                "%s-sort%s.bam" % (sample_name, suffix))
            out_dir = utils.safe_makedir(os.path.dirname(out_file))
            merged = merge_bam_files(bams, out_dir, data, out_file=out_file)
        data = tz.update_in(data, key_path, lambda _, value=merged: value)

    if "align_bam" in data and "work_bam" in data:
        data["align_bam"] = data["work_bam"]
    return data
Пример #4
0
def merge_extras(items, config):
    """Merge extra disambiguated reads into a final BAM file.

    ``items`` are grouped by sample name. For every entry name found in the
    ``disambiguate`` dictionary of the first item, the files of all items
    sharing a sample name are merged into ``<first input>-allmerged<ext>``.
    BAM inputs go through ``merge.merge_bam_files``; the only other entry
    accepted is ``summary``, handled by ``_merge_summary``.

    Each item's ``disambiguate`` entry is replaced in place by the merged
    files of its sample.

    Returns a list with one single-item list per input item, or an empty
    list when ``items`` is empty.
    """
    # Nothing to merge; avoids an IndexError on items[0] below.
    if not items:
        return []

    # The grouping does not depend on the extra name, so build it once.
    items_by_name = collections.defaultdict(list)
    for data in items:
        items_by_name[dd.get_sample_name(data)].append(data)

    final = {}
    for extra_name in items[0]["disambiguate"]:
        for sname, name_items in items_by_name.items():
            in_files = [x["disambiguate"][extra_name] for x in name_items]
            out_file = "%s-allmerged%s" % os.path.splitext(in_files[0])
            if in_files[0].endswith(".bam"):
                merged_file = merge.merge_bam_files(
                    in_files, os.path.dirname(out_file), config,
                    out_file=out_file)
            else:
                assert extra_name == "summary", extra_name
                merged_file = _merge_summary(in_files, out_file,
                                             name_items[0])
            final.setdefault(sname, {})[extra_name] = merged_file

    out = []
    for data in items:
        data["disambiguate"] = final[dd.get_sample_name(data)]
        out.append([data])
    return out
Пример #5
0
def merge_extras(items, config):
    """Merge extra disambiguated reads into a final BAM file.

    Items are grouped by sample name, and for each entry name in the first
    item's ``disambiguate`` dictionary the corresponding files of every
    group are merged into ``<first input>-allmerged<ext>``. Inputs ending
    in ``.bam`` are merged with ``merge.merge_bam_files``; any other entry
    must be named ``summary`` and is merged with ``_merge_summary``.

    The ``disambiguate`` entry of every item is overwritten with the merged
    files of its sample.

    Returns one single-item list per input item; an empty ``items`` yields
    an empty list.
    """
    if not items:
        # Guard against indexing items[0] on empty input.
        return []

    # Group once up front: the grouping is the same for every extra name.
    by_sample = collections.defaultdict(list)
    for data in items:
        by_sample[dd.get_sample_name(data)].append(data)

    final = {}
    for extra_name in items[0]["disambiguate"]:
        for sname, sample_items in by_sample.items():
            if sname not in final:
                final[sname] = {}
            in_files = [cur["disambiguate"][extra_name]
                        for cur in sample_items]
            first_file = in_files[0]
            out_file = "%s-allmerged%s" % os.path.splitext(first_file)
            if first_file.endswith(".bam"):
                merged_file = merge.merge_bam_files(
                    in_files,
                    os.path.dirname(out_file),
                    config,
                    out_file=out_file)
            else:
                assert extra_name == "summary", extra_name
                merged_file = _merge_summary(in_files, out_file,
                                             sample_items[0])
            final[sname][extra_name] = merged_file

    out = []
    for data in items:
        data["disambiguate"] = final[dd.get_sample_name(data)]
        out.append([data])
    return out
Пример #6
0
def merge_sample(data):
    """Merge the fastq and BAM files that make up a single sample.

    Updates the configuration with ``data["info"]``, combines fastq files if
    the ``upload_fastq`` algorithm option asks for it (``None`` for both
    otherwise) and merges the BAM files into a ``.bam`` file in the work
    directory named after ``data["info"]["rgnames"]["sample"]``.

    Returns the resulting sample dictionary wrapped in two nested lists.
    """
    logger.debug("Combining fastq and BAM files %s" % str(data["name"]))
    config = config_utils.update_w_custom(data["config"], data["info"])

    want_fastq = config["algorithm"].get("upload_fastq", False)
    if not want_fastq:
        fastq1, fastq2 = None, None
    else:
        fastq1, fastq2 = combine_fastq_files(data["fastq_files"],
                                             data["dirs"]["work"],
                                             config)

    bam_name = "%s.bam" % data["info"]["rgnames"]["sample"] \
        if False else data["info"]["rgnames"]["sample"] + ".bam"
    out_file = os.path.join(data["dirs"]["work"], bam_name)
    sort_bam = merge_bam_files(data["bam_files"],
                               data["dirs"]["work"],
                               config,
                               out_file=out_file)

    result = {}
    result["name"] = data["name"]
    result["metadata"] = data["info"].get("metadata", {})
    result["info"] = data["info"]
    result["genome_build"] = data["genome_build"]
    result["sam_ref"] = data["sam_ref"]
    result["work_bam"] = sort_bam
    result["fastq1"] = fastq1
    result["fastq2"] = fastq2
    result["dirs"] = data["dirs"]
    result["config"] = config
    result["config_file"] = data["config_file"]
    return [[result]]
Пример #7
0
def merge_sample(data):
    """Merge the fastq and BAM files of a sample and resolve its reference.

    The configuration is updated with ``data["info"]`` and the genome build
    and reference are obtained from ``shared.ref_genome_info``. Fastq files
    are combined only when the ``upload_fastq`` algorithm option is set;
    otherwise both fastq entries are ``None``.

    Returns a list containing a single-item list with the sample dictionary.
    """
    logger.info("Combining fastq and BAM files %s" % str(data["name"]))
    config = shared.update_config_w_custom(data["config"], data["info"])
    genome_build, sam_ref = shared.ref_genome_info(
        data["info"], config, data["dirs"])

    # Default to no fastq output; only combine when explicitly requested.
    fastq1 = fastq2 = None
    if config["algorithm"].get("upload_fastq", False):
        fastq1, fastq2 = combine_fastq_files(
            data["fastq_files"], data["dirs"]["work"], config)

    sort_bam = merge_bam_files(data["bam_files"], data["dirs"]["work"],
                               config)
    sample = {
        "name": data["name"],
        "metadata": data["info"].get("metadata", {}),
        "info": data["info"],
        "genome_build": genome_build,
        "sam_ref": sam_ref,
        "work_bam": sort_bam,
        "fastq1": fastq1,
        "fastq2": fastq2,
        "dirs": data["dirs"],
        "config": config,
        "config_file": data["config_file"],
    }
    return [[sample]]
Пример #8
0
def delayed_bam_merge(data):
    """Perform a merge on previously prepped files, delayed in processing.

    Handles merging of associated split read ("-sr") and discordant
    ("-disc") files if present.

    When ``data`` has a non-empty ``combine`` entry, it must hold exactly
    one key naming the item of ``data`` to merge. The file stored under
    that key, plus any ``extras`` (nested lists are flattened), are merged
    into the ``out`` path. Afterwards ``region`` and ``combine`` are
    removed from ``data`` and the key points at the merged file. Without a
    ``combine`` entry ``data`` is passed through unchanged.

    Returns ``[[data]]``.
    """
    if data.get("combine"):
        assert len(data["combine"].keys()) == 1
        # list() keeps the lookup working where keys() returns a view.
        file_key = list(data["combine"].keys())[0]
        extras = []
        for x in data["combine"][file_key].get("extras", []):
            if isinstance(x, (list, tuple)):
                extras.extend(x)
            else:
                extras.append(x)
        in_files = sorted(set([data[file_key]] + extras))
        out_file = data["combine"][file_key]["out"]
        for ext in ["-disc", "-sr", ""]:
            if ext:
                candidates = [utils.append_stem(f, ext) for f in in_files]
                cur_in_files = [f for f in candidates if os.path.exists(f)]
                # Skip a supplementary merge when none of its inputs exist.
                # in_files itself is never empty, so it cannot gate this.
                cur_out_file = (utils.append_stem(out_file, ext)
                                if cur_in_files else None)
            else:
                cur_in_files, cur_out_file = in_files, out_file
            if cur_out_file:
                # Work on a copy so the caller's configuration is untouched.
                config = copy.deepcopy(data["config"])
                config["algorithm"]["save_diskspace"] = False
                merged_file = merge_bam_files(cur_in_files,
                                              os.path.dirname(cur_out_file),
                                              config,
                                              out_file=cur_out_file)
        data.pop("region", None)
        data.pop("combine", None)
        # The main merge (ext == "") runs last, so this is its result.
        data[file_key] = merged_file
    return [[data]]
Пример #9
0
def merge_extras(in_files, out_file, config):
    """Merge extra disambiguated reads into a single final BAM file.

    Delegates to ``merge.merge_bam_files``, passing the directory of
    ``out_file`` as its second argument, and returns its result.
    """
    work_dir = os.path.dirname(out_file)
    return merge.merge_bam_files(in_files, work_dir, config,
                                 out_file=out_file)
Пример #10
0
def merge_extras(in_files, out_file, config):
    """Combine the extra disambiguated read files into ``out_file``.

    Thin wrapper around ``merge.merge_bam_files``: the directory holding
    ``out_file`` is handed over as the second positional argument and the
    value returned by the merge is passed back to the caller.
    """
    merge_args = (in_files, os.path.dirname(out_file), config)
    return merge.merge_bam_files(*merge_args, out_file=out_file)
Пример #11
0
def delayed_bam_merge(data):
    """Perform a merge on previously prepped files, delayed in processing.

    Handles merging of associated split read and discordant files if present.

    When ``data`` has a non-empty ``combine`` entry, it must hold exactly
    one key naming the item to merge. Its ``extras`` (nested lists are
    flattened) plus the file currently stored under that key, if any, are
    merged into the configured ``out`` path, falling back to
    ``_merge_out_from_infiles``. The same is done for every supplementary
    extension listed under ``data[<key> + "-plus"]``. Merged files are
    written back to ``data`` and the ``region`` and ``combine`` entries are
    removed. Without a ``combine`` entry ``data`` is passed through
    unchanged.

    Returns ``[[data]]``.
    """
    if data.get("combine"):
        assert len(data["combine"].keys()) == 1
        # keys() is a non-indexable view on Python 3; list() works on both.
        file_key = list(data["combine"].keys())[0]
        extras = []
        for x in data["combine"][file_key].get("extras", []):
            if isinstance(x, (list, tuple)):
                extras.extend(x)
            else:
                extras.append(x)
        if file_key in data:
            extras.append(data[file_key])
        in_files = sorted(set(extras))
        out_file = tz.get_in(["combine", file_key, "out"], data,
                             _merge_out_from_infiles(in_files))
        # Snapshot the supplementary extensions as a list: a keys() view
        # cannot be concatenated with a list on Python 3.
        sup_exts = list(data.get(file_key + "-plus", {}).keys())
        for ext in sup_exts + [""]:
            merged_file = None
            # NOTE(review): both values set in this branch are overwritten
            # by the if/else right below, so it currently has no effect.
            # Confirm the intended handling of pre-existing outputs.
            if os.path.exists(utils.append_stem(out_file, "-" + ext)):
                cur_out_file, cur_in_files = out_file, []
            if ext:
                candidates = [utils.append_stem(f, "-" + ext)
                              for f in in_files]
                cur_in_files = [f for f in candidates if os.path.exists(f)]
                # Skip supplementary merges without any existing input.
                cur_out_file = (utils.append_stem(out_file, "-" + ext)
                                if len(cur_in_files) > 0 else None)
            else:
                cur_in_files, cur_out_file = in_files, out_file
            if cur_out_file:
                # Work on a copy so the caller's configuration is untouched.
                config = copy.deepcopy(data["config"])
                config["algorithm"]["save_diskspace"] = False
                if len(cur_in_files) > 0:
                    merged_file = merge_bam_files(
                        cur_in_files,
                        os.path.dirname(cur_out_file),
                        config,
                        out_file=cur_out_file)
                else:
                    # No inputs: the output must already be in place.
                    assert os.path.exists(cur_out_file)
                    merged_file = cur_out_file
            if merged_file:
                if ext:
                    data[file_key + "-plus"][ext] = merged_file
                else:
                    data[file_key] = merged_file
        data.pop("region", None)
        data.pop("combine", None)
    return [[data]]
Пример #12
0
def _merge_align_bams(data):
    """Merge multiple alignment BAMs, including split and discordant reads.

    Looks up ``work_bam`` and the ``disc`` and ``sr`` entries under
    ``work_bam-plus``. Every non-empty entry is merged into
    ``<work dir>/align/<sample>/<sample>-sort[-disc|-sr].bam`` and replaced
    by the result of ``merge_bam_files``. When both ``align_bam`` and
    ``work_bam`` are present afterwards, ``align_bam`` is pointed at the
    merged ``work_bam``.

    Returns the updated ``data``.
    """
    for keys in (["work_bam"], ["work_bam-plus", "disc"],
                 ["work_bam-plus", "sr"]):
        in_files = tz.get_in(keys, data)
        if in_files:
            # A single BAM may be stored as a bare path; wrap it so the
            # merge always receives a list of files.
            if not isinstance(in_files, (list, tuple)):
                in_files = [in_files]
            # Nested entries get their final key as file name suffix.
            ext = "-%s" % keys[-1] if len(keys) > 1 else ""
            out_file = os.path.join(
                dd.get_work_dir(data), "align", dd.get_sample_name(data),
                "%s-sort%s.bam" % (dd.get_sample_name(data), ext))
            merged_file = merge_bam_files(in_files, os.path.dirname(out_file),
                                          data["config"], out_file=out_file)
            data = tz.update_in(data, keys, lambda x: merged_file)
    if "align_bam" in data and "work_bam" in data:
        data["align_bam"] = data["work_bam"]
    return data
Пример #13
0
def merge_sample(data):
    """Merge the fastq and BAM files of a sample into combined files.

    Updates the configuration with ``data["info"]``, resolves the genome
    build and reference through ``ref_genome_info``, then combines the
    fastq files and merges the BAM files in the work directory.

    Returns a list containing a single-item list with the sample dictionary.
    """
    logger.info("Combining fastq and BAM files %s" % str(data["name"]))
    config = _update_config_w_custom(data["config"], data["info"])
    genome_build, sam_ref = ref_genome_info(data["info"], config,
                                            data["dirs"])
    fastq1, fastq2 = combine_fastq_files(
        data["fastq_files"], data["dirs"]["work"], config)
    sort_bam = merge_bam_files(
        data["bam_files"], data["dirs"]["work"], config)

    sample = {
        "name": data["name"],
        "genome_build": genome_build,
        "sam_ref": sam_ref,
        "work_bam": sort_bam,
        "fastq1": fastq1,
        "fastq2": fastq2,
        "dirs": data["dirs"],
        "config": config,
        "config_file": data["config_file"],
    }
    return [[sample]]
Пример #14
0
def delayed_bam_merge(data):
    """Perform a merge on previously prepped files, delayed in processing.

    When ``data`` has a non-empty ``combine`` entry, it must hold exactly
    one key naming the item of ``data`` to merge. The file stored under
    that key, plus any ``extras``, are merged into the ``out`` path using a
    copy of the configuration with ``save_diskspace`` turned off. The
    ``region`` entry is then dropped and the key points at the merged file.
    Without a ``combine`` entry ``data`` is passed through unchanged.

    Returns ``[[data]]``.
    """
    if data.get("combine"):
        assert len(data["combine"].keys()) == 1
        # list() keeps the lookup working where keys() returns a view.
        file_key = list(data["combine"].keys())[0]
        extras = data["combine"][file_key].get("extras", [])
        # Sort the de-duplicated inputs: set iteration order is arbitrary,
        # which would otherwise make the merge order vary between runs.
        in_files = sorted(set([data[file_key]] + extras))
        out_file = data["combine"][file_key]["out"]
        logger.debug("Combining BAM files to %s" % out_file)
        # Work on a copy so the caller's configuration is left untouched.
        config = copy.deepcopy(data["config"])
        config["algorithm"]["save_diskspace"] = False
        merged_file = merge_bam_files(in_files, os.path.dirname(out_file),
                                      config, out_file=out_file)
        # dict.has_key() no longer exists on Python 3; pop() with a
        # default removes the entry when present on both versions.
        data.pop("region", None)
        data[file_key] = merged_file
    return [[data]]
Пример #15
0
def delayed_bam_merge(data):
    """Perform a merge on previously prepped files, delayed in processing.

    Handles merging of associated split read and discordant files if present.

    A non-empty ``combine`` entry in ``data`` must contain exactly one key,
    naming the item to merge. The inputs are that key's ``extras`` (nested
    lists flattened) together with the file already stored under the key,
    if there is one. They are merged into the configured ``out`` path, or
    into the path given by ``_merge_out_from_infiles`` when none is set.
    Supplementary files are merged per extension found under
    ``data[<key> + "-plus"]``. Results are stored back into ``data``, and
    ``region`` and ``combine`` are removed. If there is no ``combine``
    entry, ``data`` is left as it is.

    Returns ``[[data]]``.
    """
    if not data.get("combine"):
        return [[data]]

    assert len(data["combine"].keys()) == 1
    # Indexing keys() directly fails on Python 3, where it is a view.
    file_key = list(data["combine"].keys())[0]
    extras = []
    for x in data["combine"][file_key].get("extras", []):
        if isinstance(x, (list, tuple)):
            extras.extend(x)
        else:
            extras.append(x)
    if file_key in data:
        extras.append(data[file_key])
    in_files = sorted(set(extras))
    out_file = tz.get_in(["combine", file_key, "out"], data,
                         _merge_out_from_infiles(in_files))
    plus_key = file_key + "-plus"
    # A keys() view cannot be added to a list on Python 3, so copy it.
    sup_exts = list(data.get(plus_key, {}).keys())
    for ext in sup_exts + [""]:
        merged_file = None
        # NOTE(review): the assignment in this branch is always replaced
        # by the if/else that follows, so it has no effect as written.
        # Confirm how already existing outputs are meant to be handled.
        if os.path.exists(utils.append_stem(out_file, "-" + ext)):
            cur_out_file, cur_in_files = out_file, []
        if ext:
            cur_in_files = list(filter(
                os.path.exists,
                (utils.append_stem(f, "-" + ext) for f in in_files)))
            # No existing supplementary inputs means nothing to merge.
            if len(cur_in_files) > 0:
                cur_out_file = utils.append_stem(out_file, "-" + ext)
            else:
                cur_out_file = None
        else:
            cur_in_files, cur_out_file = in_files, out_file
        if cur_out_file:
            # Copy the configuration so the caller's version is unchanged.
            config = copy.deepcopy(data["config"])
            config["algorithm"]["save_diskspace"] = False
            if len(cur_in_files) > 0:
                merged_file = merge_bam_files(
                    cur_in_files, os.path.dirname(cur_out_file), config,
                    out_file=cur_out_file)
            else:
                # Without inputs the output has to exist already.
                assert os.path.exists(cur_out_file)
                merged_file = cur_out_file
        if merged_file:
            if ext:
                data[plus_key][ext] = merged_file
            else:
                data[file_key] = merged_file
    data.pop("region", None)
    data.pop("combine", None)
    return [[data]]
Пример #16
0
def _merge_align_bams(data):
    """Merge the alignment BAMs of a sample, one merged file per category.

    Covers ``work_bam`` plus the ``disc`` and ``sr`` entries under
    ``work_bam_plus``. Entries that are missing or empty are skipped; a
    single value is treated as a one-element list. Each merged file is
    written to ``<work dir>/align/<sample>/<sample>-sort[-disc|-sr].bam``
    and stored back under the same key. When both ``align_bam`` and
    ``work_bam`` are present afterwards, ``align_bam`` is pointed at the
    merged ``work_bam``.

    Returns the updated ``data``.
    """
    key_paths = (["work_bam"], ["work_bam_plus", "disc"],
                 ["work_bam_plus", "sr"])
    for key_path in key_paths:
        bams = tz.get_in(key_path, data)
        if not bams:
            continue
        if not isinstance(bams, (list, tuple)):
            bams = [bams]
        # Nested entries get their final key as file name suffix.
        suffix = "-%s" % key_path[-1] if len(key_path) > 1 else ""
        work_dir = dd.get_work_dir(data)
        sample_name = dd.get_sample_name(data)
        out_file = os.path.join(work_dir, "align", sample_name,
                                "%s-sort%s.bam" % (sample_name, suffix))
        out_dir = utils.safe_makedir(os.path.dirname(out_file))
        merged = merge_bam_files(bams, out_dir, data["config"],
                                 out_file=out_file)
        data = tz.update_in(data, key_path, lambda _, value=merged: value)
    if "align_bam" in data and "work_bam" in data:
        data["align_bam"] = data["work_bam"]
    return data
Пример #17
0
def merge_sample(data):
    """Merge the fastq and BAM files of a sample into combined files.

    Updates the configuration with ``data["info"]`` and resolves the genome
    build and reference through ``ref_genome_info``. Fastq files are
    combined only when the ``upload_fastq`` algorithm option is enabled;
    otherwise both fastq entries of the result are ``None``.

    Returns a list containing a single-item list with the sample dictionary.
    """
    logger.info("Combining fastq and BAM files %s" % str(data["name"]))
    config = _update_config_w_custom(data["config"], data["info"])
    genome_build, sam_ref = ref_genome_info(data["info"], config,
                                            data["dirs"])

    # Default to no fastq output; only combine when explicitly requested.
    fastq1 = fastq2 = None
    if config["algorithm"].get("upload_fastq", False):
        fastq1, fastq2 = combine_fastq_files(
            data["fastq_files"], data["dirs"]["work"], config)

    sort_bam = merge_bam_files(
        data["bam_files"], data["dirs"]["work"], config)
    sample = {
        "name": data["name"],
        "metadata": data["info"].get("metadata", {}),
        "genome_build": genome_build,
        "sam_ref": sam_ref,
        "work_bam": sort_bam,
        "fastq1": fastq1,
        "fastq2": fastq2,
        "dirs": data["dirs"],
        "config": config,
        "config_file": data["config_file"],
    }
    return [[sample]]
Пример #18
0
def merge_sample(data):
    """Combine the fastq files and merge the BAM files of one sample.

    The configuration is updated with ``data["info"]`` first, and the
    genome build and reference come from ``ref_genome_info``. Both the
    fastq combination and the BAM merge run in the work directory.

    Returns the new sample dictionary wrapped in two nested lists.
    """
    logger.info("Combining fastq and BAM files %s" % str(data["name"]))
    config = _update_config_w_custom(data["config"], data["info"])
    build_and_ref = ref_genome_info(data["info"], config, data["dirs"])
    genome_build, sam_ref = build_and_ref
    fastq_pair = combine_fastq_files(data["fastq_files"],
                                     data["dirs"]["work"], config)
    fastq1, fastq2 = fastq_pair
    sort_bam = merge_bam_files(data["bam_files"], data["dirs"]["work"],
                               config)

    result = {}
    result["name"] = data["name"]
    result["genome_build"] = genome_build
    result["sam_ref"] = sam_ref
    result["work_bam"] = sort_bam
    result["fastq1"] = fastq1
    result["fastq2"] = fastq2
    result["dirs"] = data["dirs"]
    result["config"] = config
    result["config_file"] = data["config_file"]
    return [[result]]
Пример #19
0
def merge_extras(items, config):
    """Merge extra disambiguated reads into a final BAM file.

    For each entry name in the first item's ``disambiguate`` dictionary,
    the corresponding files of all items are merged into
    ``<first input>-allmerged<ext>``. BAM inputs are merged with
    ``merge.merge_bam_files``, which is given the first item; any other
    entry must be named ``summary`` and is merged with ``_merge_summary``.

    The ``disambiguate`` entry of every item is replaced in place by the
    same dictionary of merged files. ``config`` is accepted but not used
    by this function.

    Returns one single-item list per input item, or an empty list when
    ``items`` is empty.
    """
    # Nothing to merge; avoids an IndexError on items[0] below.
    if not items:
        return []

    final = {}
    for extra_name in items[0]["disambiguate"]:
        in_files = [data["disambiguate"][extra_name] for data in items]
        out_file = "%s-allmerged%s" % os.path.splitext(in_files[0])
        if in_files[0].endswith(".bam"):
            merged_file = merge.merge_bam_files(
                in_files, os.path.dirname(out_file), items[0],
                out_file=out_file)
        else:
            assert extra_name == "summary", extra_name
            merged_file = _merge_summary(in_files, out_file, items[0])
        final[extra_name] = merged_file

    out = []
    for data in items:
        # Every item shares the same dictionary of merged files.
        data["disambiguate"] = final
        out.append([data])
    return out
Пример #20
0
def merge_extras(items, config):
    """Merge extra disambiguated reads into a final BAM file.

    For each entry name in the first item's ``disambiguate`` dictionary,
    the corresponding files of all items are merged into
    ``<first input>-allmerged<ext>``. BAM inputs are merged with
    ``merge.merge_bam_files``; any other entry must be named ``summary``
    and is merged with ``_merge_summary``.

    The ``disambiguate`` entry of every item is replaced in place by the
    same dictionary of merged files.

    Returns one single-item list per input item, or an empty list when
    ``items`` is empty.
    """
    # Nothing to merge; avoids an IndexError on items[0] below.
    if not items:
        return []

    final = {}
    for extra_name in items[0]["disambiguate"]:
        in_files = [data["disambiguate"][extra_name] for data in items]
        out_file = "%s-allmerged%s" % os.path.splitext(in_files[0])
        if in_files[0].endswith(".bam"):
            # The leftover debugging print statement that used to sit here
            # was dropped: it was Python 2 only syntax.
            merged_file = merge.merge_bam_files(
                in_files, os.path.dirname(out_file), config,
                out_file=out_file)
        else:
            assert extra_name == "summary", extra_name
            merged_file = _merge_summary(in_files, out_file, items[0])
        final[extra_name] = merged_file

    out = []
    for data in items:
        # Every item shares the same dictionary of merged files.
        data["disambiguate"] = final
        out.append([data])
    return out