Пример #1
0
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    results_file = os.path.join(results_dir, "genome_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(results_file) and not utils.file_exists(os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)

        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)

            export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
                utils.java_freetype_fix(), utils.local_path_export(), max_mem, tx_results_dir)
            cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} "
                   "--skip-duplicated --skip-dup-mode 0 "
                   "-nt {num_cores} {options}")
            species = None
            if (tz.get_in(("genome_resources", "aliases", "human"), data, "")
                  or dd.get_genome_build(data).startswith(("hg", "GRCh"))):
                species = "HUMAN"
            elif dd.get_genome_build(data).startswith(("mm", "GRCm")):
                species = "MOUSE"
            if species in ["HUMAN", "MOUSE"]:
                cmd += " -gd {species}"
            regions = (dd.get_coverage(data) if dd.get_coverage(data) not in [None, False, "None"]
                       else dd.get_variant_regions_merged(data))
            if regions:
                regions = bedutils.merge_overlaps(bedutils.clean_file(regions, data), data)
                bed6_regions = _bed_to_bed6(regions, out_dir)
                cmd += " -gff {bed6_regions}"
            bcbio_env = utils.get_bcbio_env()
            do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data), env=bcbio_env)
            tx_results_file = os.path.join(tx_results_dir, "genome_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), tx_results_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    # Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, in order
    # to keep its name after upload, we need to put  the base QC file (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {"base": base_results_file,
            "secondary": _find_qualimap_secondary_files(results_dir, base_results_file)}
Пример #2
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used alongside alignment
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                         3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        novoalign.check_samtools_version(config)
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{bwa} mem -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} "
                       "{fastq_file} {pair_file} "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(**locals())
                do.run(cmd, "bwa mem alignment from fastq: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
Пример #3
0
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                         3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        with utils.curdir_tmpdir() as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
Пример #4
0
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    results_file = os.path.join(results_dir, "genome_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(results_file) and not utils.file_exists(os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)

        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)

            export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
                utils.java_freetype_fix(), utils.local_path_export(), max_mem, tx_results_dir)
            cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} "
                   "--skip-duplicated --skip-dup-mode 0 "
                   "-nt {num_cores} {options}")
            species = None
            if (tz.get_in(("genome_resources", "aliases", "human"), data, "")
                  or dd.get_genome_build(data).startswith(("hg", "GRCh"))):
                species = "HUMAN"
            elif dd.get_genome_build(data).startswith(("mm", "GRCm")):
                species = "MOUSE"
            if species in ["HUMAN", "MOUSE"]:
                cmd += " -gd {species}"
            regions = (dd.get_coverage(data) if dd.get_coverage(data) not in [None, False, "None"]
                       else dd.get_variant_regions_merged(data))
            if regions:
                regions = bedutils.merge_overlaps(bedutils.clean_file(regions, data), data)
                bed6_regions = _bed_to_bed6(regions, out_dir)
                cmd += " -gff {bed6_regions}"
            bcbio_env = utils.get_bcbio_env()
            do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data), env=bcbio_env)
            tx_results_file = os.path.join(tx_results_dir, "genome_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), tx_results_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    # Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, in order
    # to keep its name after upload, we need to put  the base QC file (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {"base": base_results_file,
            "secondary": _find_qualimap_secondary_files(results_dir, base_results_file)}
Пример #5
0
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read alignments from input BAM file.
    """
    dedup_file = os.path.join(work_dir, "%s-dedup.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samtools = config_utils.get_program("samtools", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("samtools", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                     3, "decrease").upper()
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file) or utils.file_exists(dedup_file):
        with tx_tmpdir(data) as tmpdir:
            with file_transaction(data, sr_file) as tx_sr_file:
                with file_transaction(data, disc_file) as tx_disc_file:
                    with file_transaction(data, dedup_file) as tx_dedup_file:
                        samblaster_cl = postalign.samblaster_dedup_sort(data, tx_dedup_file,
                                                                        tx_sr_file, tx_disc_file)
                        out_base = os.path.join(tmpdir,
                                                "%s-namesort" % os.path.splitext(os.path.basename(in_bam))[0])
                        cmd = ("{samtools} sort -n -@ {cores} -m {mem} -O sam -T {out_base} {in_bam} | ")
                        cmd = cmd.format(**locals()) + samblaster_cl
                        do.run(cmd, "samblaster: split and discordant reads", data)
    for fname in [sr_file, disc_file, dedup_file]:
        bam.index(fname, data["config"])
    return dedup_file, sr_file, disc_file
Пример #6
0
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                         3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        novoalign.check_samtools_version(config)
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(**locals())
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
Пример #7
0
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1 and bam.bam_already_sorted(bam_files[0], config,
                                                      "coordinate"):
        shutil.copy(bam_files[0], out_file)
    else:
        if out_file is None:
            out_file = os.path.join(work_dir,
                                    os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file):
            sambamba = config_utils.get_program("sambamba", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease").upper()
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                bam_files = [
                    merge_bam_files(xs, work_dir, config, out_file, i)
                    for i, xs in enumerate(
                        utils.partition_all(batch_size, bam_files))
                ]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        with file_transaction(
                                config,
                                "%s.list" % os.path.splitext(out_file)[0]
                        ) as tx_bam_file_list:
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            if bam.bam_already_sorted(bam_files[0], config,
                                                      "coordinate"):
                                cmd = _sambamba_merge(bam_files)
                            else:
                                assert config.get("mark_duplicates", True)
                                cmd = _biobambam_merge_dedup()
                            do.run(
                                cmd.format(**locals()),
                                "Merge bam files to %s" %
                                os.path.basename(out_file), None)
            # Ensure timestamps are up to date on output file and index
            # Works around issues on systems with inconsistent times
            for ext in ["", ".bai"]:
                if os.path.exists(out_file + ext):
                    subprocess.check_call(["touch", out_file + ext])
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
    bam.index(out_file, config)
    return out_file
Пример #8
0
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(out_dir, "qualimapReport.html")
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(out_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)
        export = utils.local_path_export()
        cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = tz.get_in(("genome_resources", "aliases", "ensembl"), data, "")
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data))

    return _parse_qualimap_metrics(report_file)
Пример #9
0
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                         3, "decrease").upper()
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                bwa_cmd = _get_bwa_mem_cmd(data, out_file, ref_file, "-")
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa_cmd} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
Пример #10
0
def sort(in_bam, config, order="coordinate", out_dir=None):
    """Sort a BAM file, skipping if already present.
    """
    assert is_bam(in_bam), "%s in not a BAM file" % in_bam
    if bam_already_sorted(in_bam, config, order):
        return in_bam

    sort_stem = _get_sort_stem(in_bam, order, out_dir)
    sort_file = sort_stem + ".bam"
    if not utils.file_exists(sort_file):
        samtools = config_utils.get_program("samtools", config)
        cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, sort_file) as tx_sort_file:
            tx_sort_stem = os.path.splitext(tx_sort_file)[0]
            tx_dir = utils.safe_makedir(os.path.dirname(tx_sort_file))
            order_flag = "-n" if order == "queryname" else ""
            resources = config_utils.get_resources("samtools", config)
            # Slightly decrease memory and allow more accurate representation
            # in Mb to ensure fits within systems like SLURM
            mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                             1.25, "decrease", out_modifier="M").upper()
            cmd = ("{samtools} sort -@ {cores} -m {mem} -O BAM {order_flag} "
                   "-T {tx_sort_stem}-sort -o {tx_sort_file} {in_bam}")
            do.run(cmd.format(**locals()), "Sort BAM file %s: %s to %s" %
                   (order, os.path.basename(in_bam), os.path.basename(sort_file)))
    return sort_file
Пример #11
0
def sort(in_bam, config, order="coordinate", out_dir=None):
    """Sort a BAM file, skipping if already present.
    """
    assert is_bam(in_bam), "%s in not a BAM file" % in_bam
    if bam_already_sorted(in_bam, config, order):
        return in_bam

    sort_stem = _get_sort_stem(in_bam, order, out_dir)
    sort_file = sort_stem + ".bam"
    if not utils.file_exists(sort_file):
        samtools = config_utils.get_program("samtools", config)
        cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, sort_file) as tx_sort_file:
            tx_sort_stem = os.path.splitext(tx_sort_file)[0]
            tx_dir = utils.safe_makedir(os.path.dirname(tx_sort_file))
            order_flag = "-n" if order == "queryname" else ""
            resources = config_utils.get_resources("samtools", config)
            # Slightly decrease memory and allow more accurate representation
            # in Mb to ensure fits within systems like SLURM
            mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                             1.25, "decrease", out_modifier="M").upper()
            cmd = ("{samtools} sort -@ {cores} -m {mem} -O BAM {order_flag} "
                   "-T {tx_sort_stem}-sort -o {tx_sort_file} {in_bam}")
            do.run(cmd.format(**locals()), "Sort BAM file %s: %s to %s" %
                   (order, os.path.basename(in_bam), os.path.basename(sort_file)))
    return sort_file
Пример #12
0
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(
            os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"),
                                        data, []):
            logger.info("Full qualimap analysis for %s may be slow." %
                        bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)

        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)

        # Fixing the file name: MultiQC picks sample name from BAM file name.
        fixed_bam_fname = os.path.join(out_dir,
                                       dd.get_sample_name(data) + ".bam")
        if not os.path.islink(fixed_bam_fname):
            os.symlink(bam_file, fixed_bam_fname)

        export = utils.local_path_export()
        cmd = (
            "unset DISPLAY && {export} {qualimap} bamqc -bam {fixed_bam_fname} -outdir {results_dir} "
            "--skip-duplicated --skip-dup-mode 0 "
            "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = None
        if tz.get_in(("genome_resources", "aliases", "human"), data, ""):
            species = "HUMAN"
        elif any(
                tz.get_in("genome_build", data, "").startswith(k)
                for k in ["mm", "GRCm"]):
            species = "MOUSE"
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(
            dd.get_coverage(data), data) or dd.get_variant_regions_merged(data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()),
               "Qualimap: %s" % dd.get_sample_name(data))

    # return _parse_qualimap_metrics(report_file, data)
    return dict()
Пример #13
0
def _rnaseq_qualimap_cmd(data,
                         bam_file,
                         out_dir,
                         gtf_file=None,
                         library="non-strand-specific"):
    """
    Create command lines for qualimap
    """
    config = data["config"]
    qualimap = config_utils.get_program("qualimap", config)
    resources = config_utils.get_resources("qualimap", config)
    num_cores = resources.get("cores", dd.get_num_cores(data))
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                         num_cores)
    export = "%s%s" % (utils.java_freetype_fix(), utils.local_path_export())
    export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
        utils.java_freetype_fix(), utils.local_path_export(), max_mem, out_dir)
    if library != "non-strand-specific":
        logger.info(
            "Qualimap can get the orientation wrong for stranded reads, so we run it in unstranded mode. This gives comparable results to unstranded for RNA-seq data (see https://groups.google.com/forum/#!topic/qualimap/ZGo-k8LGmHQ) for a further explanation."
        )
        library = "non-strand-specific"
    paired = " --paired" if bam.is_paired(bam_file) else ""
    cmd = ("unset DISPLAY && {export} {qualimap} rnaseq -outdir {out_dir} "
           "-a proportional -bam {bam_file} -p {library}{paired} "
           "-gtf {gtf_file}").format(**locals())
    return cmd
Пример #14
0
def _run_qualimap(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    report_file = os.path.join(out_dir, "qualimapReport.html")
    if not os.path.exists(report_file):
        ds_bam = bam.downsample(bam_file, data, 1e7)
        bam_file = ds_bam if ds_bam else bam_file
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        resources = config_utils.get_resources("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)
        cmd = (
            "unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
            "-nt {num_cores} --java-mem-size={max_mem}")
        species = tz.get_in(("genome_resources", "aliases", "ensembl"), data,
                            "")
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % data["name"][-1])
    return _parse_qualimap_metrics(report_file)
Пример #15
0
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read alignments from input BAM file.
    """
    dedup_file = os.path.join(work_dir, "%s-dedup.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samtools = config_utils.get_program("samtools", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                     3, "decrease")
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file) or utils.file_exists(dedup_file):
        with utils.curdir_tmpdir() as tmpdir:
            with file_transaction(sr_file) as tx_sr_file:
                with file_transaction(disc_file) as tx_disc_file:
                    with file_transaction(dedup_file) as tx_dedup_file:
                        samblaster_cl = postalign.samblaster_dedup_sort(data, tmpdir, tx_dedup_file,
                                                                        tx_sr_file, tx_disc_file)
                        out_base = os.path.join(tmpdir, "%s-namesort" % os.path.splitext(in_bam)[0])
                        cmd = ("{samtools} sort -n -o -@ {cores} -m {mem} {in_bam} {out_base} | "
                               "{samtools} view -h - | ")
                        cmd = cmd.format(**locals()) + samblaster_cl
                        do.run(cmd, "samblaster: split and discordant reads", data)
    for fname in [sr_file, disc_file, dedup_file]:
        bam.index(fname, data["config"])
    return dedup_file, sr_file, disc_file
Пример #16
0
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(out_dir, "qualimapReport.html")
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(out_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)
        cmd = ("unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = tz.get_in(("genome_resources", "aliases", "ensembl"), data, "")
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data))

    return _parse_qualimap_metrics(report_file)
Пример #17
0
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read alignments from input BAM file.
    """
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samblaster = config_utils.get_program("samblaster", data["config"])
    sambamba = config_utils.get_program("sambamba", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                     3, "decrease")
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file):
        with file_transaction(sr_file) as tx_sr_file:
            with file_transaction(disc_file) as tx_disc_file:
                with utils.curdir_tmpdir() as tmpdir:
                    tobam_cmd = ("{sambamba} view -S -f bam -l 0 /dev/stdin | "
                                 "{sambamba} sort -t {cores} -m {mem} --tmpdir {tmpdir} "
                                 "-o {out_file} /dev/stdin")
                    splitter_cmd = tobam_cmd.format(out_file=tx_sr_file, **locals())
                    discordant_cmd = tobam_cmd.format(out_file=tx_disc_file, **locals())
                    cmd = ("{sambamba} sort -t {cores} -m {mem} --tmpdir={tmpdir} "
                           "-n -o /dev/stdout -l 0 {in_bam} | "
                           "{sambamba} view -h /dev/stdin | "
                           "{samblaster} --splitterFile >({splitter_cmd}) --discordantFile >({discordant_cmd}) "
                           "-o /dev/null")
                    do.run(cmd.format(**locals()), "samblaster: split and discordant reads", data)
    return sr_file, disc_file
Пример #18
0
def sort(in_bam, config, order="coordinate"):
    """Sort a BAM file, skipping if already present.
    """
    assert is_bam(in_bam), "%s in not a BAM file" % in_bam
    if bam_already_sorted(in_bam, config, order):
        return in_bam

    sort_stem = _get_sort_stem(in_bam, order)
    sort_file = sort_stem + ".bam"
    if not utils.file_exists(sort_file):
        sambamba = _get_sambamba(config)
        samtools = config_utils.get_program("samtools", config)
        cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, sort_file) as tx_sort_file:
            tx_sort_stem = os.path.splitext(tx_sort_file)[0]
            tx_dir = utils.safe_makedir(os.path.dirname(tx_sort_file))
            order_flag = "-n" if order == "queryname" else ""
            resources = config_utils.get_resources("samtools", config)
            mem = resources.get("memory", "2G")
            samtools_cmd = ("{samtools} sort -@ {cores} -m {mem} {order_flag} "
                            "{in_bam} {tx_sort_stem}")
            if sambamba:
                if tz.get_in(["resources", "sambamba"], config):
                    sm_resources = config_utils.get_resources(
                        "sambamba", config)
                    mem = sm_resources.get("memory", "2G")
                # sambamba uses total memory, not memory per core
                mem = config_utils.adjust_memory(mem, cores,
                                                 "increase").upper()
                # Use samtools compatible natural sorting
                # https://github.com/lomereiter/sambamba/issues/132
                order_flag = "--natural-sort" if order == "queryname" else ""
                cmd = ("{sambamba} sort -t {cores} -m {mem} {order_flag} "
                       "-o {tx_sort_file} --tmpdir={tx_dir} {in_bam}")
            else:
                cmd = samtools_cmd
            # sambamba has intermittent multicore failures. Allow
            # retries with single core
            try:
                do.run(
                    cmd.format(**locals()),
                    "Sort BAM file (multi core, %s): %s to %s" %
                    (order, os.path.basename(in_bam),
                     os.path.basename(sort_file)))
            except:
                logger.exception(
                    "Multi-core sorting failed, reverting to single core")
                resources = config_utils.get_resources("samtools", config)
                mem = resources.get("memory", "2G")
                cores = 1
                order_flag = "-n" if order == "queryname" else ""
                do.run(
                    samtools_cmd.format(**locals()),
                    "Sort BAM file (single core, %s): %s to %s" %
                    (order, os.path.basename(in_bam),
                     os.path.basename(sort_file)))
    return sort_file
Пример #19
0
def _get_cores_memory(data, downscale=2):
    """Retrieve cores and memory, using samtools as baseline.

    For memory, scaling down because we share with alignment and de-duplication.
    """
    resources = config_utils.get_resources("samtools", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"), downscale, "decrease").upper()
    return num_cores, max_mem
Пример #20
0
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    out_file = _merge_outfile_fname(out_file, bam_files, work_dir, batch)
    if not utils.file_exists(out_file):
        if len(bam_files) == 1 and bam.bam_already_sorted(
                bam_files[0], config, "coordinate"):
            with file_transaction(config, out_file) as tx_out_file:
                _create_merge_filelist(bam_files, tx_out_file, config)
                shutil.copy(bam_files[0], tx_out_file)
            samtools = config_utils.get_program("samtools", config)
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        else:
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                bam_files = [
                    merge_bam_files(xs, work_dir, config, out_file, i)
                    for i, xs in enumerate(
                        utils.partition_all(batch_size, bam_files))
                ]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        tx_bam_file_list = _create_merge_filelist(
                            bam_files, tx_out_file, config)
                        sambamba = config_utils.get_program("sambamba", config)
                        samtools = config_utils.get_program("samtools", config)
                        resources = config_utils.get_resources(
                            "samtools", config)
                        num_cores = config["algorithm"].get("num_cores", 1)
                        max_mem = config_utils.adjust_memory(
                            resources.get("memory", "1G"), 2,
                            "decrease").upper()
                        if bam.bam_already_sorted(bam_files[0], config,
                                                  "coordinate"):
                            cmd = _sambamba_merge(bam_files)
                        else:
                            assert config.get("mark_duplicates", True)
                            cmd = _biobambam_merge_dedup()
                        do.run(
                            cmd.format(**locals()), "Merge bam files to %s" %
                            os.path.basename(out_file), None)
                        do.run(
                            '{} quickcheck -v {}'.format(
                                samtools, tx_out_file),
                            "Check for valid merged BAM")
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
            _finalize_merge(out_file, bam_files, config)
    bam.index(out_file, config)
    return out_file
Пример #21
0
def _get_cores_memory(data, downscale=2):
    """Retrieve cores and memory, using samtools as baseline.

    For memory, scaling down because we share with alignment and de-duplication.
    """
    resources = config_utils.get_resources("samtools", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                         downscale, "decrease").upper()
    return num_cores, max_mem
Пример #22
0
def sort_cmd(config, tmp_dir, named_pipe=None, order="coordinate"):
    """ Get a sort command, suitable for piping
    """
    sambamba = _get_sambamba(config)
    pipe = named_pipe if named_pipe else "/dev/stdin"
    order_flag = "-n" if order == "queryname" else ""
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    mem = config_utils.adjust_memory(resources.get("memory", "2G"), 1, "decrease").upper()
    cmd = ("{sambamba} sort -m {mem} --tmpdir {tmp_dir} -t {num_cores} {order_flag} -o /dev/stdout {pipe}")
    return cmd.format(**locals())
Пример #23
0
def sort_cmd(config, tmp_dir, named_pipe=None, order="coordinate"):
    """ Get a sort command, suitable for piping
    """
    sambamba = _get_sambamba(config)
    pipe = named_pipe if named_pipe else "/dev/stdin"
    order_flag = "-n" if order == "queryname" else ""
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    mem = config_utils.adjust_memory(resources.get("memory", "2G"), 1, "decrease").upper()
    cmd = ("{sambamba} sort -m {mem} --tmpdir {tmp_dir} -t {num_cores} {order_flag} -o /dev/stdout {pipe}")
    return cmd.format(**locals())
Пример #24
0
def _rnaseq_qualimap_cmd(config, bam_file, out_dir, gtf_file=None, single_end=None):
    """
    Create command lines for qualimap
    """
    qualimap = config_utils.get_program("qualimap", config)
    resources = config_utils.get_resources("qualimap", config)
    num_cores = resources.get("cores", 1)
    max_mem = config_utils.adjust_memory(resources.get("memory", "4G"),
                                         num_cores)
    cmd = ("unset DISPLAY && {qualimap} rnaseq -outdir {out_dir} -a proportional -bam {bam_file} "
           "-gtf {gtf_file} --java-mem-size={max_mem}").format(**locals())
    return cmd
Пример #25
0
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir,
                                    os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file) or not utils.file_exists(out_file +
                                                                    ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease").upper()
            batch_size = system.open_file_limit() - 100
            if len(bam_files) > batch_size:
                bam_files = [
                    merge_bam_files(xs, work_dir, config, out_file, i)
                    for i, xs in enumerate(
                        utils.partition_all(batch_size, bam_files))
                ]
            with utils.curdir_tmpdir({"config": config}) as tmpdir:
                with utils.chdir(tmpdir):
                    merge_cl = _bamtools_merge(bam_files)
                    with file_transaction(out_file) as tx_out_file:
                        with file_transaction("%s.list" %
                                              os.path.splitext(out_file)[0]
                                              ) as tx_bam_file_list:
                            tx_out_prefix = os.path.splitext(tx_out_file)[0]
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            cmd = (
                                merge_cl + " | "
                                "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}"
                            )
                            do.run(
                                cmd.format(**locals()),
                                "Merge bam files to %s" %
                                os.path.basename(out_file), None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
Пример #26
0
def _rnaseq_qualimap_cmd(config, bam_file, out_dir, gtf_file=None, single_end=None):
    """
    Create command lines for qualimap
    """
    qualimap = config_utils.get_program("qualimap", config)
    resources = config_utils.get_resources("qualimap", config)
    num_cores = resources.get("cores", 1)
    max_mem = config_utils.adjust_memory(resources.get("memory", "4G"),
                                         num_cores)
    cmd = ("unset DISPLAY && {qualimap} rnaseq -outdir {out_dir} -a proportional -bam {bam_file} "
           "-gtf {gtf_file} --java-mem-size={max_mem}").format(**locals())
    return cmd
Пример #27
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    else:
        final_file = None
        if qual_format == "illumina":
            fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
            if pair_file:
                pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    samtools = config_utils.get_program("samtools", data["config"])
    bwa = config_utils.get_program("bwa", data["config"])
    resources = config_utils.get_resources("samtools", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used alongside alignment
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"), 3,
                                         "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or
                                            not utils.file_exists(final_file)):
        # If we cannot do piping, use older bwa aln approach
        if not can_pipe(fastq_file, data):
            return align(fastq_file, pair_file, ref_file, names, align_dir,
                         data)
        else:
            with utils.curdir_tmpdir() as work_dir:
                with file_transaction(out_file) as tx_out_file:
                    tx_out_prefix = os.path.splitext(tx_out_file)[0]
                    cmd = (
                        "{bwa} mem -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} "
                        "{fastq_file} {pair_file} "
                        "| {samtools} view -b -S -u - "
                        "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}"
                    )
                    cmd = cmd.format(**locals())
                    do.run(
                        cmd,
                        "bwa mem alignment from fastq: %s" % names["sample"],
                        None, [
                            do.file_nonempty(tx_out_file),
                            do.file_reasonable_size(tx_out_file, fastq_file)
                        ])
    data["work_bam"] = out_file
    return data
Пример #28
0
def sort(in_bam, config, order="coordinate"):
    """Sort a BAM file, skipping if already present.
    """
    assert is_bam(in_bam), "%s in not a BAM file" % in_bam
    if bam_already_sorted(in_bam, config, order):
        return in_bam

    sort_stem = _get_sort_stem(in_bam, order)
    sort_file = sort_stem + ".bam"
    if not utils.file_exists(sort_file):
        sambamba = _get_sambamba(config)
        samtools = config_utils.get_program("samtools", config)
        cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, sort_file) as tx_sort_file:
            tx_sort_stem = os.path.splitext(tx_sort_file)[0]
            tx_dir = utils.safe_makedir(os.path.dirname(tx_sort_file))
            order_flag = "-n" if order == "queryname" else ""
            resources = config_utils.get_resources("samtools", config)
            mem = resources.get("memory", "2G")
            samtools_cmd = ("{samtools} sort -@ {cores} -m {mem} {order_flag} "
                            "{in_bam} {tx_sort_stem}")
            if sambamba:
                if tz.get_in(["resources", "sambamba"], config):
                    sm_resources = config_utils.get_resources("sambamba", config)
                    mem = sm_resources.get("memory", "2G")
                # sambamba uses total memory, not memory per core
                mem = config_utils.adjust_memory(mem, cores, "increase").upper()
                # Use samtools compatible natural sorting
                # https://github.com/lomereiter/sambamba/issues/132
                order_flag = "--natural-sort" if order == "queryname" else ""
                cmd = ("{sambamba} sort -t {cores} -m {mem} {order_flag} "
                       "-o {tx_sort_file} --tmpdir={tx_dir} {in_bam}")
            else:
                cmd = samtools_cmd
            # sambamba has intermittent multicore failures. Allow
            # retries with single core
            try:
                do.run(cmd.format(**locals()),
                       "Sort BAM file (multi core, %s): %s to %s" %
                       (order, os.path.basename(in_bam),
                        os.path.basename(sort_file)))
            except:
                logger.exception("Multi-core sorting failed, reverting to single core")
                resources = config_utils.get_resources("samtools", config)
                mem = resources.get("memory", "2G")
                cores = 1
                order_flag = "-n" if order == "queryname" else ""
                do.run(samtools_cmd.format(**locals()),
                       "Sort BAM file (single core, %s): %s to %s" %
                       (order, os.path.basename(in_bam),
                        os.path.basename(sort_file)))
    return sort_file
Пример #29
0
def _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file=None, single_end=None, library="non-strand-specific"):
    """
    Create command lines for qualimap
    """
    config = data["config"]
    qualimap = config_utils.get_program("qualimap", config)
    resources = config_utils.get_resources("qualimap", config)
    num_cores = resources.get("cores", dd.get_num_cores(data))
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                         num_cores)
    cmd = ("unset DISPLAY && {qualimap} rnaseq -outdir {out_dir} -a proportional -bam {bam_file} -p {library} "
           "-gtf {gtf_file} --java-mem-size={max_mem}").format(**locals())
    return cmd
Пример #30
0
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)

        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)

        # Fixing the file name: MultiQC picks sample name from BAM file name.
        fixed_bam_fname = os.path.join(out_dir, dd.get_sample_name(data) + ".bam")
        if not os.path.islink(fixed_bam_fname):
            os.symlink(bam_file, fixed_bam_fname)

        export = utils.local_path_export()
        cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {fixed_bam_fname} -outdir {results_dir} "
               "--skip-duplicated --skip-dup-mode 0 "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = None
        if tz.get_in(("genome_resources", "aliases", "human"), data, ""):
            species = "HUMAN"
        elif any(tz.get_in("genome_build", data, "").startswith(k) for k in ["mm", "GRCm"]):
            species = "MOUSE"
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_coverage(data), data) or dd.get_variant_regions_merged(data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data))

    # return _parse_qualimap_metrics(report_file, data)
    return dict()
Пример #31
0
def merge_bam_files(bam_files, work_dir, data, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    out_file = _merge_outfile_fname(out_file, bam_files, work_dir, batch)
    if not utils.file_exists(out_file):
        if len(bam_files) == 1 and bam.bam_already_sorted(
                bam_files[0], data["config"], "coordinate"):
            with file_transaction(data, out_file) as tx_out_file:
                _create_merge_filelist(bam_files, tx_out_file, data["config"])
            out_file = bam_files[0]
            samtools = config_utils.get_program("samtools", data["config"])
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        else:
            with tx_tmpdir(data) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(data, out_file) as tx_out_file:
                        tx_bam_file_list = _create_merge_filelist(
                            bam_files, tx_out_file, data["config"])
                        sambamba = config_utils.get_program(
                            "sambamba", data["config"])
                        samtools = config_utils.get_program(
                            "samtools", data["config"])
                        resources = config_utils.get_resources(
                            "samtools", data["config"])
                        num_cores = dd.get_num_cores(data)
                        # Aim for 3.5Gb/core memory for BAM merging
                        num_cores = config_utils.adjust_cores_to_mb_target(
                            3500, resources.get("memory", "2G"), num_cores)
                        max_mem = config_utils.adjust_memory(
                            resources.get("memory", "1G"), 2,
                            "decrease").upper()
                        if dd.get_mark_duplicates(data):
                            cmd = _biobambam_merge_dedup_maxcov(data)
                        else:
                            cmd = _biobambam_merge_maxcov(data)
                        do.run(
                            cmd.format(**locals()), "Merge bam files to %s" %
                            os.path.basename(out_file), None)
                        do.run(
                            '{} quickcheck -v {}'.format(
                                samtools, tx_out_file),
                            "Check for valid merged BAM")
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
            _finalize_merge(out_file, bam_files, data["config"])
    bam.index(out_file, data["config"])
    return out_file
Пример #32
0
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1:
        bam.index(bam_files[0], config)
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file):
            sambamba = config_utils.get_program("sambamba", config)
            samtools = config_utils.get_program("samtools", config)
            samblaster = config_utils.get_program("samblaster", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease").upper()
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        with file_transaction(config, "%s.list" % os.path.splitext(out_file)[0]) as tx_bam_file_list:
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            if bam.bam_already_sorted(bam_files[0], config, "coordinate"):
                                cmd = _sambamba_merge(bam_files)
                            else:
                                assert config.get("mark_duplicates", True)
                                cmd = _biobambam_merge_dedup()
                            do.run(cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file),
                                   None)
            # Ensure timestamps are up to date on output file and index
            # Works around issues on systems with inconsistent times
            for ext in ["", ".bai"]:
                if os.path.exists(out_file + ext):
                    subprocess.check_call(["touch", out_file + ext])
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
Пример #33
0
def _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file=None, single_end=None, library="non-strand-specific"):
    """
    Create command lines for qualimap
    """
    config = data["config"]
    qualimap = config_utils.get_program("qualimap", config)
    resources = config_utils.get_resources("qualimap", config)
    num_cores = resources.get("cores", dd.get_num_cores(data))
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                         num_cores)
    export = "%s%s" % (utils.java_freetype_fix(), utils.local_path_export())
    cmd = ("unset DISPLAY && {export} {qualimap} rnaseq -outdir {out_dir} "
           "-a proportional -bam {bam_file} -p {library} "
           "-gtf {gtf_file} --java-mem-size={max_mem}").format(**locals())
    return cmd
Пример #34
0
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    out_file = _merge_outfile_fname(out_file, bam_files, work_dir, batch)
    if not utils.file_exists(out_file):
        if len(bam_files) == 1 and bam.bam_already_sorted(bam_files[0], config, "coordinate"):
            with file_transaction(config, out_file) as tx_out_file:
                _create_merge_filelist(bam_files, tx_out_file, config)
                shutil.copy(bam_files[0], tx_out_file)
            samtools = config_utils.get_program("samtools", config)
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        else:
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        tx_bam_file_list = _create_merge_filelist(bam_files, tx_out_file, config)
                        sambamba = config_utils.get_program("sambamba", config)
                        samtools = config_utils.get_program("samtools", config)
                        resources = config_utils.get_resources("samtools", config)
                        num_cores = config["algorithm"].get("num_cores", 1)
                        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                             2, "decrease").upper()
                        if bam.bam_already_sorted(bam_files[0], config, "coordinate"):
                            cmd = _sambamba_merge(bam_files)
                        else:
                            # Aim for 3.5Gb/core memory for BAM merging
                            num_cores = config_utils.adjust_cores_to_mb_target(
                                3500, resources.get("memory", "2G"), num_cores)
                            assert config.get("mark_duplicates", True)
                            cmd = _biobambam_merge_dedup()
                        do.run(cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file),
                                None)
                        do.run('{} quickcheck -v {}'.format(samtools, tx_out_file),
                               "Check for valid merged BAM")
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
            _finalize_merge(out_file, bam_files, config)
    bam.index(out_file, config)
    return out_file
Пример #35
0
def _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file=None, library="non-strand-specific"):
    """
    Create command lines for qualimap
    """
    config = data["config"]
    qualimap = config_utils.get_program("qualimap", config)
    resources = config_utils.get_resources("qualimap", config)
    num_cores = resources.get("cores", dd.get_num_cores(data))
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                         num_cores)
    export = "%s%s" % (utils.java_freetype_fix(), utils.local_path_export())
    export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
        utils.java_freetype_fix(), utils.local_path_export(), max_mem, out_dir)
    paired = " --paired" if bam.is_paired(bam_file) else ""
    cmd = ("unset DISPLAY && {export} {qualimap} rnaseq -outdir {out_dir} "
           "-a proportional -bam {bam_file} -p {library}{paired} "
           "-gtf {gtf_file}").format(**locals())
    return cmd
Пример #36
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    else:
        final_file = None
        if qual_format == "illumina":
            fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
            if pair_file:
                pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    samtools = config_utils.get_program("samtools", data["config"])
    bwa = config_utils.get_program("bwa", data["config"])
    resources = config_utils.get_resources("samtools", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used alongside alignment
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                         3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        # If we cannot do piping, use older bwa aln approach
        if not can_pipe(fastq_file, data):
            return align(fastq_file, pair_file, ref_file, names, align_dir, data)
        else:
            with utils.curdir_tmpdir() as work_dir:
                with file_transaction(out_file) as tx_out_file:
                    tx_out_prefix = os.path.splitext(tx_out_file)[0]
                    cmd = ("{bwa} mem -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} "
                           "{fastq_file} {pair_file} "
                           "| {samtools} view -b -S -u - "
                           "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                    cmd = cmd.format(**locals())
                    do.run(cmd, "bwa mem alignment from fastq: %s" % names["sample"], None,
                           [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, fastq_file)])
    data["work_bam"] = out_file
    return data
Пример #37
0
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease")
            batch_size = system.open_file_limit() - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with utils.curdir_tmpdir({"config": config}) as tmpdir:
                with utils.chdir(tmpdir):
                    merge_cl = _bamtools_merge(bam_files)
                    with file_transaction(out_file) as tx_out_file:
                        with file_transaction("%s.list" % os.path.splitext(out_file)[0]) as tx_bam_file_list:
                            tx_out_prefix = os.path.splitext(tx_out_file)[0]
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            cmd = (merge_cl + " | "
                                   "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                            do.run(cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file),
                                   None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
Пример #38
0
def merge_bam_files(bam_files, work_dir, data, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    out_file = _merge_outfile_fname(out_file, bam_files, work_dir, batch)
    if not utils.file_exists(out_file):
        if len(bam_files) == 1 and bam.bam_already_sorted(bam_files[0], data["config"], "coordinate"):
            with file_transaction(data, out_file) as tx_out_file:
                _create_merge_filelist(bam_files, tx_out_file, data["config"])
            out_file = bam_files[0]
            samtools = config_utils.get_program("samtools", data["config"])
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        else:
            with tx_tmpdir(data) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(data, out_file) as tx_out_file:
                        tx_bam_file_list = _create_merge_filelist(bam_files, tx_out_file, data["config"])
                        samtools = config_utils.get_program("samtools", data["config"])
                        resources = config_utils.get_resources("samtools", data["config"])
                        num_cores = dd.get_num_cores(data)
                        # Aim for 3.5Gb/core memory for BAM merging
                        num_cores = config_utils.adjust_cores_to_mb_target(
                            3500, resources.get("memory", "2G"), num_cores)
                        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                             2, "decrease").upper()
                        if dd.get_mark_duplicates(data):
                            cmd = _biobambam_merge_dedup_maxcov(data)
                        else:
                            cmd = _biobambam_merge_maxcov(data)
                        do.run(cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file),
                                None)
                        do.run('{} quickcheck -v {}'.format(samtools, tx_out_file),
                               "Check for valid merged BAM")
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
            _finalize_merge(out_file, bam_files, data["config"])
    bam.index(out_file, data["config"])
    return out_file
Пример #39
0
def _run_qualimap(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    report_file = os.path.join(out_dir, "qualimapReport.html")
    if not os.path.exists(report_file):
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        resources = config_utils.get_resources("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)
        cmd = ("unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem}")
        species = data["genome_resources"]["aliases"].get("ensembl", "").upper()
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = data["config"]["algorithm"].get("variant_regions")
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % data["name"][-1])
    return _parse_qualimap_metrics(report_file)