Example #1
def run(items, config):
    """Run third party disambiguation script, resolving into single set of calls.
    """
    assert len(items) == 2, "Can only resolve two organism disambiguation"
    # check aligner, handling tophat/tophat2 distinctions
    aligner = config["algorithm"].get("aligner")
    aligner = "tophat" if aligner.startswith("tophat") else aligner
    assert aligner in ["bwa", "tophat", "star"], "Disambiguation only supported for bwa, star and tophat alignments."
    if items[0]["disambiguate"].get("base"):
        data_a, data_b = items
    else:
        data_b, data_a = items
    work_bam_a = bam.sort(data_a["work_bam"], config, "queryname")
    work_bam_b = bam.sort(data_b["work_bam"], config, "queryname")
    out_dir = os.path.normpath(os.path.join(os.path.dirname(work_bam_a),
                                            os.pardir, "disambiguate_%s" % aligner))
    base_name = os.path.join(out_dir, os.path.splitext(os.path.basename(work_bam_a))[0])
    summary_file = "%s_summary.txt" % base_name
    if not utils.file_exists(summary_file):
        with file_transaction(items[0], out_dir) as tx_out_dir:
            Args = collections.namedtuple("Args", "A B output_dir intermediate_dir "
                                          "no_sort prefix aligner")
            args = Args(work_bam_a, work_bam_b, tx_out_dir, tx_out_dir,
                        True, "", aligner)
            disambiguate_main(args)
    data_a["disambiguate"] = \
      {data_b["genome_build"]: "%s.disambiguatedSpeciesB.bam" % base_name,
       "%s-ambiguous" % data_a["genome_build"]: "%s.ambiguousSpeciesA.bam" % base_name,
       "%s-ambiguous" % data_b["genome_build"]: "%s.ambiguousSpeciesB.bam" % base_name,
       "summary": summary_file}
    data_a["work_bam"] = bam.sort("%s.disambiguatedSpeciesA.bam" % base_name, config)
    return [[data_a]]
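Note: every example in this collection leans on a shared bam.sort helper whose implementation is not shown here. As a point of reference, a minimal sketch of the assumed behavior (samtools-backed, idempotent, returning the sorted path) could look like the code below; the name sort_bam, the suffix convention, and the cores default are illustrative, not the library's actual API.

import os
import subprocess

def sort_bam(in_bam, order="coordinate", cores=1):
    """Illustrative stand-in for the bam.sort(in_bam, config, order=...) calls above.

    Sorts with samtools, skips work when the output already exists,
    and returns the path to the sorted BAM.
    """
    suffix = ".nsorted.bam" if order == "queryname" else ".sorted.bam"
    out_bam = os.path.splitext(in_bam)[0] + suffix
    if os.path.exists(out_bam):
        return out_bam
    cmd = ["samtools", "sort", "-@", str(cores), "-o", out_bam]
    if order == "queryname":
        cmd.append("-n")  # sort by read name instead of coordinate
    cmd.append(in_bam)
    subprocess.check_call(cmd)
    return out_bam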
Example #2
def run_cplusplus(items, config):
    """Run third party disambiguation script, resolving into single set of calls.
    """
    assert len(items) == 2, "Can only resolve two organism disambiguation"
    # check aligner, handling tophat/tophat2 distinctions
    aligner = config["algorithm"].get("aligner")
    aligner = "tophat" if aligner.startswith("tophat") else aligner
    assert aligner in ["bwa", "tophat", "star"], "Disambiguation only supported for bwa, star and tophat alignments."
    if items[0]["disambiguate"].get("base"):
        data_a, data_b = items
    else:
        data_b, data_a = items
    work_bam_a = bam.sort(data_a["work_bam"], config, "queryname")
    work_bam_b = bam.sort(data_b["work_bam"], config, "queryname")
    out_dir = os.path.normpath(os.path.join(os.path.dirname(work_bam_a),
                                            os.pardir, os.pardir, "disambiguate"))
    base_name = os.path.join(out_dir, os.path.splitext(os.path.basename(work_bam_a))[0])
    summary_file = "%s_summary.txt" % base_name
    if not utils.file_exists(summary_file):
        with file_transaction(items[0], out_dir) as tx_out_dir:
            raise NotImplementedError("Still need to test and support C++ version")
            cmd = ""
            do.run(cmd.format(**locals()), "Disambiguation", data_a)
    data_a["disambiguate"] = \
      {data_b["genome_build"]: "%s.disambiguatedSpeciesB.bam" % base_name,
       "%s-ambiguous" % data_a["genome_build"]: "%s.ambiguousSpeciesA.bam" % base_name,
       "%s-ambiguous" % data_b["genome_build"]: "%s.ambiguousSpeciesB.bam" % base_name,
       "summary": summary_file}
    data_a["work_bam"] = bam.sort("%s.disambiguatedSpeciesA.bam" % base_name, config)
    return [[data_a]]
Example #3
def pipeline_summary(data):
    """Provide summary information on processing sample.

    Handles standard and CWL (single QC output) cases.
    """
    data = utils.to_single_data(data)
    if data["analysis"].startswith("wgbs-seq"):
        bismark_bam = dd.get_align_bam(data)
        sorted_bam = bam.sort(bismark_bam, data["config"])
        data = dd.set_align_bam(data, sorted_bam)
        data = dd.set_work_bam(data, bismark_bam)
    work_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    if not work_bam or not work_bam.endswith(".bam"):
        work_bam = None
    if dd.get_ref_file(data):
        if work_bam or (tz.get_in(["config", "algorithm", "kraken"],
                                  data)):  # kraken doesn't need bam
            logger.info("QC: %s %s" % (dd.get_sample_name(data), ", ".join(
                dd.get_algorithm_qc(data))))
            work_data = cwlutils.unpack_tarballs(utils.deepish_copy(data),
                                                 data)
            data["summary"] = _run_qc_tools(work_bam, work_data)
            if (len(dd.get_algorithm_qc(data)) == 1
                    and "output_cwl_keys" in data):
                data["summary"]["qc"] = data["summary"]["qc"].get(
                    dd.get_algorithm_qc(data)[0])
    return [[data]]
Example #4
def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data)
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname")
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    if file_exists(count_file):
        return count_file

    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)

    filtered_bam = bam.filter_primary(sorted_bam, data)

    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {filtered_bam}")

    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, count_file) as tx_count_file:
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    shutil.move(fixed_count_file, count_file)

    return count_file
Example #5
def dedup_bismark(data):
    """Remove alignments to the same position in the genome from the Bismark
    mapping output using deduplicate_bismark
    """
    input_file = datadict.get_work_bam(data)
    input_file = bam.sort(input_file,
                          datadict.get_config(data),
                          order="queryname")
    sample_name = datadict.get_sample_name(data)
    output_dir = os.path.join(datadict.get_work_dir(data), 'dedup',
                              sample_name)
    output_dir = utils.safe_makedir(output_dir)

    input_file_name, input_file_extension = os.path.splitext(
        os.path.basename(input_file))
    output_file = os.path.join(
        output_dir, f'{input_file_name}.deduplicated{input_file_extension}')

    if utils.file_exists(output_file):
        data = datadict.set_work_bam(data, output_file)
        return [[data]]

    deduplicate_bismark = config_utils.get_program('deduplicate_bismark',
                                                   data['config'])
    command = f'{deduplicate_bismark} --output_dir {output_dir} {input_file}'
    with transaction.file_transaction(output_dir):
        do.run(command, 'remove duplicate alignments')

    data = datadict.set_work_bam(data, output_file)
    return [[data]]
Example #6
def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data)
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname")
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file):
        return count_file

    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)

    filtered_bam = bam.filter_primary(sorted_bam, data)

    cmd = "{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} " "{paired_flag} {filtered_bam}"

    message = "Count reads in {tx_count_file} mapping to {gtf_file} using " "featureCounts"
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file, dd.get_sample_name(data), data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)

    return count_file
Example #7
def clean_chipseq_alignment(data):
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    method = dd.get_chip_method(data)
    if method == "atac":
        data = clean_ATAC(data)
    # for ATAC-seq, this will be the NF BAM
    work_bam = dd.get_work_bam(data)
    work_bam = bam.sort(work_bam, dd.get_config(data))
    bam.index(work_bam, dd.get_config(data))
    clean_bam = remove_nonassembled_chrom(work_bam, data)
    clean_bam = remove_mitochondrial_reads(clean_bam, data)
    data = atac.calculate_complexity_metrics(clean_bam, data)
    if not dd.get_keep_multimapped(data):
        clean_bam = remove_multimappers(clean_bam, data)
    if not dd.get_keep_duplicates(data):
        clean_bam = bam.remove_duplicates(clean_bam, data)
    data["work_bam"] = clean_bam
    encode_bed = tz.get_in(
        ["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = remove_blacklist_regions(dd.get_work_bam(data),
                                                    encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    try:
        data["bigwig"] = _normalized_bam_coverage(dd.get_sample_name(data),
                                                  dd.get_work_bam(data), data)
    except subprocess.CalledProcessError:
        logger.warning(f"{dd.get_work_bam(data)} was too sparse to normalize, "
                       f" falling back to non-normalized coverage.")
        data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                       dd.get_work_bam(data), data)
    return [[data]]
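The _normalized_bam_coverage helper referenced above is not part of this collection. A hedged sketch using deepTools' public bamCoverage CLI follows; -b, -o, -p, and --normalizeUsing CPM are real bamCoverage options, while the wrapper itself is illustrative.

import subprocess

def normalized_bam_coverage(bam_file, bigwig_file, cores=1):
    """Create a CPM-normalized bigWig from a coordinate-sorted, indexed BAM."""
    cmd = ["bamCoverage", "-b", bam_file, "-o", bigwig_file,
           "--normalizeUsing", "CPM", "-p", str(cores)]
    subprocess.check_call(cmd)
    return bigwig_file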
Example #8
def calculate_complexity_metrics(work_bam, data):
    """
    the work_bam should have duplicates marked but not removed
    mitochondrial reads should be removed
    """
    bedtools = config_utils.get_program("bedtools", dd.get_config(data))
    work_dir = dd.get_work_dir(data)
    metrics_dir = os.path.join(work_dir, "metrics", "atac")
    utils.safe_makedir(metrics_dir)
    metrics_file = os.path.join(
        metrics_dir, f"{dd.get_sample_name(data)}-atac-metrics.csv")
    # complexity metrics only make sense for paired-end reads
    if not bam.is_paired(work_bam):
        return data
    if utils.file_exists(metrics_file):
        data = tz.assoc_in(data, ['atac', 'complexity_metrics_file'],
                           metrics_file)
        return data
    # BAM file must be sorted by read name
    work_bam = bam.sort(work_bam, dd.get_config(data), order="queryname")
    with file_transaction(metrics_file) as tx_metrics_file:
        with open(tx_metrics_file, "w") as out_handle:
            out_handle.write("mt,m0,m1,m2\n")
        cmd = (
            f"{bedtools} bamtobed -bedpe -i {work_bam} | "
            "awk 'BEGIN{OFS=\"\\t\"}{print $1,$2,$4,$6,$9,$10}' | "
            "sort | "
            "uniq -c | "
            "awk 'BEGIN{mt=0;m0=0;m1=0;m2=0}($1==1){m1=m1+1} "
            "($1==2){m2=m2+1}{m0=m0+1}{mt=mt+$1}END{printf \"%d,%d,%d,%d\\n\", mt,m0,m1,m2}' >> "
            f"{tx_metrics_file}")
        message = f"Calculating ATAC-seq complexity metrics on {work_bam}, saving as {metrics_file}."
        do.run(cmd, message)
    data = tz.assoc_in(data, ['atac', 'complexity_metrics_file'], metrics_file)
    return data
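The awk pipeline above only tallies how often each fragment position signature occurs. For clarity, a pure-Python sketch of the same four tallies is shown below; it operates on precomputed fragment signatures instead of a live bedtools stream, and the function name is illustrative.

from collections import Counter

def complexity_metrics(fragment_positions):
    """Reproduce the mt,m0,m1,m2 tallies from the awk pipeline above.

    fragment_positions: iterable of hashable fragment signatures, e.g.
    (chrom, start, end, strand1, strand2) tuples, one per read pair.
    mt = total pairs, m0 = distinct positions, m1 = positions seen once,
    m2 = positions seen exactly twice.
    """
    counts = Counter(fragment_positions)
    mt = sum(counts.values())
    m0 = len(counts)
    m1 = sum(1 for c in counts.values() if c == 1)
    m2 = sum(1 for c in counts.values() if c == 2)
    return mt, m0, m1, m2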
Example #9
def extract_NF_regions(data):
    """
    extract the nucleosome free regions from the work_bam. These regions will
    be < 100 bases
    """
    MAX_FRAG_LENGTH = 100
    sieve = config_utils.get_program("alignmentSieve", data)
    work_bam = dd.get_work_bam(data)
    num_cores = dd.get_num_cores(data)
    out_file = os.path.splitext(work_bam)[0] + "-NF.bam"
    log_file = os.path.splitext(work_bam)[0] + "-NF.log"
    if file_exists(out_file):
        data["NF_bam"] = out_file
        return data

    with file_transaction(out_file) as tx_out_file, \
         file_transaction(log_file) as tx_log_file:
        tx_unsorted_bam = tx_out_file + ".unsorted"
        cmd = (
            f"{sieve} --bam {work_bam} --outFile {tx_unsorted_bam} --ATACshift "
            f"--numberOfProcessors {num_cores} --maxFragmentLength {MAX_FRAG_LENGTH} "
            f"--minMappingQuality 10 "
            f"--filterMetrics {tx_log_file} ")
        do.run(cmd, f"Extract NF regions from {work_bam} to {tx_unsorted_bam}.")
        # shifting can leave the file unsorted; sort into the transactional output
        sorted_bam = bam.sort(tx_unsorted_bam, dd.get_config(data))
        shutil.move(sorted_bam, tx_out_file)

    data["NF_bam"] = out_file
    return data
Example #10
def _get_files(data):
    in_file = bam.sort(data["work_bam"], data["config"], order="queryname")
    gtf_file = data["genome_resources"]["rnaseq"]["transcripts"]
    work_dir = data["dirs"].get("work", "work")
    out_dir = os.path.join(work_dir, "htseq-count")
    out_file = os.path.join(out_dir, data['rgnames']['sample']) + ".counts"
    stats_file = os.path.join(out_dir, data['rgnames']['sample']) + ".stats"
    return in_file, gtf_file, out_file, stats_file
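_get_files only assembles paths; a hedged sketch of how such a tuple is typically consumed with the public htseq-count CLI follows. The wrapper is illustrative; -f bam, -r name, and -s are real htseq-count flags, and -r name matches the queryname sort produced above.

import subprocess

def run_htseq_count(in_file, gtf_file, out_file, stranded="no"):
    """Count reads per gene with htseq-count, writing counts to out_file."""
    with open(out_file, "w") as out_handle:
        subprocess.check_call(
            ["htseq-count", "-f", "bam", "-r", "name", "-s", stranded,
             in_file, gtf_file],
            stdout=out_handle)
    return out_file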
Example #11
def _prepare_bam_file(bam_file, tmp_dir, config):
    """
    Pipe sort by name cmd in case sort by coordinates
    """
    sort_mode = _get_sort_order(bam_file, config)
    if sort_mode != "queryname":
        bam_file = sort(bam_file, config, "queryname")
    return bam_file
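_get_sort_order is not included in this collection. A plausible sketch, assuming pysam is available, reads the SO field of the BAM's @HD header line:

import pysam

def get_sort_order(bam_file):
    """Return the sort order recorded in the BAM @HD header line
    ("coordinate", "queryname", or "unsorted" when no SO tag is present)."""
    with pysam.AlignmentFile(bam_file, "rb") as handle:
        return handle.header.to_dict().get("HD", {}).get("SO", "unsorted")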
Example #12
def _get_files(data):
    mapped = bam.mapped(data["work_bam"], data["config"])
    in_file = bam.sort(mapped, data["config"], order="queryname")
    gtf_file = data["genome_resources"]["rnaseq"]["transcripts"]
    work_dir = data["dirs"].get("work", "work")
    out_dir = os.path.join(work_dir, "htseq-count")
    out_file = os.path.join(out_dir, data['rgnames']['sample']) + ".counts"
    stats_file = os.path.join(out_dir, data['rgnames']['sample']) + ".stats"
    return in_file, gtf_file, out_file, stats_file
Example #13
def _get_files(data):
    mapped = bam.mapped(data["work_bam"], data["config"])
    in_file = bam.sort(mapped, data["config"], order="queryname")
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    sample_name = dd.get_sample_name(data)
    out_file = os.path.join(out_dir, sample_name + ".counts")
    stats_file = os.path.join(out_dir, sample_name + ".stats")
    return in_file, gtf_file, out_file, stats_file
Example #14
def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data) or dd.get_align_bam(data)
    out_dir = os.path.join(dd.get_work_dir(data), "align",
                           dd.get_sample_name(data))
    if dd.get_aligner(data) == "star":
        out_dir = os.path.join(
            out_dir,
            "%s_%s" % (dd.get_sample_name(data), dd.get_aligner(data)))
    sorted_bam = bam.sort(in_bam,
                          dd.get_config(data),
                          order="queryname",
                          out_dir=safe_makedir(out_dir))
    gtf_file = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir,
                                dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file) and _is_fixed_count_file(count_file):
        return count_file

    featureCounts = config_utils.get_program("featureCounts",
                                             dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)

    filtered_bam = bam.filter_primary(sorted_bam, data)

    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {filtered_bam}")

    resources = config_utils.get_resources("featureCounts", data["config"])
    if resources:
        options = resources.get("options")
        if options:
            cmd += " %s" % " ".join([str(x) for x in options])

    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file,
                                             dd.get_sample_name(data),
                                             data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)

    return count_file
Example #15
def chipseq_count(data):
    """
    count reads mapping to ChIP/ATAC consensus peaks with featureCounts
    """
    method = dd.get_chip_method(data)
    if method == "chip":
        in_bam = dd.get_work_bam(data)
    elif method == "atac":
        in_bam = tz.get_in(("atac", "align", "NF"), data)
    out_dir = os.path.join(dd.get_work_dir(data), "align",
                           dd.get_sample_name(data))
    sorted_bam = bam.sort(in_bam,
                          dd.get_config(data),
                          order="queryname",
                          out_dir=safe_makedir(out_dir))
    consensus_file = tz.get_in(("peaks_files", "consensus", "main"), data)
    saf_file = os.path.splitext(consensus_file)[0] + ".saf"
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "consensus")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir,
                                dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file) and _is_fixed_count_file(count_file):
        if method == "atac":
            data = tz.assoc_in(data, ("peak_counts", "NF"), count_file)
        elif method == "chip":
            data = tz.assoc_in(data, ("peak_counts"), count)
        return [[data]]
    featureCounts = config_utils.get_program("featureCounts",
                                             dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)

    cmd = (
        "{featureCounts} -F SAF -a {saf_file} -o {tx_count_file} -s {strand_flag} "
        "{paired_flag} {sorted_bam}")

    message = ("Count reads in {sorted_bam} overlapping {saf_file} using "
               "featureCounts.")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file,
                                             dd.get_sample_name(data),
                                             data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)
    if method == "atac":
        data = tz.assoc_in(data, ("peak_counts", "NF"), count_file)
    elif method == "chip":
        data = tz.assoc_in(data, ("peak_counts"), count)
    return [[data]]
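featureCounts' -F SAF mode expects a five-column annotation: GeneID, Chr, Start, End, Strand, with 1-based inclusive coordinates. The saf_file above is derived from the consensus peaks; a hedged sketch of that BED-to-SAF conversion (the function name and peak IDs are illustrative):

def bed_to_saf(bed_file, saf_file):
    """Write a featureCounts SAF annotation from a BED file of peaks.

    BED starts are 0-based half-open, so the start is shifted by one
    to the 1-based inclusive coordinates SAF expects.
    """
    with open(bed_file) as in_handle, open(saf_file, "w") as out_handle:
        out_handle.write("GeneID\tChr\tStart\tEnd\tStrand\n")
        for i, line in enumerate(in_handle):
            fields = line.strip().split("\t")
            chrom, start, end = fields[0], int(fields[1]), int(fields[2])
            out_handle.write("peak_%d\t%s\t%d\t%d\t.\n" % (i + 1, chrom, start + 1, end))
    return saf_file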
Example #16
def run(items, config):
    """Run third party disambiguation script, resolving into single set of calls.
    """
    assert len(items) == 2, "Can only resolve two organism disambiguation"
    # check aligner, handling tophat/tophat2 distinctions
    aligner = config["algorithm"].get("aligner")
    aligner = "tophat" if aligner.startswith("tophat") else aligner
    assert aligner in ["bwa", "hisat2", "tophat", "star"], "Disambiguation only supported for bwa, hisat2, star and tophat alignments."
    if items[0]["disambiguate"].get("base"):
        data_a, data_b = items
    else:
        data_b, data_a = items
    work_bam_a = bam.sort(data_a["work_bam"], config, "queryname")
    work_bam_b = bam.sort(data_b["work_bam"], config, "queryname")
    if data_a.get("align_split"):
        base_dir = utils.safe_makedir(os.path.normpath(os.path.join(os.path.dirname(work_bam_a),
                                                                    os.pardir, os.pardir,
                                                                    "disambiguate_%s" % aligner)))
        out_dir = os.path.join(base_dir, "_".join([str(x) for x in data_a["align_split"].split("-")]))
    else:
        out_dir = os.path.normpath(os.path.join(os.path.dirname(work_bam_a),
                                                os.pardir, "disambiguate_%s" % aligner))
    base_name = os.path.join(out_dir, os.path.splitext(os.path.basename(work_bam_a))[0])
    summary_file = "%s_summary.txt" % base_name
    if not utils.file_exists(summary_file):
        with file_transaction(items[0], out_dir) as tx_out_dir:
            _run_cplusplus(work_bam_a, work_bam_b, tx_out_dir, aligner, os.path.basename(base_name), items)
    data_a["disambiguate"] = \
      {data_b["genome_build"]: bam.sort("%s.disambiguatedSpeciesB.bam" % base_name, config),
       "%s-ambiguous" % data_a["genome_build"]: bam.sort("%s.ambiguousSpeciesA.bam" % base_name, config),
       "%s-ambiguous" % data_b["genome_build"]: bam.sort("%s.ambiguousSpeciesB.bam" % base_name, config),
       "summary": summary_file}
    data_a["work_bam"] = bam.sort("%s.disambiguatedSpeciesA.bam" % base_name, config)
    return [[data_a]]
Example #17
def run(items, config):
    """Run third party disambiguation script, resolving into single set of calls.
    """
    assert len(items) == 2, "Can only resolve two organism disambiguation"
    # check aligner, handling tophat/tophat2 distinctions
    aligner = config["algorithm"].get("aligner")
    aligner = "tophat" if aligner.startswith("tophat") else aligner
    assert aligner in [
        "bwa", "tophat", "star"
    ], "Disambiguation only supported for bwa, star and tophat alignments."
    if items[0]["disambiguate"].get("base"):
        data_a, data_b = items
    else:
        data_b, data_a = items
    work_bam_a = bam.sort(data_a["work_bam"], config, "queryname")
    work_bam_b = bam.sort(data_b["work_bam"], config, "queryname")
    out_dir = os.path.normpath(
        os.path.join(os.path.dirname(work_bam_a), os.pardir,
                     "disambiguate_%s" % aligner))
    base_name = os.path.join(out_dir,
                             os.path.splitext(os.path.basename(work_bam_a))[0])
    summary_file = "%s_summary.txt" % base_name
    if not utils.file_exists(summary_file):
        with file_transaction(out_dir) as tx_out_dir:
            Args = collections.namedtuple(
                "Args", "A B output_dir intermediate_dir "
                "no_sort prefix aligner")
            args = Args(work_bam_a, work_bam_b, tx_out_dir, tx_out_dir, True,
                        "", aligner)
            disambiguate_main(args)
    data_a["disambiguate"] = \
      {data_b["genome_build"]: "%s.disambiguatedSpeciesB.bam" % base_name,
       "%s-ambiguous" % data_a["genome_build"]: "%s.ambiguousSpeciesA.bam" % base_name,
       "%s-ambiguous" % data_b["genome_build"]: "%s.ambiguousSpeciesB.bam" % base_name,
       "summary": summary_file}
    data_a["work_bam"] = bam.sort("%s.disambiguatedSpeciesA.bam" % base_name,
                                  config)
    return [[data_a]]
Example #18
def run_cplusplus(items, config):
    """Run third party disambiguation script, resolving into single set of calls.
    """
    assert len(items) == 2, "Can only resolve two organism disambiguation"
    # check aligner, handling tophat/tophat2 distinctions
    aligner = config["algorithm"].get("aligner")
    aligner = "tophat" if aligner.startswith("tophat") else aligner
    assert aligner in [
        "bwa", "hisat2", "tophat", "star"
    ], "Disambiguation only supported for bwa, hisat2, star and tophat alignments."
    if items[0]["disambiguate"].get("base"):
        data_a, data_b = items
    else:
        data_b, data_a = items
    work_bam_a = bam.sort(data_a["work_bam"], config, "queryname")
    work_bam_b = bam.sort(data_b["work_bam"], config, "queryname")
    out_dir = os.path.normpath(
        os.path.join(os.path.dirname(work_bam_a), os.pardir, os.pardir,
                     "disambiguate"))
    base_name = os.path.join(out_dir,
                             os.path.splitext(os.path.basename(work_bam_a))[0])
    summary_file = "%s_summary.txt" % base_name
    if not utils.file_exists(summary_file):
        with file_transaction(items[0], out_dir) as tx_out_dir:
            raise NotImplementedError(
                "Still need to test and support C++ version")
            cmd = ""
            do.run(cmd.format(**locals()), "Disambiguation", data_a)
    data_a["disambiguate"] = \
      {data_b["genome_build"]: "%s.disambiguatedSpeciesB.bam" % base_name,
       "%s-ambiguous" % data_a["genome_build"]: "%s.ambiguousSpeciesA.bam" % base_name,
       "%s-ambiguous" % data_b["genome_build"]: "%s.ambiguousSpeciesB.bam" % base_name,
       "summary": summary_file}
    data_a["work_bam"] = bam.sort("%s.disambiguatedSpeciesA.bam" % base_name,
                                  config)
    return [[data_a]]
Example #19
def _align_from_fastq(fastq1, fastq2, aligner, align_ref, sam_ref, names, align_dir, data):
    """Align from fastq inputs, producing sorted BAM output.
    """
    config = data["config"]
    align_fn = TOOLS[aligner].align_fn
    out = align_fn(fastq1, fastq2, align_ref, names, align_dir, data)
    # handle align functions that update the main data dictionary in place
    if isinstance(out, dict):
        assert "work_bam" in out
        return out
    # handle output of raw SAM files that need to be converted to BAM
    else:
        work_bam = bam.sam_to_bam(out, config)
        data["work_bam"] = bam.sort(work_bam, config)
        return data
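The TOOLS[aligner].align_fn lookup assumes a registry mapping aligner names to alignment callables with the signature used above. A minimal hedged sketch of that shape (the Tool fields and the stub are illustrative; the real registry carries more metadata):

import collections

Tool = collections.namedtuple("Tool", ["align_fn"])

def _bwa_align(fastq1, fastq2, align_ref, names, align_dir, data):
    # stub with the signature _align_from_fastq expects
    raise NotImplementedError("illustrative placeholder")

TOOLS = {"bwa": Tool(align_fn=_bwa_align)}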
Example #20
def _align_from_fastq(fastq1, fastq2, aligner, align_ref, sam_ref, names,
                      align_dir, data):
    """Align from fastq inputs, producing sorted BAM output.
    """
    config = data["config"]
    align_fn = TOOLS[aligner].align_fn
    out = align_fn(fastq1, fastq2, align_ref, names, align_dir, data)
    # handle align functions that update the main data dictionary in place
    if isinstance(out, dict):
        assert "work_bam" in out
        return out
    # handle output of raw SAM files that need to be converted to BAM
    else:
        work_bam = bam.sam_to_bam(out, config)
        data["work_bam"] = bam.sort(work_bam, config)
        return data
Example #21
def _run_meth_extractor(bam_in, sample, workdir, index_dir, config):
    """Run bismark_methylation_extractor command"""
    bismark = config_utils.get_program("bismark_methylation_extractor", config)
    cores = config["algorithm"].get('cores', 1)
    memory = config["algorithm"].get('mem', 5)
    bam_in = bam.sort(bam_in, config, order="queryname")
    cmd = "{bismark} --no_overlap --comprehensive --cytosine_report --genome_folder {index_dir} --merge_non_CpG --multicore {cores} --buffer_size {memory}G --bedGraph --gzip {bam_in}"
    out_dir = os.path.join(workdir, sample)
    mbias_file = os.path.join(
        out_dir,
        os.path.basename(splitext_plus(bam_in)[0]) + '.M-bias.txt')
    if not file_exists(mbias_file):
        with tx_tmpdir() as tx_dir:
            with chdir(tx_dir):
                do.run(cmd.format(**locals()),
                       "bismark_methylation_extractor in %s" % bam_in)
                shutil.move(tx_dir, out_dir)
    assert os.path.exists(
        mbias_file), "mbias report doesn't exist: %s" % mbias_file
    return mbias_file
Example #22
def _fix_unmapped(unmapped_file, config, names):
    """
    the unmapped.bam file from Tophat 2.0.9 is missing some things
    1) the RG tag is missing from the reads
    2) MAPQ is set to 255 instead of 0
    3) for reads where both are unmapped, the mate_is_unmapped flag is not set correctly
    """
    out_file = os.path.splitext(unmapped_file)[0] + "_fixed.bam"
    if file_exists(out_file):
        return out_file
    picard = broad.runner_from_config(config)
    rg_fixed = picard.run_fn("picard_fix_rgs", unmapped_file, names)
    fixed = bam.sort(rg_fixed, config, "queryname")
    with closing(pysam.Samfile(fixed)) as work_sam:
        with file_transaction(out_file) as tx_out_file:
            tx_out = pysam.Samfile(tx_out_file, "wb", template=work_sam)
            for read1 in work_sam:
                if not read1.is_paired:
                    if read1.is_unmapped:
                        read1.mapq = 0
                    tx_out.write(read1)
                    continue
                read2 = next(work_sam)
                if read1.qname != read2.qname:
                    continue
                if read1.is_unmapped and not read2.is_unmapped:
                    read1.mapq = 0
                    read1.tid = read2.tid
                if not read1.is_unmapped and read2.is_unmapped:
                    read2.mapq = 0
                    read2.tid = read1.tid
                if read1.is_unmapped and read2.is_unmapped:
                    read1.mapq = 0
                    read2.mapq = 0
                    read1.mate_is_unmapped = True
                    read2.mate_is_unmapped = True
                tx_out.write(read1)
                tx_out.write(read2)
            tx_out.close()

    return out_file
Example #23
def clean_ATAC(data):
    """
    extract the nucleosome free regions from the work_bam. These regions will
    be < 100 bases. This also shifts the alignments for ATAC-seq.
    """
    MAX_FRAG_LENGTH = 100
    sieve = config_utils.get_program("alignmentSieve", data)
    work_bam = dd.get_work_bam(data)
    num_cores = dd.get_num_cores(data)
    out_file = os.path.splitext(work_bam)[0] + "-NF.bam"
    log_file = os.path.splitext(work_bam)[0] + "-NF.log"
    logger.info(
        f"Selecting nucleosome free regions from {work_bam} and saving as {out_file}."
    )
    if utils.file_exists(out_file):
        data["full_bam"] = work_bam
        data["work_bam"] = out_file
        return data

    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file, \
            file_transaction(log_file) as tx_log_file:
            tx_unsorted_file = os.path.splitext(tx_out_file)[0] + ".tmp.bam"
            cmd = (
                f"{sieve} --verbose --bam {work_bam} --outFile {tx_unsorted_file} --ATACshift "
                f"--numberOfProcessors {num_cores} --maxFragmentLength {MAX_FRAG_LENGTH} "
                f"--minMappingQuality 10 "
                f"--filterMetrics {tx_log_file} ")
            do.run(
                cmd,
                f"Extract NF regions from {work_bam} to {tx_unsorted_file}.")
            # shifting can cause the file to become unsorted
            sorted_file = bam.sort(tx_unsorted_file,
                                   dd.get_config(data),
                                   force=True)
            shutil.move(sorted_file, tx_out_file)
    bam.index(out_file, dd.get_config(data))
    data["full_bam"] = work_bam
    data["work_bam"] = out_file
    return data
Example #24
def _fix_unmapped(unmapped_file, config, names):
    """
    the unmapped.bam file from Tophat 2.0.9 is missing some things
    1) the RG tag is missing from the reads
    2) MAPQ is set to 255 instead of 0
    3) for reads where both are unmapped, the mate_is_unmapped flag is not set correctly
    """
    out_file = os.path.splitext(unmapped_file)[0] + "_fixed.bam"
    if file_exists(out_file):
        return out_file
    picard = broad.runner_from_config(config)
    rg_fixed = picard.run_fn("picard_fix_rgs", unmapped_file, names)
    fixed = bam.sort(rg_fixed, config, "queryname")
    with closing(pysam.Samfile(fixed)) as work_sam:
        with file_transaction(config, out_file) as tx_out_file:
            tx_out = pysam.Samfile(tx_out_file, "wb", template=work_sam)
            for read1 in work_sam:
                if not read1.is_paired:
                    if read1.is_unmapped:
                        read1.mapq = 0
                    tx_out.write(read1)
                    continue
                read2 = next(work_sam)
                if read1.qname != read2.qname:
                    continue
                if read1.is_unmapped and not read2.is_unmapped:
                    read1.mapq = 0
                    read1.tid = read2.tid
                if not read1.is_unmapped and read2.is_unmapped:
                    read2.mapq = 0
                    read2.tid = read1.tid
                if read1.is_unmapped and read2.is_unmapped:
                    read1.mapq = 0
                    read2.mapq = 0
                    read1.mate_is_unmapped = True
                    read2.mate_is_unmapped = True
                tx_out.write(read1)
                tx_out.write(read2)
            tx_out.close()

    return out_file
Example #25
def shift_ATAC(data):
    """
    shift the ATAC-seq alignments
    """
    sieve = config_utils.get_program("alignmentSieve", data)
    work_bam = dd.get_work_bam(data)
    num_cores = dd.get_num_cores(data)
    out_file = os.path.splitext(work_bam)[0] + "-shifted.bam"
    log_file = os.path.splitext(work_bam)[0] + "-shifted.log"
    if utils.file_exists(out_file):
        data["work_bam"] = out_file
        return data

    # shifting removes all reads if the BAM file is not paired
    shiftflag = "--ATACshift" if bam.is_paired(work_bam) else ""
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file, \
            file_transaction(log_file) as tx_log_file:
            tx_unsorted_file = os.path.splitext(tx_out_file)[0] + ".tmp.bam"
            cmd = (
                f"{sieve} --verbose --bam {work_bam} --outFile {tx_unsorted_file} "
                f"{shiftflag} "
                f"--numberOfProcessors {num_cores} --maxFragmentLength 0 "
                f"--minFragmentLength 0 "
                f"--minMappingQuality 10 "
                f"--filterMetrics {tx_log_file} ")
            do.run(
                cmd,
                f"Shifting ATAC-seq alignments in {work_bam} to {tx_unsorted_file}."
            )
            # shifting can cause the file to become unsorted
            sorted_file = bam.sort(tx_unsorted_file,
                                   dd.get_config(data),
                                   force=True)
            shutil.move(sorted_file, tx_out_file)
    bam.index(out_file, dd.get_config(data))
    data["work_bam"] = out_file
    return data
Example #26
def clean_chipseq_alignment(data):
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    method = dd.get_chip_method(data)
    if method == "atac":
        data = shift_ATAC(data)
    work_bam = dd.get_work_bam(data)
    work_bam = bam.sort(work_bam, dd.get_config(data))
    bam.index(work_bam, dd.get_config(data))
    # an unfiltered BAM file is useful for calculating some metrics later
    data = tz.assoc_in(data, ['chipseq', 'align', "unfiltered"], work_bam)
    clean_bam = remove_nonassembled_chrom(work_bam, data)
    clean_bam = remove_mitochondrial_reads(clean_bam, data)
    data = atac.calculate_complexity_metrics(clean_bam, data)
    if not dd.get_keep_multimapped(data):
        clean_bam = remove_multimappers(clean_bam, data)
    if not dd.get_keep_duplicates(data):
        clean_bam = bam.remove_duplicates(clean_bam, data)
    data["work_bam"] = clean_bam
    # for ATAC-seq, break alignments into NF, mono/di/tri nucleosome BAM files
    if method == "atac":
        data = atac.split_ATAC(data)
    encode_bed = tz.get_in(
        ["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = remove_blacklist_regions(dd.get_work_bam(data),
                                                    encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    try:
        data["bigwig"] = _normalized_bam_coverage(dd.get_sample_name(data),
                                                  dd.get_work_bam(data), data)
    except subprocess.CalledProcessError:
        logger.warning(f"{dd.get_work_bam(data)} was too sparse to normalize, "
                       f" falling back to non-normalized coverage.")
        data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                       dd.get_work_bam(data), data)
    return [[data]]
Example #27
def tophat_align(fastq_file,
                 pair_file,
                 ref_file,
                 out_base,
                 align_dir,
                 data,
                 names=None):
    """
    run alignment using Tophat v2
    """
    config = data["config"]
    options = get_in(config, ("resources", "tophat", "options"), {})
    options = _set_fusion_mode(options, config)
    options = _set_quality_flag(options, data)
    options = _set_transcriptome_option(options, data, ref_file)
    options = _set_cores(options, config)
    options = _set_rg_options(options, names)
    options = _set_stranded_flag(options, config)

    ref_file, runner = _determine_aligner_and_reference(ref_file, config)

    # fusion search does not work properly with Bowtie2
    if options.get("fusion-search", False):
        ref_file = ref_file.replace("/bowtie2", "/bowtie")

    if _tophat_major_version(config) == 1:
        raise NotImplementedError(
            "Tophat versions < 2.0 are not supported, please "
            "download the newest version of Tophat here: "
            "http://tophat.cbcb.umd.edu")

    if _ref_version(ref_file) == 1 or options.get("fusion-search", False):
        options["bowtie1"] = True

    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out

    out_file = os.path.join(out_dir, "accepted_hits.bam")
    unmapped = os.path.join(out_dir, "unmapped.bam")
    files = [ref_file, fastq_file]
    if not file_exists(out_file):
        with file_transaction(config, out_dir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            if pair_file and not options.get("mate-inner-dist", None):
                d, d_stdev = _estimate_paired_innerdist(
                    fastq_file, pair_file, ref_file, out_base, tx_out_dir,
                    data)
                options["mate-inner-dist"] = d
                options["mate-std-dev"] = d_stdev
                files.append(pair_file)
            options["output-dir"] = tx_out_dir
            options["no-coverage-search"] = True
            options["no-mixed"] = True
            cmd = [sys.executable, config_utils.get_program("tophat", config)]
            for k, v in options.items():
                if v is True:
                    cmd.append("--%s" % k)
                else:
                    assert not isinstance(v, bool)
                    cmd.append("--%s=%s" % (k, v))
            # tophat requires options before arguments, otherwise it silently ignores them
            cmd += files
            do.run(cmd,
                   "Running Tophat on %s and %s." % (fastq_file, pair_file))
    if pair_file and _has_alignments(out_file):
        fixed = _fix_mates(out_file,
                           os.path.join(out_dir, "%s-align.bam" % out_base),
                           ref_file, config)
    else:
        fixed = out_file
    fixed_unmapped = _fix_unmapped(fixed, unmapped, data)
    fixed = merge_unmapped(fixed, fixed_unmapped, config)
    fixed = _add_rg(fixed, config, names)
    fixed = bam.sort(fixed, config)
    picard = broad.runner_from_path("picard", config)
    # set the contig order to match the reference file so GATK works
    fixed = picard.run_fn("picard_reorder", fixed, data["sam_ref"],
                          os.path.splitext(fixed)[0] + ".picard.bam")
    fixed = fix_insert_size(fixed, config)
    if not file_exists(final_out):
        symlink_plus(fixed, final_out)
    return final_out
Example #28
def _get_sam_file(data):
    in_file = data["work_bam"]
    config = data["config"]
    sorted_bam = bam.sort(in_file, config, "queryname")
    sam = bam.bam_to_sam(sorted_bam, config)
    return sam
Example #29
def run(items, config):
    """Run third party disambiguation script, resolving into single set of calls.
    """
    assert len(items) == 2, "Can only resolve two organism disambiguation"
    # check aligner, handling tophat/tophat2 distinctions
    aligner = config["algorithm"].get("aligner")
    if items[0]["disambiguate"].get("base"):
        data_a, data_b = items
    else:
        data_b, data_a = items

    # Construct name of sorted input files
    work_bam_a_nsorted = os.path.splitext(
        data_a["work_bam"])[0] + '.nsorted.bam'
    work_bam_b_nsorted = os.path.splitext(
        data_b["work_bam"])[0] + '.nsorted.bam'

    # logger.info('Disambiguate prep of input BAM {} and {}'.format(work_bam_a_nsorted, work_bam_b_nsorted))
    if data_a.get("align_split"):
        base_dir = utils.safe_makedir(
            os.path.normpath(
                os.path.join(os.path.dirname(work_bam_a_nsorted), os.pardir,
                             os.pardir, "disambiguate_%s" % aligner)))
        logger.info(
            'Disambiguate prep of prepped work bam BAM {} with base dir {}'.
            format(work_bam_a_nsorted, base_dir))
        split_name = "_".join(
            [str(x) for x in data_a["align_split"].split("-")])
        out_dir = os.path.join(base_dir, split_name)
        logger.info(
            'Disambiguate prep of prepped work bam BAM {} with out dir {}'.
            format(work_bam_a_nsorted, out_dir))
    else:
        out_dir = os.path.normpath(
            os.path.join(os.path.dirname(work_bam_a_nsorted), os.pardir,
                         "disambiguate_%s" % aligner))

    base_name = os.path.join(
        out_dir,
        os.path.splitext(os.path.basename(work_bam_a_nsorted))[0])
    logger.info(
        'Disambiguate prep of prepped work bam BAM {} with base name {}'.
        format(work_bam_a_nsorted, base_name))

    summary_file = "%s_summary.txt" % base_name
    explant_bam = "%s.explant.sorted.bam" % base_name
    ambiguous_bam = "%s.ambiguous.sorted.bam" % base_name
    work_bam = "%s.human.sorted.bam" % base_name

    logger.info('Disambiguate prep with work bam {}'.format(work_bam))

    logger.info(
        'Deciding if disambiguation is required. Checking for existence of {}, {}, {} and {}'
        .format(summary_file, explant_bam, ambiguous_bam, work_bam))

    if not utils.file_exists(summary_file) or not utils.file_exists(
            explant_bam) or not utils.file_exists(
                ambiguous_bam) or not utils.file_exists(work_bam):
        logger.info(
            'Disambiguating work bam a {} since outputs are not already existing'
            .format(work_bam_a_nsorted))
        work_bam_a = bam.sort(data_a["work_bam"], config, "queryname")
        work_bam_b = bam.sort(data_b["work_bam"], config, "queryname")
        logger.info('Disambiguate run with work bam a {}'.format(work_bam_a))
        logger.info('Disambiguate run with work bam b {}'.format(work_bam_b))
        with file_transaction(items[0], out_dir) as tx_out_dir:
            logger.info(
                'Disambiguate run with sorted prep work bam a {} and tx out dir {}'
                .format(work_bam_a_nsorted, tx_out_dir))
            tmp_base_name = os.path.join(tx_out_dir,
                                         os.path.basename(base_name))
            logger.info(
                'Disambiguate run with sorted prep work bam a {} and tmp_base_name {}'
                .format(work_bam_a_nsorted, tmp_base_name))
            pdx_filter = PDXFilter(
                work_bam_a,
                work_bam_b,
                "%s.human.bam" % tmp_base_name,
                # Must be bam else it will not be merged
                "%s.explant.bam" % tmp_base_name,
                # Must be bam else it will not be merged
                "%s.ambiguous.bam" % tmp_base_name,
                # Must be bam else it will not be merged
                "%s_summary.txt" % tmp_base_name,
                hard_filter=True,
                debug=True)
            pdx_filter.run()

        # Perhaps this can be removed since it has been fixed in bcbio
        if data_a.get("align_split"):
            split_dir = os.path.join(out_dir, split_name)
            logger.info(
                'Disambiguate post-run with sorted prep work bam a {} and split dir {}'
                .format(work_bam_a_nsorted, split_dir))
            if os.path.isdir(split_dir):
                for tmp_file in os.listdir(split_dir):
                    logger.info(
                        'Disambiguate post-run with sorted prep work bam a {} aiming to move file {}'
                        .format(work_bam_a_nsorted, tmp_file))
                    src = os.path.join(split_dir, tmp_file)
                    if os.path.isfile(src):
                        dest = os.path.join(out_dir, tmp_file)
                        logger.info(
                            'Disambiguate post-run with sorted prep work bam a {} moving file {} from {} to {}'
                            .format(work_bam_a_nsorted, tmp_file, src, dest))
                        shutil.move(src, dest)
                shutil.rmtree(split_dir)

        try:
            if work_bam_a != data_a["work_bam"]:
                os.remove(work_bam_a)
        except OSError:
            pass
        try:
            if work_bam_b != data_b["work_bam"]:
                os.remove(work_bam_b)
        except OSError:
            pass

    else:
        logger.info(
            'Skipping disambiguation for work bam a {} since outputs are already existing'
            .format(work_bam_a_nsorted))

    explant_bam = explant_bam if os.path.isfile(explant_bam) else bam.sort(
        "%s.explant.bam" % base_name, config)
    ambiguous_bam = ambiguous_bam if os.path.isfile(ambiguous_bam) else bam.sort(
        "%s.ambiguous.bam" % base_name, config)
    work_bam = work_bam if os.path.isfile(work_bam) else bam.sort(
        "%s.human.bam" % base_name, config)
    # logger.info('Disambiguate run with post work_bam {}'.format(work_bam))

    data_a["disambiguate"] = {
        data_b["genome_build"]: explant_bam,
        "%s-ambiguous" % data_a["genome_build"]: ambiguous_bam,
        "summary": summary_file
    }
    data_a["work_bam"] = work_bam
    try:
        os.remove("%s.explant.bam" % base_name)
    except OSError:
        pass
    try:
        os.remove("%s.human.bam" % base_name)
    except OSError:
        pass
    try:
        os.remove("%s.ambiguous.bam" % base_name)
    except OSError:
        pass

    return [[data_a]]
Example #30
def run(items, config):
    """Run third party disambiguation script, resolving into single set of calls.
    """
    assert len(items) == 2, "Can only resolve two organism disambiguation"
    # check aligner, handling tophat/tophat2 distinctions
    aligner = config["algorithm"].get("aligner")
    if items[0]["disambiguate"].get("base"):
        data_a, data_b = items
    else:
        data_b, data_a = items

    # Construct name of sorted input files
    work_bam_a_nsorted = os.path.splitext(data_a["work_bam"])[0] + '.nsorted.bam'
    work_bam_b_nsorted = os.path.splitext(data_b["work_bam"])[0] + '.nsorted.bam'

    # logger.info('Disambiguate prep of input BAM {} and {}'.format(work_bam_a_nsorted, work_bam_b_nsorted))
    if data_a.get("align_split"):
        base_dir = utils.safe_makedir(os.path.normpath(
            os.path.join(os.path.dirname(work_bam_a_nsorted), os.pardir, os.pardir,
                         "disambiguate_%s" % aligner)))
        logger.info('Disambiguate prep of prepped work bam BAM {} with base dir {}'.format(work_bam_a_nsorted, base_dir))
        split_name = "_".join([str(x) for x in data_a["align_split"].split("-")])
        out_dir = os.path.join(base_dir, split_name)
        logger.info('Disambiguate prep of prepped work bam BAM {} with out dir {}'.format(work_bam_a_nsorted, out_dir))
    else:
        out_dir = os.path.normpath(os.path.join(os.path.dirname(work_bam_a_nsorted),
                                                os.pardir,
                                                "disambiguate_%s" % aligner))

    base_name = os.path.join(out_dir,
                             os.path.splitext(os.path.basename(work_bam_a_nsorted))[0])
    logger.info('Disambiguate prep of prepped work bam BAM {} with base name {}'.format(work_bam_a_nsorted, base_name))

    summary_file = "%s_summary.txt" % base_name
    explant_bam = "%s.explant.sorted.bam" % base_name
    ambiguous_bam = "%s.ambiguous.sorted.bam" % base_name
    work_bam = "%s.human.sorted.bam" % base_name

    logger.info('Disambiguate prep with work bam {}'.format(work_bam))

    logger.info('Deciding if disambiguation is required. Checking for existence of {}, {}, {} and {}'.format(summary_file, explant_bam, ambiguous_bam, work_bam))

    if not utils.file_exists(summary_file) or not utils.file_exists(explant_bam) or not utils.file_exists(ambiguous_bam) or not utils.file_exists(work_bam):
        logger.info('Disambiguating work bam a {} since outputs are not already existing'.format(work_bam_a_nsorted))
        work_bam_a = bam.sort(data_a["work_bam"], config, "queryname")
        work_bam_b = bam.sort(data_b["work_bam"], config, "queryname")
        logger.info('Disambiguate run with work bam a {}'.format(work_bam_a))
        logger.info('Disambiguate run with work bam b {}'.format(work_bam_b))
        with file_transaction(items[0], out_dir) as tx_out_dir:
            logger.info('Disambiguate run with sorted prep work bam a {} and tx out dir {}'.format(work_bam_a_nsorted, tx_out_dir))
            tmp_base_name = os.path.join(tx_out_dir, os.path.basename(base_name))
            logger.info('Disambiguate run with sorted prep work bam a {} and tmp_base_name {}'.format(work_bam_a_nsorted, tmp_base_name))
            pdx_filter = PDXFilter(work_bam_a, work_bam_b,
                                   "%s.human.bam" % tmp_base_name,
                                   # Must be bam else it will not be merged
                                   "%s.explant.bam" % tmp_base_name,
                                   # Must be bam else it will not be merged
                                   "%s.ambiguous.bam" % tmp_base_name,
                                   # Must be bam else it will not be merged
                                   "%s_summary.txt" % tmp_base_name,
                                   hard_filter=True,
                                   debug=True)
            pdx_filter.run()

        # Perhaps this can be removed since it has been fixed in bcbio
        if data_a.get("align_split"):
            split_dir = os.path.join(out_dir, split_name)
            logger.info('Disambiguate post-run with sorted prep work bam a {} and split dir {}'.format(work_bam_a_nsorted, split_dir))
            if os.path.isdir(split_dir):
                for tmp_file in os.listdir(split_dir):
                    logger.info('Disambiguate post-run with sorted prep work bam a {} aiming to move file {}'.format(work_bam_a_nsorted, tmp_file))
                    src = os.path.join(split_dir, tmp_file)
                    if os.path.isfile(src):
                        dest = os.path.join(out_dir, tmp_file)
                        logger.info('Disambiguate post-run with sorted prep work bam a {} moving file {} from {} to {}'.format(work_bam_a_nsorted, tmp_file, src, dest))
                        shutil.move(src, dest)
                shutil.rmtree(split_dir)

        try:
            if work_bam_a != data_a["work_bam"]:
                os.remove(work_bam_a)
        except OSError:
            pass
        try:
            if work_bam_b != data_b["work_bam"]:
                os.remove(work_bam_b)
        except OSError:
            pass

    else:
        logger.info('Skipping disambiguation for work bam a {} since outputs are already existing'.format(work_bam_a_nsorted))

    explant_bam = explant_bam if os.path.isfile(explant_bam) else bam.sort("%s.explant.bam" % base_name, config)
    ambiguous_bam = ambiguous_bam if os.path.isfile(ambiguous_bam) else bam.sort("%s.ambiguous.bam" % base_name, config)
    work_bam = work_bam if os.path.isfile(work_bam) else bam.sort("%s.human.bam" % base_name, config)
    # logger.info('Disambiguate run with post work_bam {}'.format(work_bam))

    data_a["disambiguate"] = {data_b["genome_build"]: explant_bam,
                              "%s-ambiguous" % data_a["genome_build"]: ambiguous_bam,
                              "summary": summary_file}
    data_a["work_bam"] = work_bam
    try:
        os.remove("%s.explant.bam" % base_name)
    except OSError:
        pass
    try:
        os.remove("%s.human.bam" % base_name)
    except OSError:
        pass
    try:
        os.remove("%s.ambiguous.bam" % base_name)
    except OSError:
        pass

    return [[data_a]]
Example #31
def tophat_align(fastq_file,
                 pair_file,
                 ref_file,
                 out_base,
                 align_dir,
                 data,
                 names=None):
    """
    run alignment using Tophat v2
    """
    config = data["config"]
    options = get_in(config, ("resources", "tophat", "options"), {})
    options = _set_fusion_mode(options, config)
    options = _set_quality_flag(options, config)
    options = _set_transcriptome_option(options, data, ref_file)
    options = _set_cores(options, config)
    options = _set_rg_options(options, names)
    options = _set_stranded_flag(options, config)

    ref_file, runner = _determine_aligner_and_reference(ref_file, config)

    # fusion search does not work properly with Bowtie2
    if options.get("fusion-search", False):
        ref_file = ref_file.replace("/bowtie2", "/bowtie")

    if _tophat_major_version(config) == 1:
        raise NotImplementedError(
            "Tophat versions < 2.0 are not supported, please "
            "download the newest version of Tophat here: "
            "http://tophat.cbcb.umd.edu")

    if _ref_version(ref_file) == 1 or options.get("fusion-search", False):
        options["bowtie1"] = True

    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    final_out = os.path.join(out_dir, "%s.sam" % out_base)
    if file_exists(final_out):
        return final_out

    out_file = os.path.join(out_dir, "accepted_hits.sam")
    unmapped = os.path.join(out_dir, "unmapped.bam")
    files = [ref_file, fastq_file]
    if not file_exists(out_file):
        with file_transaction(out_dir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            if pair_file and not options.get("mate-inner-dist", None):
                d, d_stdev = _estimate_paired_innerdist(
                    fastq_file, pair_file, ref_file, out_base, tx_out_dir,
                    data)
                options["mate-inner-dist"] = d
                options["mate-std-dev"] = d_stdev
                files.append(pair_file)
            options["output-dir"] = tx_out_dir
            options["no-convert-bam"] = True
            options["no-coverage-search"] = True
            options["no-mixed"] = True
            tophat_runner = sh.Command(
                config_utils.get_program("tophat", config))
            ready_options = {}
            for k, v in options.items():
                ready_options[k.replace("-", "_")] = v
            # tophat requires options before arguments,
            # otherwise it silently ignores them
            tophat_ready = tophat_runner.bake(**ready_options)
            cmd = str(tophat_ready.bake(*files))
            do.run(cmd,
                   "Running Tophat on %s and %s." % (fastq_file, pair_file),
                   None)
        _fix_empty_readnames(out_file)
    if pair_file and _has_alignments(out_file):
        fixed = _fix_mates(out_file,
                           os.path.join(out_dir, "%s-align.sam" % out_base),
                           ref_file, config)
    else:
        fixed = out_file
    fixed = merge_unmapped(fixed, unmapped, config)
    fixed = _fix_unmapped(fixed, config, names)
    fixed = bam.sort(fixed, config)
    fixed = bam.bam_to_sam(fixed, config)
    if not file_exists(final_out):
        symlink_plus(fixed, final_out)
    return final_out
Example #32
def tophat_align(fastq_file, pair_file, ref_file, out_base, align_dir, data,
                 names=None):
    """
    run alignment using Tophat v2
    """
    config = data["config"]
    options = get_in(config, ("resources", "tophat", "options"), {})
    options = _set_fusion_mode(options, config)
    options = _set_quality_flag(options, config)
    options = _set_transcriptome_option(options, data, ref_file)
    options = _set_cores(options, config)
    options = _set_rg_options(options, names)
    options = _set_stranded_flag(options, config)

    ref_file, runner = _determine_aligner_and_reference(ref_file, config)

    # fusion search does not work properly with Bowtie2
    if options.get("fusion-search", False):
        ref_file = ref_file.replace("/bowtie2", "/bowtie")

    if _tophat_major_version(config) == 1:
        raise NotImplementedError("Tophat versions < 2.0 are not supported, please "
                                  "download the newest version of Tophat here: "
                                  "http://tophat.cbcb.umd.edu")

    if _ref_version(ref_file) == 1 or options.get("fusion-search", False):
        options["bowtie1"] = True

    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out

    out_file = os.path.join(out_dir, "accepted_hits.sam")
    unmapped = os.path.join(out_dir, "unmapped.bam")
    files = [ref_file, fastq_file]
    if not file_exists(out_file):
        with file_transaction(out_dir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            if pair_file and not options.get("mate-inner-dist", None):
                d, d_stdev = _estimate_paired_innerdist(fastq_file, pair_file,
                                                        ref_file, out_base,
                                                        tx_out_dir, data)
                options["mate-inner-dist"] = d
                options["mate-std-dev"] = d_stdev
                files.append(pair_file)
            options["output-dir"] = tx_out_dir
            options["no-convert-bam"] = True
            options["no-coverage-search"] = True
            options["no-mixed"] = True
            tophat_runner = sh.Command(config_utils.get_program("tophat",
                                                                config))
            ready_options = {}
            for k, v in options.items():
                ready_options[k.replace("-", "_")] = v
            # tophat requires options before arguments,
            # otherwise it silently ignores them
            tophat_ready = tophat_runner.bake(**ready_options)
            cmd = str(tophat_ready.bake(*files))
            do.run(cmd, "Running Tophat on %s and %s." % (fastq_file, pair_file), None)
        _fix_empty_readnames(out_file)
    if pair_file and _has_alignments(out_file):
        fixed = _fix_mates(out_file, os.path.join(out_dir, "%s-align.sam" % out_base),
                           ref_file, config)
    else:
        fixed = out_file
    fixed = merge_unmapped(fixed, unmapped, config)
    fixed = _fix_unmapped(fixed, config, names)
    fixed = bam.sort(fixed, config)
    picard = broad.runner_from_config(config)
    # set the contig order to match the reference file so GATK works
    fixed = picard.run_fn("picard_reorder", out_file, data["sam_ref"],
                          os.path.splitext(out_file)[0] + ".picard.bam")
    fixed = fix_insert_size(fixed, config)
    if not file_exists(final_out):
        symlink_plus(fixed, final_out)
    return final_out