Example #1
def call_consensus(samples):
    """
    call consensus peaks on the narrowPeak/broadPeak files from a set of
    ChIP/ATAC samples
    """
    data = samples[0][0]
    new_samples = []
    consensusdir = os.path.join(dd.get_work_dir(data), "consensus")
    utils.safe_makedir(consensusdir)
    peakfiles = []
    for data in dd.sample_data_iterator(samples):
        if dd.get_chip_method(data) == "chip":
            for fn in tz.get_in(("peaks_files", "macs2"), data, []):
                if "narrowPeak" in fn:
                    peakfiles.append(fn)
                    break
                elif "broadPeak" in fn:
                    peakfiles.append(fn)
                    break
        elif dd.get_chip_method(data) == "atac":
            for fn in tz.get_in(("peaks_files", "NF", "macs2"), data, []):
                if "narrowPeak" in fn:
                    peakfiles.append(fn)
    consensusfile = os.path.join(consensusdir, "consensus.bed")
    if not peakfiles:
        logger.info(
            "No suitable peak files found, skipping consensus peak calling.")
        return samples
    consensusfile = consensus(peakfiles, consensusfile, data)
    for data in dd.sample_data_iterator(samples):
        new_samples.append([
            tz.assoc_in(data, ("peaks_files", "consensus"),
                        {"main": consensusfile})
        ])
    return new_samples
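The consensus() helper used above is defined elsewhere in bcbio; as a rough illustration of the idea, a minimal hypothetical stand-in that unions overlapping intervals across the collected peak files (a simplification of bcbio's actual merging strategy) could look like this:

def naive_consensus(peakfiles, consensusfile):
    """Union all peak intervals across files into one merged BED.

    Simplified, hypothetical stand-in for bcbio's consensus() helper,
    which applies a more careful merging strategy.
    """
    intervals = []
    for fn in peakfiles:
        with open(fn) as in_handle:
            for line in in_handle:
                chrom, start, end = line.split("\t")[:3]
                intervals.append((chrom, int(start), int(end)))
    intervals.sort()
    merged = []
    for chrom, start, end in intervals:
        if merged and merged[-1][0] == chrom and start <= merged[-1][2]:
            merged[-1][2] = max(merged[-1][2], end)
        else:
            merged.append([chrom, start, end])
    with open(consensusfile, "w") as out_handle:
        for chrom, start, end in merged:
            out_handle.write(f"{chrom}\t{start}\t{end}\n")
    return consensusfile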
Example #2
def calling(data):
    """Main function to parallelize peak calling."""
    method = dd.get_chip_method(data)
    caller_fn = get_callers()[data["peak_fn"]]
    if method == "chip":
        chip_bam = data.get("work_bam")
        input_bam = data.get("work_bam_input", None)
        name = dd.get_sample_name(data)
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
        out_files = caller_fn(name, chip_bam, input_bam, dd.get_genome_build(data), out_dir,
                            dd.get_chip_method(data), data["resources"], data)
        greylistdir = greylisting(data)
        data.update({"peaks_files": out_files})
        if greylistdir:
            data["greylist"] = greylistdir
    if method == "atac":
        fractions = list(ATACRanges.keys()) + ["full"]
        for fraction in fractions:
            MIN_READS_TO_CALL = 1000
            chip_bam = tz.get_in(("atac", "align", fraction), data)
            if not bam.has_nalignments(chip_bam, MIN_READS_TO_CALL, data):
                logger.warn(f"{chip_bam} has less than {MIN_READS_TO_CALL}, peak calling will fail so skip this fraction.")
                continue
            logger.info(f"Running peak calling with {data['peak_fn']} on the {fraction} fraction of {chip_bam}.")
            name = dd.get_sample_name(data) + f"-{fraction}"
            out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
            out_files = caller_fn(name, chip_bam, None, dd.get_genome_build(data), out_dir,
                                  dd.get_chip_method(data), data["resources"], data)
            data = tz.assoc_in(data, ("peaks_files", fraction), out_files)
    return [[data]]
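bam.has_nalignments guards against calling peaks on nearly empty fractions; a hedged sketch of such a check with pysam (the real bcbio implementation may differ) is:

import pysam

def has_nalignments(bam_file, n, data=None):
    """Return True once bam_file is seen to contain at least n aligned reads.

    Hypothetical sketch: stops iterating as soon as the threshold is hit,
    so it stays cheap on large BAM files.
    """
    count = 0
    with pysam.AlignmentFile(bam_file, "rb") as in_bam:
        for read in in_bam:
            if not read.is_unmapped:
                count += 1
                if count >= n:
                    return True
    return False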
Example #3
def calling(data):
    """Main function to parallelize peak calling."""
    method = dd.get_chip_method(data)
    caller_fn = get_callers()[data["peak_fn"]]
    if method == "chip":
        chip_bam = data.get("work_bam")
        input_bam = data.get("work_bam_input", None)
        name = dd.get_sample_name(data)
        out_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
        out_files = caller_fn(name, chip_bam, input_bam,
                              dd.get_genome_build(data), out_dir,
                              dd.get_chip_method(data), data["resources"],
                              data)
        greylistdir = greylisting(data)
        data.update({"peaks_files": out_files})
        if greylistdir:
            data["greylist"] = greylistdir
    if method == "atac":
        for fraction in atac.ATACRanges.keys():
            chip_bam = tz.get_in(("atac", "align", fraction), data)
            logger.info(
                f"Running peak calling with {data['peak_fn']} on the {fraction} fraction of {chip_bam}."
            )
            name = dd.get_sample_name(data) + f"-{fraction}"
            out_dir = utils.safe_makedir(
                os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
            out_files = caller_fn(name, chip_bam, None,
                                  dd.get_genome_build(data), out_dir,
                                  dd.get_chip_method(data), data["resources"],
                                  data)
            data = tz.assoc_in(data, ("peaks_files", fraction), out_files)
    return [[data]]
Example #4
def create_peaktable(samples):
    """create a table of peak counts per sample to use with differential peak calling
    """
    data = dd.get_data_from_sample(samples[0])
    peakcounts = []
    out_dir = os.path.join(dd.get_work_dir(data), "consensus")
    out_file = os.path.join(out_dir, "consensus-counts.tsv")
    if dd.get_chip_method(data) == "chip":
        for data in dd.sample_data_iterator(samples):
            peakcounts.append(tz.get_in(("peak_counts",), data))
    elif dd.get_chip_method(data) == "atac":
        for data in dd.sample_data_iterator(samples):
            if bam.is_paired(dd.get_work_bam(data)):
                peakcounts.append(tz.get_in(("peak_counts", "NF"), data))
            else:
                logger.info(f"Creating peak table from full BAM file because "
                            f"{dd.get_work_bam(data)} is single-ended.")
                peakcounts.append(tz.get_in(("peak_counts", "full"), data))
    combined_peaks = count.combine_count_files(peakcounts,
                                               out_file,
                                               ext=".counts")
    new_data = []
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peak_counts", "peaktable"), combined_peaks)
        new_data.append(data)
    new_samples = dd.get_samples_from_datalist(new_data)
    return new_samples
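count.combine_count_files joins the per-sample count files into the single consensus-counts.tsv; a hypothetical minimal version with pandas, assuming each input is a two-column (peak id, count) tab-separated file with a header line and named after the sample:

import os
import pandas as pd

def combine_count_files(count_files, out_file, ext=".counts"):
    """Outer-join per-sample count files into one peaks-by-samples TSV."""
    combined = None
    for fn in count_files:
        # the sample name is assumed to be the file name minus the extension
        sample = os.path.basename(fn).replace(ext, "")
        counts = pd.read_csv(fn, sep="\t", index_col=0, header=0)
        counts.columns = [sample]
        combined = counts if combined is None else combined.join(counts, how="outer")
    combined.to_csv(out_file, sep="\t")
    return out_file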
Example #5
def _maybe_add_peaks(algorithm, sample, out):
    out_dir = sample.get("peaks_files", {})
    if dd.get_chip_method(sample) == "atac":
        for files in out_dir.values():
            for caller in files:
                if caller == "main":
                    continue
                for fn in files[caller]:
                    if os.path.exists(fn):
                        out.append({
                            "path": fn,
                            "dir": caller,
                            "ext": utils.splitext_plus(fn)[1]
                        })
    else:
        for caller in out_dir:
            if caller == "main":
                continue
            for fn in out_dir[caller]:
                if os.path.exists(fn):
                    out.append({
                        "path": fn,
                        "dir": caller,
                        "ext": utils.splitext_plus(fn)[1]
                    })
    return out
Example #6
def run_ataqv(data):
    if not dd.get_chip_method(data) == "atac":
        return None
    work_dir = dd.get_work_dir(data)
    sample_name = dd.get_sample_name(data)
    out_dir = os.path.join(work_dir, "qc", sample_name, "ataqv")
    peak_file = get_full_peaks(data)
    bam_file = get_unfiltered_bam(data)
    out_file = os.path.join(out_dir, sample_name + ".ataqv.json.gz")
    if not peak_file:
        logger.info(f"Full peak file for {sample_name} not found, skipping ataqv")
        return None
    if not bam_file:
        logger.info(f"Unfiltered BAM file for {sample_name} not found, skipping ataqv")
        return None
    if utils.file_exists(out_file):
        return out_file
    tss_bed_file = os.path.join(out_dir, "TSS.bed")
    tss_bed_file = gtf.get_tss_bed(dd.get_gtf_file(data), tss_bed_file, data, padding=1000)
    autosomal_reference = os.path.join(out_dir, "autosomal.txt")
    autosomal_reference = _make_autosomal_reference_file(autosomal_reference, data)
    ataqv = config_utils.get_program("ataqv", data)
    mitoname = chromhacks.get_mitochondrial_chroms(data)[0]
    if not ataqv:
        logger.info(f"ataqv executable not found, skipping running ataqv.")
        return None
    with file_transaction(out_file) as tx_out_file:
        cmd = (f"{ataqv} --peak-file {peak_file} --name {sample_name} --metrics-file {tx_out_file} "
               f"--tss-file {tss_bed_file} --autosomal-reference-file {autosomal_reference} "
               f"--ignore-read-groups --mitochondrial-reference-name {mitoname} "
               f"None {bam_file}")
        message = f"Running ataqv on {sample_name}."
        do.run(cmd, message)
    return out_file
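_make_autosomal_reference_file is not shown here; ataqv expects a plain list of autosome names, one per line. A hedged sketch that derives them from a samtools .fai index, assuming chromosomes are named like "chr1" or "1" so sex chromosomes, chrM and unplaced contigs get dropped:

def make_autosomal_reference_file(out_file, fai_file):
    """Write one autosome name per line for ataqv's --autosomal-reference-file."""
    with open(fai_file) as in_handle, open(out_file, "w") as out_handle:
        for line in in_handle:
            chrom = line.split("\t")[0]
            # keep only purely numeric names, with or without a "chr" prefix
            if chrom.replace("chr", "", 1).isdigit():
                out_handle.write(chrom + "\n")
    return out_file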
Example #7
def calling(data):
    """Main function to parallelize peak calling."""
    chip_bam = dd.get_work_bam(data)
    input_bam = data.get("work_bam_input", None)
    caller_fn = get_callers()[data["peak_fn"]]
    name = dd.get_sample_name(data)
    out_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
    encode_bed = tz.get_in(
        ["genome_resources", "variation", "encode_blacklist"], data)
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    if encode_bed:
        chip_bam = _prepare_bam(chip_bam, encode_bed, data['config'])
        data["work_bam_filter"] = chip_bam
        input_bam = _prepare_bam(input_bam, encode_bed, data['config'])
        data["input_bam_filter"] = input_bam
    out_files = caller_fn(name, chip_bam, input_bam, dd.get_genome_build(data),
                          out_dir, dd.get_chip_method(data), data["resources"],
                          data["config"])
    greylistdir = greylisting(data)
    data.update({"peaks_files": out_files})
    if greylistdir:
        data["greylist"] = greylistdir
    return [[data]]
    data["input_bam_filter"] = input_bam
Example #8
def clean_chipseq_alignment(data):
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    method = dd.get_chip_method(data)
    if method == "atac":
        data = clean_ATAC(data)
    # for ATAC-seq, this will be the NF BAM
    work_bam = dd.get_work_bam(data)
    work_bam = bam.sort(work_bam, dd.get_config(data))
    bam.index(work_bam, dd.get_config(data))
    clean_bam = remove_nonassembled_chrom(work_bam, data)
    clean_bam = remove_mitochondrial_reads(clean_bam, data)
    data = atac.calculate_complexity_metrics(clean_bam, data)
    if not dd.get_keep_multimapped(data):
        clean_bam = remove_multimappers(clean_bam, data)
    if not dd.get_keep_duplicates(data):
        clean_bam = bam.remove_duplicates(clean_bam, data)
    data["work_bam"] = clean_bam
    encode_bed = tz.get_in(
        ["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = remove_blacklist_regions(dd.get_work_bam(data),
                                                    encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    try:
        data["bigwig"] = _normalized_bam_coverage(dd.get_sample_name(data),
                                                  dd.get_work_bam(data), data)
    except subprocess.CalledProcessError:
        logger.warning(f"{dd.get_work_bam(data)} was too sparse to normalize, "
                       f" falling back to non-normalized coverage.")
        data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                       dd.get_work_bam(data), data)
    return [[data]]
Example #9
def _normalized_bam_coverage(name, bam_input, data):
    """Run bamCoverage from deeptools but produce normalized bigWig files"""
    cmd = ("{bam_coverage} --bam {bam_input} --outFileName {bw_output} "
           "--binSize 20 --effectiveGenomeSize {size} "
           "--smoothLength 60 --extendReads 150 --centerReads -p {cores} ")
    size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
    cores = dd.get_num_cores(data)
    try:
        bam_coverage = config_utils.get_program("bamCoverage", data)
    except config_utils.CmdNotFound:
        logger.info("No bamCoverage found, skipping bamCoverage.")
        return None
    method = dd.get_chip_method(data)
    cmd += "--normalizeUsing CPM "
    toignore = get_mitochondrial_chroms(data)
    if toignore:
        ignorenormflag = f"--ignoreForNormalization {' '.join(toignore)} "
        cmd += ignorenormflag
    resources = config_utils.get_resources("bamCoverage", data["config"])
    if resources:
        options = resources.get("options")
        if options:
            cmd += " %s" % " ".join([str(x) for x in options])
    bw_output = os.path.join(os.path.dirname(bam_input), "%s.bw" % name)
    if utils.file_exists(bw_output):
        return bw_output
    with file_transaction(bw_output) as out_tx:
        do.run(cmd.format(**locals()), "Run bamCoverage in %s" % name)
    return bw_output
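get_mitochondrial_chroms comes from bcbio's chromhacks module and supplies the names excluded from CPM normalization; a hypothetical equivalent that scans a FASTA .fai index for mitochondrial-looking names (note the real helper takes the sample data dict rather than an index path):

def get_mitochondrial_chroms(fai_file):
    """Return chromosome names that look mitochondrial (chrM, MT, chrMT, M)."""
    mito = {"chrm", "mt", "chrmt", "m"}
    with open(fai_file) as in_handle:
        return [line.split("\t")[0] for line in in_handle
                if line.split("\t")[0].lower() in mito]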
Example #10
def call_consensus(samples):
    """
    call consensus peaks on the narrowPeak files from a set of
    ChIP/ATAC samples
    """
    data = samples[0][0]
    new_samples = []
    consensusdir = os.path.join(dd.get_work_dir(data), "consensus")
    utils.safe_makedir(consensusdir)
    peakfiles = []
    for data in dd.sample_data_iterator(samples):
        if dd.get_chip_method(data) == "chip":
            for fn in tz.get_in(("peaks_files", "macs2"), data, []):
                if "narrowPeak" in fn:
                    peakfiles.append(fn)
                elif "broadPeak" in fn:
                    peakfiles.append(fn)
        elif dd.get_chip_method(data) == "atac":
            if bam.is_paired(dd.get_work_bam(data)):
                for fn in tz.get_in(("peaks_files", "NF", "macs2"), data, []):
                    if "narrowPeak" in fn:
                        peakfiles.append(fn)
            else:
                logger.info(
                    f"Using peaks from full fraction since {dd.get_work_bam(data)} is single-ended."
                )
                for fn in tz.get_in(("peaks_files", "full", "macs2"), data,
                                    []):
                    if "narrowPeak" in fn:
                        peakfiles.append(fn)
    consensusfile = os.path.join(consensusdir, "consensus.bed")
    if not peakfiles:
        logger.info(
            "No suitable peak files found, skipping consensus peak calling.")
        return samples
    consensusfile = consensus(peakfiles, consensusfile, data)
    if not utils.file_exists(consensusfile):
        logger.warning("No consensus peaks found.")
        return samples
    saffile = consensus_to_saf(consensusfile,
                               os.path.splitext(consensusfile)[0] + ".saf")
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peaks_files", "consensus"),
                           {"main": consensusfile})
        new_samples.append([data])
    return new_samples
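consensus_to_saf converts the consensus BED into SAF, the simple annotation format featureCounts accepts (GeneID, Chr, Start, End, Strand; 1-based inclusive coordinates). A minimal sketch of such a converter, with hypothetical peak ids:

def consensus_to_saf(bed_file, saf_file):
    """Convert a BED of consensus peaks to SAF for featureCounts."""
    with open(bed_file) as in_handle, open(saf_file, "w") as out_handle:
        out_handle.write("GeneID\tChr\tStart\tEnd\tStrand\n")
        for i, line in enumerate(in_handle):
            chrom, start, end = line.strip().split("\t")[:3]
            # BED starts are 0-based half-open; SAF is 1-based inclusive
            out_handle.write(f"peak_{i}\t{chrom}\t{int(start) + 1}\t{end}\t.\n")
    return saf_file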
Example #11
def _check(sample, data):
    """Get input sample for each chip bam file."""
    if dd.get_chip_method(sample).lower() == "atac":
        return [sample]
    if dd.get_phenotype(sample) == "input":
        return None
    for origin in data:
        if dd.get_batch(sample) in dd.get_batch(origin[0]) and dd.get_phenotype(origin[0]) == "input":
            sample["work_bam_input"] = dd.get_work_bam(origin[0])
            return [sample]
    return [sample]
Example #12
def _check(sample, data):
    """Get input sample for each chip bam file."""
    if dd.get_chip_method(sample).lower() == "atac":
        return [sample]
    if dd.get_phenotype(sample) == "input":
        return None
    for origin in data:
        if dd.get_batch(sample) in (dd.get_batches(origin[0]) or []) and dd.get_phenotype(origin[0]) == "input":
            sample["work_bam_input"] = origin[0].get("work_bam")
            return [sample]
    return [sample]
Example #13
def chipseq_count(data):
    """
    count reads mapping to ChIP/ATAC consensus peaks with featureCounts
    """
    method = dd.get_chip_method(data)
    if method == "chip":
        in_bam = dd.get_work_bam(data)
    elif method == "atac":
        in_bam = tz.get_in(("atac", "align", "NF"), data)
    out_dir = os.path.join(dd.get_work_dir(data), "align",
                           dd.get_sample_name(data))
    sorted_bam = bam.sort(in_bam,
                          dd.get_config(data),
                          order="queryname",
                          out_dir=safe_makedir(out_dir))
    consensus_file = tz.get_in(("peaks_files", "consensus", "main"), data)
    saf_file = os.path.splitext(consensus_file)[0] + ".saf"
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "consensus")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir,
                                dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file) and _is_fixed_count_file(count_file):
        if method == "atac":
            data = tz.assoc_in(data, ("peak_counts", "NF"), count_file)
        elif method == "chip":
            data = tz.assoc_in(data, ("peak_counts"), count)
        return [[data]]
    featureCounts = config_utils.get_program("featureCounts",
                                             dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)

    cmd = (
        "{featureCounts} -F SAF -a {saf_file} -o {tx_count_file} -s {strand_flag} "
        "{paired_flag} {sorted_bam}")

    message = ("Count reads in {sorted_bam} overlapping {saf_file} using "
               "featureCounts.")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file,
                                             dd.get_sample_name(data),
                                             data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)
    if method == "atac":
        data = tz.assoc_in(data, ("peak_counts", "NF"), count_file)
    elif method == "chip":
        data = tz.assoc_in(data, ("peak_counts"), count)
    return [[data]]
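_paired_flag and _strand_flag are not shown; hedged sketches of their plausible behavior follow. featureCounts takes -p to count fragments for paired-end data and -s 0/1/2 for unstranded/stranded/reverse-stranded counting (the real helpers take richer arguments, so the signatures here are simplified):

import pysam

def _paired_flag(bam_file):
    """Return "-p" when the BAM looks paired-end, else an empty string."""
    with pysam.AlignmentFile(bam_file, "rb") as in_bam:
        # peek at the first reads rather than scanning the whole file
        for read in in_bam.head(1000):
            if read.is_paired:
                return "-p"
    return ""

def _strand_flag(strandedness):
    """Map a strandedness label to featureCounts' -s argument."""
    return {"unstranded": "0", "firststrand": "2", "secondstrand": "1"}.get(
        strandedness, "0")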
Example #14
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses defaults if previously set.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if tz.get_in(["config", "algorithm", "kraken"], data):
        to_run.append("kraken")
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([
            tool in dd.get_tools_on(data)
            for tool in ["qualimap", "qualimap_full"]
    ]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq") or analysis == "smallrna-seq":
        if "qualimap" not in dd.get_tools_off(data):
            if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
                to_run.append("qualimap_rnaseq")
            else:
                logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("chip-seq"):
        to_run.append("chipqc")
        if dd.get_chip_method(data) == "atac":
            to_run.append("ataqv")
    if analysis.startswith("smallrna-seq"):
        to_run.append("small-rna")
        to_run.append("atropos")
    if "coverage_qc" not in dd.get_tools_off(data):
        to_run.append("samtools")
    if dd.has_variantcalls(data):
        if "coverage_qc" not in dd.get_tools_off(data):
            to_run += ["coverage", "picard"]
        to_run += ["qsignature", "variants"]
        if vcfanno.is_human(data):
            to_run += ["peddy"]
            if "contamination" not in dd.get_tools_off(data):
                to_run += ["contamination"]
        if vcfutils.get_paired_phenotype(data):
            if "viral" not in dd.get_tools_off(data):
                to_run += ["viral"]
        if damage.should_filter([data]):
            to_run += ["damage"]
    if dd.get_umi_consensus(data):
        to_run += ["umi"]
    if tz.get_in(["config", "algorithm", "preseq"], data):
        to_run.append("preseq")
    to_run = [tool for tool in to_run if tool not in dd.get_tools_off(data)]
    to_run.sort()
    return to_run
Example #15
def _macs2_cmd(data):
    """Main command for macs2 tool."""
    method = dd.get_chip_method(data)
    if method.lower() == "chip":
        cmd = ("{macs2} callpeak -t {chip_bam} -c {input_bam} {paired} "
               "{genome_size} -n {name} --bdg {options} ")
    elif method.lower() == "atac":
        cmd = ("{macs2} callpeak -t {chip_bam} --nomodel "
               " {paired} {genome_size} -n {name} --bdg {options}"
               " --nolambda --keep-dup all")
    else:
        raise ValueError("chip_method should be chip or atac.")
    return cmd
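The returned string is a template that the caller fills with cmd.format(**locals()); rendering it by hand with illustrative values shows the final command line:

cmd = ("{macs2} callpeak -t {chip_bam} -c {input_bam} {paired} "
       "{genome_size} -n {name} --bdg {options} ")
print(cmd.format(macs2="macs2", chip_bam="sample1.bam", input_bam="input1.bam",
                 paired="-f BAMPE", genome_size="-g hs", name="sample1",
                 options="-q 0.05"))
# -> macs2 callpeak -t sample1.bam -c input1.bam -f BAMPE -g hs -n sample1 --bdg -q 0.05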
Example #16
def create_peaktable(samples):
    """create a table of peak counts per sample to use with differential peak calling
    """
    data = dd.get_data_from_sample(samples[0])
    peakcounts = []
    out_dir = os.path.join(dd.get_work_dir(data), "consensus")
    out_file = os.path.join(out_dir, "consensus-counts.tsv")
    if dd.get_chip_method(data) == "chip":
        for data in dd.sample_data_iterator(samples):
            peakcounts.append(tz.get_in(("peak_counts",), data))
    elif dd.get_chip_method(data) == "atac":
        for data in dd.sample_data_iterator(samples):
            peakcounts.append(tz.get_in(("peak_counts", "NF"), data))
    combined_peaks = count.combine_count_files(peakcounts,
                                               out_file,
                                               ext=".counts")
    new_data = []
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peak_counts", "peaktable"), combined_peaks)
        new_data.append(data)
    new_samples = dd.get_samples_from_datalist(new_data)
    return new_samples
Example #17
def calling(data):
    """Main function to parallelize peak calling."""
    chip_bam = dd.get_work_bam(data)
    input_bam = data.get("work_bam_input", None)
    caller_fn = get_callers()[data["peak_fn"]]
    name = dd.get_sample_name(data)
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name ))
    # chip_bam = _prepare_bam(chip_bam, dd.get_variant_regions(data), data['config'])
    # input_bam = _prepare_bam(input_bam, dd.get_variant_regions(data), data['config'])
    out_file = caller_fn(name, chip_bam, input_bam, dd.get_genome_build(data), out_dir,
                         dd.get_chip_method(data), data["config"])
    data["peaks_file"] = out_file
    return [[data]]
Example #18
def _get_multiplier(samples):
    """Get multiplier to get jobs
       only for samples that have input
    """
    to_process = 1.0
    to_skip = 0
    for sample in samples:
        if dd.get_phenotype(sample[0]) == "chip":
            to_process += 1.0
        elif dd.get_chip_method(sample[0]).lower() == "atac":
            to_process += 1.0
        else:
            to_skip += 1.0
    return (to_process - to_skip) / len(samples)
Example #19
def calling(data):
    """Main function to parallelize peak calling."""
    chip_bam = data.get("work_bam")
    input_bam = data.get("work_bam_input", None)
    caller_fn = get_callers()[data["peak_fn"]]
    name = dd.get_sample_name(data)
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
    out_files = caller_fn(name, chip_bam, input_bam, dd.get_genome_build(data), out_dir,
                          dd.get_chip_method(data), data["resources"], data)
    greylistdir = greylisting(data)
    data.update({"peaks_files": out_files})
    # data["input_bam_filter"] = input_bam
    if greylistdir:
        data["greylist"] = greylistdir
    return [[data]]
Example #20
def _get_multiplier(samples):
    """Get multiplier to get jobs
       only for samples that have input
    """
    to_process = 1.0
    to_skip = 0
    for sample in samples:
        if dd.get_phenotype(sample[0]) == "chip":
            to_process += 1.0
        elif dd.get_chip_method(sample[0]).lower() == "atac":
            to_process += 1.0
        else:
            to_skip += 1.0
    mult = (to_process - to_skip) / len(samples)
    if mult <= 0:
        mult = 1 / len(samples)
    return max(mult, 1)
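Worked through on a mock batch, the arithmetic goes like this (plain labels stand in for the real sample dicts, conflating the phenotype and method checks into one list):

def multiplier(labels):
    """Mirror _get_multiplier's arithmetic on a flat list of sample labels."""
    to_process, to_skip = 1.0, 0.0
    for label in labels:
        if label in ("chip", "atac"):
            to_process += 1.0
        else:
            to_skip += 1.0
    mult = (to_process - to_skip) / len(labels)
    if mult <= 0:
        mult = 1 / len(labels)
    return max(mult, 1)

print(multiplier(["chip", "chip", "input", "input"]))  # (1 + 2 - 2) / 4 = 0.25, clamped up to 1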
Example #21
def calculate_encode_complexity_metrics(data):
    metrics_file = tz.get_in(['atac', 'complexity_metrics_file'], data, None)
    if not metrics_file:
        return {}
    else:
        with open(metrics_file) as in_handle:
            header = next(in_handle).strip().split(",")
            values = next(in_handle).strip().split(",")
    raw_metrics = {h: int(v) for h, v in zip(header, values)}
    metrics = {"PBC1": raw_metrics["m1"] / raw_metrics["m0"],
               "NRF": raw_metrics["m0"] / raw_metrics["mt"]}
    if raw_metrics["m2"] == 0:
        PBC2 = 0
    else:
        PBC2 = raw_metrics["m1"] / raw_metrics["m2"]
    metrics["PBC2"] = PBC2

    if dd.get_chip_method(data) == "atac":
        metrics["bottlenecking"] = get_atac_bottlenecking_flag(metrics["PBC1"], metrics["PBC2"])
        metrics["complexity"] = get_atac_complexity_flag(metrics["NRF"])
    else:
        metrics["bottlenecking"] = get_chip_bottlenecking_flag(metrics["PBC1"], metrics["PBC2"])
        metrics["complexity"] = get_chip_complexity_flag(metrics["NRF"])
    return metrics
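Reading the formulas off the code above: mt is the total number of read locations, m0 the distinct locations, m1 those hit exactly once and m2 those hit exactly twice, giving the ENCODE complexity metrics NRF = m0/mt, PBC1 = m1/m0 and PBC2 = m1/m2. A worked example with illustrative counts:

raw = {"mt": 10_000_000, "m0": 8_000_000, "m1": 7_000_000, "m2": 500_000}
nrf = raw["m0"] / raw["mt"]   # 0.8   non-redundant fraction
pbc1 = raw["m1"] / raw["m0"]  # 0.875 PCR bottlenecking coefficient 1
pbc2 = raw["m1"] / raw["m2"]  # 14.0  PCR bottlenecking coefficient 2
print(f"NRF={nrf:.3f} PBC1={pbc1:.3f} PBC2={pbc2:.1f}")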
Example #22
def clean_chipseq_alignment(data):
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    method = dd.get_chip_method(data)
    if method == "atac":
        data = shift_ATAC(data)
    work_bam = dd.get_work_bam(data)
    work_bam = bam.sort(work_bam, dd.get_config(data))
    bam.index(work_bam, dd.get_config(data))
    # an unfiltered BAM file is useful for calculating some metrics later
    data = tz.assoc_in(data, ['chipseq', 'align', "unfiltered"], work_bam)
    clean_bam = remove_nonassembled_chrom(work_bam, data)
    clean_bam = remove_mitochondrial_reads(clean_bam, data)
    data = atac.calculate_complexity_metrics(clean_bam, data)
    if not dd.get_keep_multimapped(data):
        clean_bam = remove_multimappers(clean_bam, data)
    if not dd.get_keep_duplicates(data):
        clean_bam = bam.remove_duplicates(clean_bam, data)
    data["work_bam"] = clean_bam
    # for ATAC-seq, break alignments into NF, mono/di/tri nucleosome BAM files
    if method == "atac":
        data = atac.split_ATAC(data)
    encode_bed = tz.get_in(
        ["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = remove_blacklist_regions(dd.get_work_bam(data),
                                                    encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    try:
        data["bigwig"] = _normalized_bam_coverage(dd.get_sample_name(data),
                                                  dd.get_work_bam(data), data)
    except subprocess.CalledProcessError:
        logger.warning(f"{dd.get_work_bam(data)} was too sparse to normalize, "
                       f" falling back to non-normalized coverage.")
        data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                       dd.get_work_bam(data), data)
    return [[data]]
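atac.split_ATAC (called above) partitions alignments into nucleosome-free and mono/di/tri-nucleosome BAMs by fragment size; a hedged pysam sketch of the core idea, using illustrative size windows that may differ from bcbio's ATACRanges:

import pysam

# illustrative fragment-size windows: label -> (min, max) absolute template length
FRACTIONS = {"NF": (0, 100), "MN": (180, 247), "DN": (315, 473), "TN": (558, 615)}

def split_by_fragment_size(bam_file, out_prefix):
    """Write one BAM per fragment-size fraction based on absolute TLEN."""
    with pysam.AlignmentFile(bam_file, "rb") as in_bam:
        outs = {label: pysam.AlignmentFile(f"{out_prefix}-{label}.bam", "wb",
                                           template=in_bam)
                for label in FRACTIONS}
        for read in in_bam:
            tlen = abs(read.template_length)
            for label, (lo, hi) in FRACTIONS.items():
                if lo <= tlen <= hi:
                    outs[label].write(read)
                    break
        for out in outs.values():
            out.close()
    return [f"{out_prefix}-{label}.bam" for label in FRACTIONS]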