Пример #1
0
def _star_sam2bam(workflow, conf):  # SAM -> BAM
    """
    Convert STAR SAM output to BAM, keeping only uniquely mapped reads
    (STAR marks unique alignments with MAPQ 255), then build chrM-free
    BAM/SAM variants for downstream QC.

    :param workflow: samflow defined class
    :param conf: parsed config file
    :return: void
    """
    import os
    for target in conf.sample_targets:
        sam2bam = attach_back(
            workflow,
            ShellCommand("""
                        ln -s {input[sam]} {output[sam]}
                        {tool} view -q 255 -bt {param[genome]} {input[sam]} -o {output[bam]}
                        """,
                         tool="samtools",
                         input={"sam": target + "Aligned.out.sam"},
                         output={
                             "bam": target + ".bam",
                             "sam": target + ".sam"
                         },
                         param={
                             "genome":
                             conf.get(conf.get("basics", "species"),
                                      "chrom_len"),
                         },
                         name="star sam2bam"))  # BUG FIX: was misspelled "star sam2dam"
        # BUG FIX: the [sam2bam] config section must override this command's
        # params (as sam2bamnochrm does below and bowtie() does elsewhere);
        # the original called workflow.update(...), never touching the command.
        sam2bam.update(param=conf.items("sam2bam"))

        #From bwa/dc.py
        sam2bamnochrm = attach_back(
            workflow,  ## use mapping quality 1 defined by samtools official FAQ
            ShellCommand(
                """
                        awk \'BEGIN{{OFS="\\t"}} {{print $1,0,$2}}\' {param[genome]} > {param[chrom_bed]}
                        grep -v chrM {param[chrom_bed]} > {output[nochrmbed]}
                        {tool} view -h -b -L {output[nochrmbed]} {input[bam]} > {output[nochrmbam]}
                        {tool} view -h {output[nochrmbam]}  > {output[nochrmsam]}
                        {tool} view -h {input[bam]}  > {output[usam]}
                        """,
                tool="samtools",
                input={"bam": target + ".bam"},
                output={
                    "nochrmbed": target + ".nochrM",
                    "nochrmbam": target + "_nochrM.bam",
                    "usam":
                    target + "_u.sam",  ## uniquely mapping sam for sampling
                    "nochrmsam": target + "_nochrM.sam"
                },
                param={
                    "tmp_bam": target + ".tmp.bam",
                    "output_prefix": target,
                    "chrom_bed": os.path.join(conf.target_dir, "chrom.bed"),
                    "mapq": 1,
                    "genome": conf.get(conf.get("basics", "species"),
                                       "chrom_len")
                },
                name="filtering mapping and convert")
        )  # Use 5G memory as default
        sam2bamnochrm.update(param=conf.items("sam2bam"))
Пример #2
0
Файл: dc.py Проект: cfce/chilin
def merge_bams(workflow, conf):   ## merge input and chip bam
    """
    Merge every treatment BAM into one file and every control BAM into
    another, each group handled separately.
    :return:
    """
    # merge all treatments into one
    treat_inputs = [t + ".bam" for t in conf.treatment_targets]
    treat_merge = ShellCommand(
        "{tool} merge {output[merged]} {param[bams]}",
        tool="samtools",
        input=treat_inputs,
        output={"merged": conf.prefix + "_treatment.bam"})
    treat_merge.param = {"bams": " ".join(treat_inputs)}
    treat_merge.allow_fail = True
    treat_merge.allow_dangling = True

    if len(conf.treatment_targets) > 1:
        attach_back(workflow, treat_merge)
    else:
        # a lone treatment sample is simply copied to the merged name
        attach_back(workflow, make_link_command(treat_inputs[0], treat_merge.output["merged"]))

    # the merging step is skipped entirely when no control samples exist,
    # so downstream code must check for controls before using `_control.bam`
    n_controls = len(conf.control_targets)
    if n_controls > 1:
        ctrl_inputs = [t + ".bam" for t in conf.control_targets]
        ctrl_merge = treat_merge.clone
        ctrl_merge.input = ctrl_inputs
        ctrl_merge.output = {"merged": conf.prefix + "_control.bam"}
        ctrl_merge.param = {"bams": " ".join(ctrl_inputs)}
        attach_back(workflow, ctrl_merge)
    elif n_controls == 1:
        attach_back(workflow, make_link_command(conf.control_targets[0] + ".bam", conf.prefix + "_control.bam"))
Пример #3
0
def sample_bam_stat(workflow, conf, tex):
    """ sample non chrm bam to 15M for NSC and PBC
    sample non chrm bam to 5M for spot

    :param workflow: samflow workflow to append QC commands to
    :param conf: parsed config
    :param tex: latex handle (unused here; kept for the uniform QC signature)
    """
    for i, target in enumerate(conf.treatment_targets):
        ## for PE, use name sorted in order to calculate PBC
        input_bam = target + "_name_sorted.bam" if conf.pe else target + "_final_nochrm.bam"
        attach_back(workflow, ShellCommand(
            "{tool} {input[namesorted]} {param[run_spp]} {output[bamstat]} {output[sppstat]}  {param[pe]} {output[pbc]}",
            tool = "eap_dnase_stats",
            input = {"namesorted": input_bam},
            output = {"bamstat": target + "_bam_stat.qc",  ## 15M
                      "sppstat": target + "_spp.qc",
                      "pbc": target + "_final_nochrm_15M_pbc.qc"},
            param = {"pe": "pe" if conf.pe else "se",
                     "run_spp": conf.get("tool", "spp")}))

        ## SPOT is only computed via the hotspot tool chain;
        ## skip it when a macs-family peak caller is configured.
        # IDIOM FIX: `x not in y` instead of `not x in y`
        if "macs" not in conf.get("tool", "peak_calling"):
            attach_back(workflow, ShellCommand(
                "{tool} {input[bamwithoutchrm]} {param[genome]} {param[readsize]} {output[spot]} {param[hotspot_dir]} {param[hotspot_output]} {param[hotspot_tmp]} {param[spot_tmp]}",
                tool = "dac_spot", ## 5M
                input = {"bamwithoutchrm": target + "_final_nochrm.bam"},
                output = {"spot": target + "_spot_nochrm_5M.qc"},

                param = {"genome": conf.species,
                         "spot_tmp": conf.hotspot_reps_tmp_prefix[i] + "_final_nochrm.bam.5000000.spot.out",
                         "readsize": conf.readsize,
                         "hotspot_dir": conf.get("tool", "peak_calling"),
                         "hotspot_output": target + "_hotspot",
                         "hotspot_tmp": target + "_hotspot_tmp"}))
Пример #4
0
def read_quality(workflow, conf, tex):
    """Attach 100k-read sequence-quality QC commands (PE or SE variant)."""
    if conf.pe:
        # paired end: one command per pair; two stat outputs per pair
        for raw, target in conf.treatment_pairs_pe:
            qc_stats = [prefix + "_read_quality.qc" for prefix in target]
            attach_back(
                workflow,
                ShellCommand(
                    "{tool} {input[fastq][0]} {input[fastq][1]} {output[stat][0]} {output[stat][1]}",
                    tool="dac_pe_read_quality",
                    input={"fastq": raw},
                    output={"stat": qc_stats}))
    else:
        # single end: one command and one stat file per sample
        for raw, target in conf.treatment_pairs:
            attach_back(
                workflow,
                ShellCommand(
                    "{tool} {input} {output[stat]}",
                    tool="dac_se_read_quality",
                    input=raw,
                    output={"stat": target + "_read_quality.qc"},
                    name="100k read sequence quality and sequence length"))
Пример #5
0
def _bwa(workflow, conf):
    """Map reads with bwa, incorporating ENCODE ChIP-seq alignment parameters."""
    for raw, target in conf.treatment_pairs:
        run_param = {"threads": conf.threads,
                     "index": conf.get(conf.species, "genome_index"),
                     "prefix": target + "_raw_sorted",
                     "qc2": target + "_rawbam_stats.qc"}
        run_output = {"bam": target + "_raw_sorted.bam",
                      "qc": target + "_rawbam.qc"}

        # the wrapper script and command template differ for PE vs SE
        if conf.pe:
            template = "{tool} {param[threads]} {param[index]} {input[fastq][0]} {input[fastq][1]} {output[bam]} {output[qc]} {param[prefix]} {param[qc2]}"
            wrapper, label = "eap_run_bwa_pe", "pair end mapping"
        else:
            template = "{tool} {param[threads]} {param[index]} {input[fastq]} {output[bam]} {output[qc]} {param[prefix]} {param[qc2]}"
            wrapper, label = "eap_run_bwa_se", "single end mapping"

        bwa = attach_back(workflow, ShellCommand(
            template,
            tool = wrapper,
            input = {"fastq": raw},
            output = run_output,
            param = run_param,
            name = label))
        bwa.update(param = conf.items("bwa"))
Пример #6
0
def filter_bam(workflow, conf, tex):
    """ filter bam file by samtools and sample by ucsc app
    """
    for target in conf.treatment_targets:
        raw_bam = {"raw": target + "_raw_sorted.bam"}
        if conf.pe:
            # paired end: also produces a name-sorted BAM (needed for PBC)
            attach_back(
                workflow,
                ShellCommand(
                    "{tool} {input[raw]} {param[namesortedbamprefix]} {output[namesortedbam]} {param[finalprefix]} {output[finalbam]} {param[mapq]} {output[bamwithoutchrm]} {output[qc]} {param[qc2]}",
                    tool="dac_bam_pe_post_filter",
                    input=raw_bam,
                    output={
                        "finalbam": target + "_final.bam",
                        "namesortedbam": target + "_name_sorted.bam",
                        "bamwithoutchrm": target + "_final_nochrm.bam",
                        "qc": target + "_filter_bam.qc"
                    },
                    param={
                        "mapq": 3,
                        "namesortedbamprefix": target + "_name_sorted",
                        "finalprefix": target + "_final",
                        "qc2": target + "_filter_bam_stats.qc"
                    },
                    name="pair end filtering"))
        else:
            # single end: no name-sorted intermediate
            attach_back(
                workflow,
                ShellCommand(
                    "{tool} {input[raw]} {output[finalbam]} {param[mapq]} {output[qc]} {output[bamwithoutchrm]} {param[finalprefix]} {param[qc2]}",
                    tool="dac_bam_se_post_filter",
                    input=raw_bam,
                    output={
                        "finalbam": target + "_final.bam",
                        "bamwithoutchrm": target + "_final_nochrm.bam",
                        "qc": target + "_filter_bam.qc"
                    },
                    param={
                        "mapq": 3,
                        "finalprefix": target + "_final",
                        "qc2": target + "_filter_bam_stats.qc"
                    },
                    name="single end filtering"))
Пример #7
0
def bowtie(workflow, conf):   # Mapping
    """
    Map reads to the genome with bowtie (unique alignments only, -m 1),
    then convert SAM to BAM and run the mapping QC.
    :param workflow: samflow defined class
    :param conf: parsed config files
    :return: void
    """
    for target in conf.sample_targets:
        aln = attach_back(
            workflow,
            ShellCommand(
                "{tool} -p {param[NUM_THREADS]} -S -m 1 {param[index]} {input[fastq]} {output[sam]}",
                tool = "bowtie",
                input = {"fastq": target + ".fastq"},
                output = {"sam": target + ".sam"},
                param = {
                    "NUM_THREADS": conf.threads,
                    ## genome index selected via the species in the basics section
                    "index": conf.get_path(conf.get("basics", "species"), "genome_index"),
                },
                name = "bowtie aln"))
        aln.update(param = conf.items("bowtie"))
        aln.allow_dangling = True
        aln.allow_fail = True

    _bowtie_sam2bam(workflow, conf)

    ## QC part--NOTE keeping the bwa legacy code!
    stat_bwa(workflow, conf)
    if conf.long:
        tex_bwa(workflow, conf)
Пример #8
0
def merge_latex(workflow, conf):
    """Concatenate the per-section .tex fragments into one report file."""
    ## document skeleton: begin + summary table [+ long-form sections] + end
    sections = ["_begin.tex", "_summary_table.tex"]
    if conf.long:
        sections.extend([
            "_fastqc.tex",
            "_fastqc_gc.tex",
            "_map.tex",
            "_conserv.tex",
            # "_macs2.latex", "_macs2_on_sample.latex",
            # "_phan.tex",
            "_motif.tex",
            "_contam.tex",
            "_frip.tex",
        ])
    sections.append("_end.tex")

    tex_files = " ".join(conf.latex_prefix + s for s in sections)
    merge_cmd = attach_back(
        workflow,
        ShellCommand("cat {param[tex]} > {output}",
                     output=conf.prefix + "_report.tex"))
    merge_cmd.allow_fail = True
    merge_cmd.param = {"tex": tex_files}
Пример #9
0
def replicates_peaks_overlap(workflow, conf):  # peaks bed from each replicate
    """
    Pairwise peak-overlap QC between every pair of treatment replicates.
    :param workflow: class from samflow
    :param conf: external parsed config file
    :return: workflow through attach_back
    """
    def _peak_file(target):
        # narrowPeak for narrow/both macs2 runs, broadPeak otherwise
        if conf.get("macs2", "type").lower() in ["both", "narrow"]:
            return target + "_sort_peaks.narrowPeak"
        return target + "_b_sort_peaks.broadPeak"

    for i in range(len(conf.treatment_targets)):
        for j in range(i + 1, len(conf.treatment_targets)):
            replicates_overlap = attach_back(
                workflow,
                ShellCommand(
                    "{tool} -f {param[p]} -a {input[0]} -b {input[1]} | wc -l > {output}",
                    tool="intersectBed",
                    input=[_peak_file(conf.treatment_targets[i]),
                           _peak_file(conf.treatment_targets[j])],
                    output=conf.prefix + "_%s_%s.overlap" % (i, j),
                    param={"p": 0.3},
                    name="Replicates peaks overlap QC"))
            replicates_overlap.allow_fail = True  # in case 0 peak in macs2
            replicates_overlap.allow_dangling = True
            # BUG FIX: apply the [replicates] config override to every pairwise
            # command.  The original called update() once after both loops, so
            # only the last command was updated, and it raised NameError when
            # fewer than two replicates existed.
            replicates_overlap.update(param=conf.items("replicates"))
    ## generate a barplot for meta distribution

    return workflow
Пример #10
0
def PBC(workflow, conf):  # PBC1
    """
    Introduce ENCODE II library complexity assessment methods
    N1 / Nd, N1 is the location with exact one read, Nd is distinct location number
    :param workflow: samflow class
    :param conf: parsed config
    :return: void
    """
    for t in conf.sample_targets:
        # Shell pipeline, step by step:
        #   1. bamToBed + first awk: count reads per distinct location
        #      (chrom, start, end, strand).
        #   2. second awk + sort: histogram of "reads-per-location" counts,
        #      sorted numerically, written to {output[hist]}.
        #   3. final awk: N1 = count of single-read locations (first histogram
        #      row), Nd = total distinct locations; emits "N1 Nd N1/Nd"
        #      (the PBC1 score) into {output[pbc]}.
        # When conf.down is set, the 4M-read downsampled BAM is analysed
        # instead of the full BAM.
        pbc1 = attach_back(
            workflow,
            ShellCommand(
                """
                               bamToBed -i {input[bam]} | {tool} \'{{l[$1"\\t"$2"\\t"$3"\\t"$6]+=1}} END {{for(i in l) print l[i]}}\' \\
                                 | awk \'{{n[$1]+=1}} END {{for (i in n) print i"\\t"n[i]}}\'  \\
                                 | sort -k1n -  > {output[hist]}
                               awk '{{
                               if (NR==1) {{N1=$2}}
                               Nd+=$2
                               }} END {{print N1,Nd,N1/Nd}}' {output[hist]} > {output[pbc]}
                               """,
                tool="awk",
                input={"bam": t + "_4000000.bam" if conf.down else t + ".bam"},
                output={
                    "pbc": t + ".pbc",
                    "hist": t + ".hist"
                },
                name="PBC"))
        # best-effort QC: missing input or an awk failure must not kill the run
        pbc1.allow_fail = True
        pbc1.allow_dangling = True

    ## QC part
    stat_pbc(workflow, conf)
Пример #11
0
def r_exec(jinja_template_r):
    """Run a rendered R script immediately through Rscript (not attached to a workflow)."""
    rendered_script = jinja_template_r.param["render_dump"]
    pdf_output = jinja_template_r.param["pdf"]
    runner = ShellCommand("{tool} {input}",
                          tool = "Rscript",
                          name = 'Rscript',
                          input=rendered_script,
                          param={},
                          output=pdf_output)
    runner.invoke()
Пример #12
0
def Phan(workflow, conf):  # NSC, RSC, Qtag
    """
    Compute phantompeakqualtools quality measures (NSC, RSC, Qtag) at the
    4M-read level.
    http://code.google.com/p/phantompeakqualtools/
    (1) Determine strand cross-correlation peak / predominant fragment length OR print out quality measures
        Rscript run_spp.R -c=<tagAlign/BAMfile> -savp -out=<outFile>
    Peak calling by SPP would need a control; for phantom QC we run each
    treatment and control sample independently.
    """
    for t in conf.sample_targets:
        # default (--down) path analyses the 4M downsampled BAM;
        # --total path uses the full BAM
        ibam = t + "_4000000.bam" if conf.down else t + ".bam"
        spp_cmd = ShellCommand(
            "{tool} {param[script]} -c={input[chip]} -rf -savp -out={output[spp]} -odir={param[dir]}",
            tool="Rscript",
            input={"chip": ibam},
            output={
                "spp": t + ".spp",
                "pdf": t + "_4000000.pdf" if conf.down else t + ".pdf"
            },
            param={
                "script": conf.get("tool", "spp"),
                "dir": os.path.dirname(t + ".spp")
            },
            name="SPP")
        attach_back(workflow, spp_cmd)

    stat_phan(workflow, conf)
    if conf.long:
        tex_phan(workflow, conf)
Пример #13
0
def DHS(workflow, conf):  # DHS overlap percentage
    """
    get peaks overlapping percentage with union DHS
    :param workflow: uniform pipeline workflow from samflow
    :param conf: parsed config files
    :return: workflow
    """
    ## peak file: narrowPeak for narrow/both macs2 runs, broadPeak otherwise
    peaks = conf.prefix + "_sort_peaks.narrowPeak" if conf.get(
        "macs2", "type") in ["both", "narrow"
                             ] else conf.prefix + "_b_sort_peaks.broadPeak"
    # FIX: local variable renamed from `DHS` so it no longer shadows this
    # function's own name inside the body.
    dhs_overlap = attach_back(
        workflow,
        ShellCommand("""
                                   n=$(head -n {param[p]} {input[MACS2_bed]} | wc -l)
                                   dhs=$(head -n {param[p]} {input[MACS2_bed]} | {tool} -wa -u -a - -b {input[DHS_peaks_bed]}|wc -l)
                                   ##dhs=$(echo \"scale=5;$dhs/$n\" | bc)
                                   echo $n,$dhs > {output}
                                   """,
                     tool="intersectBed",
                     input={
                         "MACS2_bed":
                         peaks,
                         "DHS_peaks_bed":
                         conf.get(conf.get("basics", "species"), "dhs")
                     },
                     output=conf.prefix + ".dhs",
                     param={"p": 5000},  # only the top 5000 peaks are tested
                     name="intersect DHS"))
    dhs_overlap.allow_dangling = True
    dhs_overlap.allow_fail = True
Пример #14
0
def star(workflow, conf):  # Mapping
    """
    Use STAR to map reads to the genome, then call _star_sam2bam to
    convert SAM to BAM.
    :param workflow: samflow defined class
    :param conf: parsed config files
    :return: void
    """
    for target in conf.sample_targets:
        # FIX: local renamed from `star` to avoid shadowing this function
        aln = attach_back(
            workflow,
            ShellCommand(
                "{tool} --genomeDir {param[index]} --runThreadN {param[NUM_THREADS]} --readFilesIn {input[fastq]} --outFileNamePrefix {param[prefix]}",
                tool="STAR",
                input={"fastq": target + ".fastq"},
                output={"sam": target + "Aligned.out.sam"},
                param={
                    "NUM_THREADS": conf.threads,
                    "prefix": target,
                    ## judge chosen species from basics section
                    "index": conf.get_path(conf.get("basics", "species"),
                                           "genome_index")
                },
                name="star aln"))
        # NOTE(review): overrides come from the [bowtie] config section even
        # though this is the STAR aligner — presumably a deliberate reuse of
        # the shared aligner section; confirm before renaming it.
        aln.update(param=conf.items("bowtie"))

    _star_sam2bam(workflow, conf)

    ## QC part--NOTE keeping the bwa legacy code!
    stat_bwa(workflow, conf)
    if conf.long:
        tex_bwa(workflow, conf)
Пример #15
0
def fastqc(workflow, conf):
    """
    fastqc to extract gc contents(not yet) and median sequence quality
    :param workflow: samflow workflow to append commands to
    :param conf: parsed config
    :return: void
    """
    for raw, target in conf.sample_pairs:
        if conf.pe:
            fastqc_run = attach_back(
                workflow,
                ShellCommand(
                    "{tool} {input} --extract -t {param[threads]} -o {output[target_dir]}",
                    ## only check one pair
                    input=target[0] + "_100k.fastq",
                    output={
                        "target_dir":
                        conf.target_dir,
                        "fastqc_summary":
                        target[0] + "_100k_fastqc/fastqc_data.txt"
                    },
                    tool="fastqc",
                    param={"threads": conf.threads},
                    name="fastqc"))
        else:
            fastqc_run = attach_back(
                workflow,
                ShellCommand(
                    "{tool} {input} --extract -t {param[threads]} -o {output[target_dir]}",
                    input=target + "_100k.fastq",
                    output={
                        "target_dir": conf.target_dir,
                        "fastqc_summary":
                        target + "_100k_fastqc/fastqc_data.txt"
                    },
                    tool="fastqc",
                    param={"threads": conf.threads},
                    name="fastqc"))
        # BUG FIX: the [fastqc] overrides were only applied in the SE branch,
        # and allow_fail/allow_dangling were set on the `fastqc` FUNCTION
        # object rather than on the attached command.  Apply all three to the
        # command itself, for both PE and SE.
        fastqc_run.update(param=conf.items("fastqc"))
        fastqc_run.allow_fail = True
        fastqc_run.allow_dangling = True

    ## QC part of chilin
    ## use conf property conf.long = True
    stat_fastqc(workflow, conf)
    if conf.long:
        tex_fastqc(workflow, conf)
Пример #16
0
def hotspotv4(workflow, conf, tex):
    """Call peaks per treatment replicate with hotspot v4, and additionally on
    a pooled BAM when two or more replicates exist.

    :param workflow: samflow workflow to append commands to
    :param conf: parsed config
    :param tex: latex handle, forwarded to eval_reps for the replicates QC
    """
    # `narrow` and `broad` are module-level names — presumably the narrowPeak/
    # broadPeak autoSql (.as) files for bigBed conversion; TODO confirm.
    for target in conf.treatment_targets:
        hotspot=attach_back(workflow,
                    ShellCommand(
                        "{tool} {param[hotspot_dir]} {param[genome]} {input[bam]} {param[readsize]} {output[narrowbb]} {output[broadbb]} {output[bigwig]} {param[tmp]} {output[hotspot_output]} {input[narrowas]} {input[broadas]} {param[chromsize]} {output[narrow]} {output[broad]}",
                        tool = "eap_run_hotspot",
                        input = {"bam": target + "_final_nochrm.bam",
                                 "narrowas": narrow,
                                 "broadas": broad},
                        output = {"narrowbb": target + ".narrowPeak.bigBed",
                                  "broadbb": target + ".broadPeak.bigBed",
                                  "narrow": target + ".narrowPeak",
                                  # "qc1": target + ".narrowPeak.qc",
                                  # "qc2": target + ".broadPeak.qc",
                                  "broad": target + ".broadPeak",
                                  "bigwig": target + ".bigWig",
                                  "hotspot_output": target + "_hotspot"},
                        param = {"hotspot_dir": conf.get("tool", "peak_calling"),
                                 "genome": conf.species,
                                 "chromsize": conf.get(conf.species, "chrom_len"),
                                 "tmp": target + "_hotspot_peak_call_tmp",
                                 "readsize": 36}))
    have_treat_reps = len(conf.treatment_pairs) >= 2 ## replicates

    if have_treat_reps:
        eval_reps(workflow, conf, tex)
        # pool all replicate BAMs into one file for a merged hotspot run
        catsam = attach_back(workflow, ShellCommand(
            "{tool} cat {param[bams]} > {output[bam]}",
            tool = "samtools",
            input ={"bams": [ target + "_final.bam" for target in conf.treatment_targets]},
            output = {"bam": conf.prefix + "_pool.bam"}))
        catsam.param.update(bams=' '.join(catsam.input["bams"]))
        # NOTE(review): reuses the last loop iteration's `hotspot` command via
        # .clone (undefined if there are no treatment targets); input/param
        # are mutated in place while output is replaced wholesale, so the
        # cloned command keeps the last target's readsize/genome settings.
        hotspot_merge = hotspot.clone
        hotspot_merge.param.update(tmp=conf.prefix+"_hotspot_peak_call_tmp")
        hotspot_merge.input.update(bam = conf.prefix + "_pool.bam")
        hotspot_merge.output ={"narrowbb": conf.prefix + ".narrowPeak.bigBed",
                               "broadbb": conf.prefix + ".broadPeak.bigBed",
                               "narrow": conf.prefix + ".narrowPeak",
                               # "qc1": conf.prefix + ".narrowPeak.qc",
                               # "qc2": conf.prefix + ".broadPeak.qc",
                               "broad": conf.prefix + ".broadPeak",
                               "bigwig": conf.prefix + ".bigWig",
                               "hotspot_output": conf.prefix + "_hotspot"}
        attach_back(workflow, hotspot_merge)
Пример #17
0
def make_link_command(orig, dest):
    """
    Copy the original input file to the destination path.

    `cp -fr` is used instead of a symbolic link because `ln` behaved
    inconsistently across machine types.
    :param orig: input file
    :param dest: destination ("link") file
    :return: ShellCommand instance
    """
    copy_cmd = ShellCommand("cp -fr {input} {output}",
                            input=orig,
                            output=dest,
                            name="copy")
    return copy_cmd
Пример #18
0
def merge_bams(workflow, conf):  ## merge input and chip bam
    """
    Merge all treatment BAMs into <prefix>_treatment.bam and all control
    BAMs into <prefix>_control.bam; a single-sample group is copied rather
    than merged.
    :return:
    """
    # merge all treatments into one
    treat_bams = [t + ".bam" for t in conf.treatment_targets]
    merge_treat = ShellCommand(
        "{tool} merge {output[merged]} {param[bams]}",
        tool="samtools",
        input=treat_bams,
        output={"merged": conf.prefix + "_treatment.bam"})
    merge_treat.param = {"bams": " ".join(treat_bams)}
    merge_treat.allow_fail = True
    merge_treat.allow_dangling = True

    if len(conf.treatment_targets) <= 1:
        # single treatment sample: copy instead of merging
        attach_back(
            workflow,
            make_link_command(treat_bams[0],
                              merge_treat.output["merged"]))
    else:
        attach_back(workflow, merge_treat)

    # the merging step is skipped when no control sample exists, so check
    # for control samples before relying on `_control.bam` downstream
    control_bams = [t + ".bam" for t in conf.control_targets]
    if len(control_bams) > 1:
        merge_control = merge_treat.clone
        merge_control.input = control_bams
        merge_control.output = {"merged": conf.prefix + "_control.bam"}
        merge_control.param = {"bams": " ".join(control_bams)}
        attach_back(workflow, merge_control)
    elif len(control_bams) == 1:
        attach_back(
            workflow,
            make_link_command(control_bams[0],
                              conf.prefix + "_control.bam"))
Пример #19
0
def sampling(orig, dest, rand, format, conf): # call fastq_sampling
    """
    prepare sampling fastq files for library contamination and fastqc
    rand: the number of random selected fastq reads
    use lh3's https://github.com/lh3/seqtk/ to sample fastq and fastq.gz

    :param orig: input path(s) — fastq file(s) or a SAM file, per `format`
    :param dest: output path(s) for the sampled reads
    :param rand: requested read count (used only by the "sam" branch; see
        the NOTE in the fastq branch)
    :param format: "fastq" or "sam"; any other value returns None implicitly
    :param conf: parsed config (only conf.pe is read, in the SAM branch)
    :return: a ShellCommand to attach to a workflow, or None for an
        unrecognised format
    """
    if format == "fastq":
        #return PythonCommand(fastq_sampling,
        #                     input=orig,
        #                     output=dest,
        #                     param={"random_number": rand})
        ## faster and support fastq.gz
        ## if paired end, we must use same -s
        # NOTE(review): this branch hardcodes 100000 reads and ignores the
        # `rand` argument — presumably intentional (downstream files are
        # named *_100k*), but confirm against callers before relying on it.
        return ShellCommand("{tool} sample -s 11 {input[fastq]} {param[rand]} > {output[fastq_sample]}",
                            tool = "seqtk",
                            input = orig,
                            output = dest,
                            param = {"rand": 100000})

    elif format == "sam":
        ## samtools sampling
        ## add judge condition
        # If the SAM holds no more reads than requested, hard-link it under
        # the sampled name instead of sampling; otherwise sampling_pe_sam.py
        # draws {param[random_number]} reads first.  Either way the result is
        # converted to BAM at {output[samp]}.
        return ShellCommand("""
                            count=$({tool} view -Sc {input[sam]})
                            ## judge mapped reads number less than sampling number
                            if [ $count -le {param[random_number]} ]
                            then
                                ln -f {input[sam]} {input[sam]}.{param[random_number]}
                                {tool} view -bS {input[sam]}.{param[random_number]} > {output[samp]}
                            else
                                sampling_pe_sam.py {input[sam]} {param[random_number]} {param[pair]}
                                {tool} view -bS {input[sam]}.{param[random_number]} > {output[samp]}
                            fi
                            """,
                            tool = "samtools",
                            input={"sam": orig},
                            output={"samp": dest},
                            param={"random_number": rand, "pair": str(conf.pe)},
                            name = "sampling bam")
Пример #20
0
def fragment(workflow, conf):
    """Estimate fragment size per treatment with `macs2 predictd`, then parse
    the generated model.R files into the fragment-size QC json.
    """
    ## this is done after FRiP
    # fall back to the plain `macs2` binary when no tool path is configured
    macs2_bin = conf.get("tool", "macs2") or "macs2"

    for target in conf.treatment_targets:
        predictd = attach_back(
            workflow,
            ShellCommand(
                "{tool} predictd -i {input[bam]} --rfile {param[prefix]} -g {param[species]}",
                tool=macs2_bin,
                input={"bam": target + ".bam"},
                output={"R": target + "_model.R"},
                param={"prefix": target + "_model.R",
                       "species": 'hs'}))
        predictd.update(param=conf.items("macs2"))
        ## tolerate samples with too few peaks for modeling
        predictd.allow_fail = True
        predictd.allow_dangling = True

    ## extract standard deviation from MACS2 model.R,
    ## use m, p, and pileup value for standard deviation; mean fragment size
    ## is provided (choose the one with highest correlation)
    frag_qc = attach_back(
        workflow,
        PythonCommand(
            stat_frag_std,
            input={"r": [t + "_model.R" for t in conf.treatment_targets]},
            output={"json": conf.json_prefix + "_frag.json",
                    "r": [t + "_frag_sd.R" for t in conf.treatment_targets]},
            param={"samples": conf.treatment_bases,
                   "frag_tool": "BAMSE"},
            name="macs2 model R script parser"))
    frag_qc.allow_fail = True
    frag_qc.allow_dangling = True
Пример #21
0
def bedAnnotate_ceas(workflow, conf):
    """
    Calls bedAnnotate to get the genome distribution of the summits
    (gene/exon/promoter meta info), then optionally attaches DHS and
    velcro overlap QC depending on the species annotation available.
    """
    import os
    ## narrow/both peak calling produces sorted summits; broad-only produces broadPeak
    summits = conf.prefix + "_sort_summits.bed" if conf.get(
        "macs2", "type") in ["both", "narrow"
                             ] else conf.prefix + "_b_sort_peaks.broadPeak"
    ceas = attach_back(
        workflow,
        ShellCommand(
            """{tool} -g {param[geneTable]} -b {input} -e {output[exon]} -t {output[gene]}> {output[meta]}
            meta_info.sh {output[gene]} {output[exon]} 2000 {param[chrominfo]}
            """,
            tool="bedAnnotate.py",
            input=summits,
            output={
                "meta": conf.prefix + ".meta",
                "gene": os.path.join(conf.target_dir, "gene.bed"),
                "promoter": os.path.join(conf.target_dir, "gene.bed_promoter"),
                # fix: "exon" was listed twice in this dict; the second value
                # silently overwrote the first, so only the effective entry
                # (gene.bed_exon) is kept — runtime behavior is unchanged
                "exon": os.path.join(conf.target_dir, "gene.bed_exon")
            },
            param={
                "geneTable":
                conf.get_path(conf.get("basics", "species"), "geneTable"),
                "chrominfo":
                conf.get_path(conf.get("basics", "species"), "chrom_len")
            },
            name="bedAnnotate (ceas)"))
    try:
        has_velcro = conf.get(conf.get("basics", "species"), "velcro")
        has_dhs = conf.get(conf.get("basics", "species"), "dhs")
    except Exception:
        ## the species section may lack velcro/dhs entries; treat as absent
        ## (narrowed from a bare except, which also swallowed KeyboardInterrupt)
        has_velcro = ""
        has_dhs = ""
    ceas.allow_fail = True
    ceas.allow_dangling = True

    if has_dhs:
        DHS(workflow, conf)
    if has_velcro:
        velcro(workflow, conf)
    stat_bedAnnotate(workflow, conf, has_dhs, has_velcro)
Пример #22
0
def FRiP(workflow, conf):  # FRiP
    """
    Fraction of Reads in Peaks regions at 4M reads level
    For example: 2 treat, 2 control
    modify: without down sampling read peaks calling, use merged peaks for comparison
    """
    ## use merged peaks for evaluation after removing chrM reads
    for t in conf.sample_targets:
        if conf.frip:  ## sampling 5M reads
            reads = t + "_5000000_nochrM.bam"
        else:
            reads = t + "_4000000_nochrM.bam"
        frip = attach_back(
            workflow,
            ShellCommand("""
                                        fr=$(bedtools intersect -f {param[p]} -wa -u -abam {input[reads]} -b {input[peaks]} -bed | wc -l)
                                        total=$(samtools flagstat {input[reads]} | head -1 | cut -d" " -f1)
                                        echo $fr,$total > {output[frip]}
                                        """,
                         tool="intersectBed",
                         input={
                             "reads":
                             reads if conf.down else t + "_nochrM.bam",
                             "peaks":
                             conf.prefix + "_sort_peaks.narrowPeak" if
                             conf.get("macs2", "type") in ["both", "narrow"]
                             else conf.prefix + "_b_sort_peaks.broadPeak"
                         },
                         output={"frip": t + ".frip"},
                         param={"p": "1E-9"},
                         name="FRiP score"))
        ## fix: apply the [bedtools] config overrides to EVERY sample's
        ## command — previously this ran once after the loop, so only the
        ## last command was updated (and it raised NameError when
        ## conf.sample_targets was empty)
        frip.update(param=conf.items("bedtools"))
        ## in case that peaks calling on 4M reads may be very poor,
        ## no peaks generated, allow fail and dangling
        frip.allow_fail = True
        frip.allow_dangling = True

    ## QC part
    stat_frip(workflow, conf)
    if conf.long:
        tex_frip(workflow, conf)
Пример #23
0
def bowtie(workflow, conf, target, output, index):  # Mapping
    """Map the 100k-sampled reads to the genome with bowtie.

    The attached command is allowed to fail and dangle so one bad sample
    does not abort the whole workflow.  Returns the workflow for chaining.
    """
    aln = ShellCommand(
        "{tool} -p {param[NUM_THREADS]} -S -m 1 {param[index]} {input[fastq]} {output[sam]}",
        tool="bowtie",
        input={"fastq": target + "_100k.fastq"},
        output={"sam": output},
        param={
            "NUM_THREADS": conf.threads,
            ## judge chosen species from basics section
            "index": index,
        },
        name="bowtie aln")
    aln = attach_back(workflow, aln)
    aln.update(param=conf.items("bowtie"))
    aln.allow_fail = True
    aln.allow_dangling = True
    return workflow
Пример #24
0
def eval_reps(workflow, conf, tex):
    """Evaluate reproducibility across treatment replicates.

    Merges the per-replicate narrowPeak files, converts the union to bigBed,
    then computes bigWig correlation restricted to the merged peaks plus a
    peak-overlap QC via edwComparePeaks.  ``tex`` is accepted for interface
    compatibility but is not used in this function.
    """
    targets = conf.treatment_targets
    narrow_peaks = [t + ".narrowPeak" for t in targets]
    bigwigs = [t + ".bigWig" for t in targets]
    bigbeds = [t + ".narrowPeak.bigBed" for t in targets]

    attach_back(workflow, ShellCommand(
        """
        cat {param[narrowPeaks]} | sort -k1,1 -k2,2n - | bedtools merge -i - > {output[mergedPeak]}
        bedToBigBed {output[mergedPeak]} {param[chromsize]} {output[mergedPeakbb]}
        bigWigCorrelate -restrict={output[mergedPeakbb]} {param[bigwigs]} 1>{output[qc1]}
        {tool} {param[narrowPeaksbb]} {output[qc2]}
        """,
        tool="edwComparePeaks",
        input={"narrowPeaks": narrow_peaks,
               "bigwigs": bigwigs,
               "narrowPeakbbs": bigbeds},
        output={"mergedPeak": conf.prefix + "_merge.bed",
                "mergedPeakbb": conf.prefix + "_merged.bigBed",
                "qc1": conf.prefix + "_cor.qc",
                "qc2": conf.prefix + "_overlap.qc"},
        param={"narrowPeaksbb": " ".join(bigbeds),
               "narrowPeaks": " ".join(narrow_peaks),
               "bigwigs": " ".join(bigwigs),
               "chromsize": conf.get(conf.species, "chrom_len")}))
Пример #25
0
def star(workflow, conf, target, output, index):  # Mapping
    """
    Use star to map reads to genome,
    running STAR on the 100k-sampled fastq and attaching the command to the
    workflow.  The step may fail or dangle without aborting the pipeline.
    Returns the workflow for chaining.
    """
    star = attach_back(
        workflow,
        ShellCommand(
            "{tool} --genomeDir {param[index]} --runThreadN {param[NUM_THREADS]} --readFilesIn {input[fastq]} --outFileNamePrefix {param[prefix]}",
            tool="STAR",
            input={"fastq": target + "_100k.fastq"},
            output={"sam": output},
            param={
                "NUM_THREADS": conf.threads,
                "prefix": target,
                ## judge chosen species from basics section
                "index": index
            },
            name="star aln"))
    # NOTE(review): pulls config overrides from the "bowtie" section rather
    # than a "star" one — looks copy-pasted from bowtie(); confirm whether a
    # dedicated [star] section was intended before changing.
    star.update(param=conf.items("bowtie"))
    star.allow_dangling = True
    star.allow_fail = True
    return workflow
Пример #26
0
def replicates_bw_correlation(workflow,
                              conf):  ## correlation among different replicates
    """
    Use UCSC binary wigCorrelate to calculate reads density correlation
    between the treatment replicate bigWig tracks; the scores are written
    to ``<prefix>.cor`` and later collected into the QC json files.
    :param workflow: samflow class
    :param conf: parsed config files
    :return: void
    """
    bigwigs = [t + "_treat.bw" for t in conf.treatment_targets]
    cor = attach_back(
        workflow,
        ShellCommand(
            "{tool} {param[input_list]} > {output}",
            tool="wigCorrelate",
            input=bigwigs,
            output=conf.prefix + ".cor",
            param={"input_list": " ".join(bigwigs)},
            name="correlation between bigwiggle"))
    cor.allow_fail = True  # in case 0 peak in macs2
    cor.allow_dangling = True
Пример #27
0
def render_pdf(workflow, conf, long=True):
    """Assemble the LaTeX report and compile it to PDF.

    Sets up the LaTeX environment, builds the summary table, merges the
    section .tex files, then runs pdflatex (twice, for cross-references)
    into the target directory.  The compile step is allowed to fail.
    """
    latex_environ(workflow, conf)
    summary_table_latex(workflow, conf)
    merge_latex(workflow, conf)
    report = attach_back(
        workflow,
        ShellCommand(
            # Somehow the pdflatex has to be invoked twice..
            "{tool} -output-directory {output[dir]} -jobname={param[name]} {input} \
                    && {tool} -output-directory {output[dir]} -jobname={param[name]} {input}",
            tool="pdflatex",
            input=conf.prefix + "_report.tex",
            # output[pdf] should use "conf.prefix" to have the absolute path
            output={"dir": conf.target_dir,
                    "pdf": conf.prefix + "_report.pdf"},
            # param[name] should use "conf.id" to avoid using absolute path
            param={"name": conf.id + "_report"},
            name="report"))
    report.allow_fail = True
Пример #28
0
def stat_bwa(workflow, conf): ## use samtools to parse mappable reads from bwa
    """
    bam files are filtered by samtools -q 1, so mapped reads are considered to be unique

    Counts total reads (from the SAM) and mapped reads (flagstat) per
    sample, collects the numbers into a JSON QC file, and optionally
    renders a cumulative comparison plot in long-report mode.
    """
    for t in conf.sample_targets:
        stat = attach_back(workflow, ShellCommand(
        """
        {tool} view -Sc {input[sam]} > {output[total]}
        {tool} flagstat {input[bam]} > {output[stat]}
        """,
        tool = "samtools",
        input = {"bam": t + ".bam",
                 "sam": t + ".sam"},
        output = {"stat": t + "_mapped.bwa",
                  "total": t + "_total.bwa"}))
        stat.allow_fail = True
        stat.allow_dangling = True
    collect = attach_back(workflow, PythonCommand(json_bwa,
        input={"bwa_mapped": [ t + "_mapped.bwa" for t in conf.sample_targets ],
               "bwa_total": [ t + "_total.bwa" for t in conf.sample_targets ]},
        output={"json": conf.json_prefix+"_map.json"},
        param={"sample":conf.sample_bases},
        name="bwa qc"))
    collect.allow_dangling = True
    collect.allow_fail = True

    if conf.long:
        long_collect = attach_back(workflow, PythonCommand(bwa_figures,
                                            input = {"dbaccessor": resource_filename("chilin2.modules.dbaccessor", "ChiLinQC.db"),
                                                     "json": conf.json_prefix + "_map.json",
                                                     "template": resource_filename("chilin2.modules.summary", "R_culmulative_plot.R")},
                                            output = {"pdf": conf.prefix + "_bwa_compare.pdf", "R": conf.prefix+"_bwa_compare.R"},
                                            param = {"sample": conf.sample_bases}))
        long_collect.allow_fail = True
        # fix: `allow_fail` was assigned twice; the second assignment was
        # clearly meant to be `allow_dangling`, matching every other command
        long_collect.allow_dangling = True
Пример #29
0
def velcro(workflow, conf):
    """Compute the fraction of the top 5000 peaks that overlap velcro
    (blacklist) regions and write the ratio to ``<prefix>.velcro``."""
    if conf.get("macs2", "type") in ["both", "narrow"]:
        peaks = conf.prefix + "_sort_peaks.narrowPeak"
    else:
        peaks = conf.prefix + "_b_sort_peaks.broadPeak"
    overlap = attach_back(
        workflow,
        ShellCommand(
            """
                    n=$(head -n {param[p]} {input[MACS2_bed]} | wc -l)
                    velcro=$(head -n {param[p]} {input[MACS2_bed]} | {tool} -wa -u -a - -b {input[velcro_peaks_bed]} | wc -l)
                    velcro=$(echo \"scale=5;$velcro/$n\" | bc)
                    echo $velcro > {output}
                    """,
            tool="intersectBed",
            input={"MACS2_bed": peaks,
                   "velcro_peaks_bed": conf.get(conf.get("basics", "species"),
                                                "velcro")},
            output=conf.prefix + ".velcro",
            param={"p": 5000},
            name="velcro overlap"))
    overlap.allow_fail = True
    overlap.allow_dangling = True
Пример #30
0
def _macs2(workflow, conf):
    """Merged-sample MACS2 peak calling.

    Merges treatment (and, when present, control) BAMs, runs macs2 callpeak,
    trims the resulting bedGraphs to chromosome bounds, converts both to
    bigWig, and collects a MACS2 summary QC json/plot.
    """
    # merge all treatments into one
    merge_bams_treat = ShellCommand(
        "{tool} merge {output[merged]} {param[bams]}",
        tool="samtools",
        input=[target + ".bam" for target in conf.treatment_targets],
        output={"merged": conf.prefix + "_treatment.bam"})
    merge_bams_treat.param = {"bams": " ".join(merge_bams_treat.input)}

    if len(conf.treatment_targets) > 1:
        attach_back(workflow, merge_bams_treat)
    else:
        # when there's only one treatment sample, use copying instead of merging
        attach_back(workflow, make_copy_command(merge_bams_treat.input[0], merge_bams_treat.output["merged"]))

    # merging step will be skipped if control sample does not exist
    # So be careful to check whether there are control samples before using `_control.bam`
    if len(conf.control_targets) > 1:
        merge_bams_control = merge_bams_treat.clone
        merge_bams_control.input = [target + ".bam" for target in conf.control_targets]
        merge_bams_control.output = {"merged": conf.prefix + "_control.bam"}
        merge_bams_control.param = {"bams": " ".join(merge_bams_control.input)}
        attach_back(workflow, merge_bams_control)
    elif len(conf.control_targets) == 1:
        attach_back(workflow, make_copy_command(conf.control_targets[0] + ".bam", conf.prefix + "_control.bam"))

    # NOTE(review): `--shiftsize` is the pre-2.1 MACS2 flag (newer releases
    # use --extsize, as the other callers in this file do) — confirm against
    # the macs2 version this pipeline ships with.
    macs2_on_merged = attach_back(workflow, ShellCommand(
        "{tool} callpeak -B -q 0.01 --keep-dup {param[keep_dup]} --shiftsize={param[shiftsize]} --nomodel \
        {param[treat_opt]} {param[control_opt]} -n {param[description]}",
        tool="macs2",
        input={"treat": conf.prefix + "_treatment.bam"},
        output={"peaks": conf.prefix + "_peaks.bed",
                "summit": conf.prefix + "_summits.bed",
                "treat_bdg": conf.prefix + "_treat_pileup.bdg",
                "ENCODE": conf.prefix + "_peaks.encodePeak",
                "peaks_xls": conf.prefix + "_peaks.xls",
                "control_bdg": conf.prefix + "_control_lambda.bdg"},
        param={"description": conf.prefix,
               "keep_dup": 1,
               "shiftsize": 73},
        name="macs2_callpeak_merged"))
    macs2_on_merged.param["treat_opt"] = "-t " + macs2_on_merged.input["treat"]

    # control option is skipped if control samples does not exist
    if len(conf.control_targets) >= 1:
        macs2_on_merged.input["control"] = conf.prefix + "_control.bam"
        macs2_on_merged.param["control_opt"] = "-c " + macs2_on_merged.input["control"]
    else:
        macs2_on_merged.param["control_opt"] = ""

    macs2_on_merged.update(param=conf.items("macs2"))


    # For bedGraphToBigwiggle bugs, we need to remove coordinates over-border coordinates
    # As _control_lambda.bdg always exist. There are no need to check whether there are control samples.
    bdg_trim_control = attach_back(workflow,
        ShellCommand(
            '{tool} intersect -a {input[bdg]} -b {input[chrom_bed]} -wa -f 1.00 > {output}',
            tool="bedtools",
            input={"bdg": conf.prefix + "_control_lambda.bdg",
                   'chrom_bed': conf.get_path("lib", "chrom_bed")},
            output=conf.prefix + "_control_lambda.bdg.tmp",
            name="bedGraph filtering"))

    bdg_trim_treat = bdg_trim_control.clone
    bdg_trim_treat.input["bdg"] = conf.prefix + "_treat_pileup.bdg"
    bdg_trim_treat.output = conf.prefix + "_treat_pileup.bdg.tmp"
    attach_back(workflow, bdg_trim_treat)

    # NOTE(review): the next two variable names are swapped relative to the
    # data they process — `bdg2bw_treat` converts the *control* bedGraph and
    # `bdg2bw_control` the treatment one.  Both files are converted, so the
    # net behavior is correct; only the names mislead.
    bdg2bw_treat = attach_back(workflow,
        ShellCommand(
            "{tool} {input[bdg]} {input[chrom_len]} {output[bw]}",
            tool="bedGraphToBigWig",
            input={"bdg": conf.prefix + "_control_lambda.bdg.tmp",
                   "chrom_len": conf.get("lib", "chrom_len")},
            output={"bw": conf.prefix + "_control.bw"},
            name="bdg_to_bw"))

    # prototype used here to do the similar thing on treatment file
    bdg2bw_control = bdg2bw_treat.clone
    bdg2bw_control.input["bdg"] = conf.prefix + "_treat_pileup.bdg.tmp"
    bdg2bw_control.output["bw"] = conf.prefix + "_treat.bw"
    attach_back(workflow, bdg2bw_control)

    attach_back(workflow, PythonCommand(
        stat_macs2,
        input={"macs2_peaks_xls": conf.prefix + "_peaks.xls",
               "db": ChiLinQC_db,
               "template": rlang_template},
        output={"json": conf.json_prefix + "_macs2.json",
                "R": conf.prefix + "_macs2.R",
                "pdf": conf.prefix + "_macs2.pdf"},
        param={"id": conf.id},
        name="MACS2 summary"))
Пример #31
0
 def test_invoke_collect_output(self):
     """A ShellCommand with stdout collection should capture echo's output."""
     cmd = ShellCommand("echo test_collect").set_stdout_collecting()
     self.assertTrue(cmd.invoke())
     self.assertEqual("test_collect\n", cmd.result)
Пример #32
0
 def test_invoke_non_exist_input(self):
     """Invocation must report failure when a declared input file is absent."""
     cmd = ShellCommand("cat < {input}", input="non_exist_file")
     self.assertFalse(cmd.invoke())
Пример #33
0
def macs2_rep(workflow, conf):
    """Per-replicate MACS2 peak calling on each treatment target.

    Depending on [macs2] type, runs narrow and/or broad callpeak per
    replicate, renames peaks/summits via awk, sorts peaks by score, trims
    the bedGraphs to chromosome bounds, converts them to bigWig, and
    finally collects per-replicate MACS2 QC.
    """
    # Though macs command already exists, I choose not to use prototype here
    # Because the prototype definition and usage might be far from each other, making codes not readable
    if conf.get("tool", "macs2"):
        macs2_bin = conf.get("tool", "macs2")
    else:
        macs2_bin = "macs2"

    # paired-end data needs BAMPE so MACS2 uses real fragment sizes
    format = " -f BAMPE " if conf.pe  else " "

    for target in conf.treatment_targets:
        ## DNase, H3K4, H2AZ, all acetyl marks, or TF
        if conf.get("macs2", "type").lower() in ["both", "narrow"]: ## for DNase, H3K4, H2AZ, all acetyl marks, or TF
            macs2_on_rep_narrow = attach_back(workflow,
                                              ShellCommand(
                                                  """
                                                  {tool} callpeak --SPMR -B -q {param[fdr]} --keep-dup {param[keep_dup]} --extsize={param[extsize]} --nomodel -g {param[species]} {param[format]} {param[treat_opt]} {param[control_opt]} -n {param[description]} && cut -f1,2,3,4,9 {output[peaks]} > {output[bedtmp]}
                                                  ## remove weird path characters
                                                  cp {output[peaks]} {output[peakstmp]}
                                                  cp {output[summits]} {output[summitstmp]}
                                                  awk \'{{OFS="\\t";n+=1;$4="peak"n;print $0}}\' {output[peakstmp]} > {output[peaks]}
                                                  awk \'{{OFS="\\t";n+=1;$4=n;print $1,$2,$3,"peak"$4,$5}}\' {output[bedtmp]} > {output[bed]}
                                                  awk \'{{OFS="\\t";n+=1;$4=n;print $1,$2,$3,"peak"$4,$5}}\' {output[summitstmp]} > {output[summits]}
                                                  """,
                                                  tool=macs2_bin,
                                                  input={"treat": target + ".bam"},
                                                  output={"peaks": target + "_peaks.narrowPeak",
                                                          "peakstmp": target + "_peaks.narrowPeak.tmp",
                                                          "summits": target + "_summits.bed",
                                                          "summitstmp": target + "_summits.bed.tmp",
                                                          "bed": target + "_peaks.bed",
                                                          "bedtmp": target + "_peaks.bed.tmp",
                                                          "treat_bdg": target + "_treat_pileup.bdg",
                                                          "peaks_xls": target + "_peaks.xls",
                                                          "control_bdg": target + "_control_lambda.bdg"},
                                                  param={"description": target, "keep_dup": 1, "extsize": 73*2, "species": "hs", "fdr":0.01, "format": format},
                                                  name="macs2_callpeak_rep"))
            macs2_on_rep_narrow.param["treat_opt"] = "-t " + macs2_on_rep_narrow.input["treat"]

            # sort narrow peaks by column 9 (score) descending
            # NOTE(review): unlike its broad counterpart below, this sort
            # has no name and no allow_fail/allow_dangling flags — confirm
            # whether that asymmetry is intended.
            sort = attach_back(workflow,
                               ShellCommand(
                                   "{tool} -r -g -k 9 {input} > {output}",
                                   tool = "sort",
                                   input = target + "_peaks.narrowPeak",
                                   output = target + "_sort_peaks.narrowPeak"))

            # control option is skipped if control samples does not exist
            # (the control is the MERGED control BAM, shared by all replicates)
            if len(conf.control_targets) >= 1:
                macs2_on_rep_narrow.input["control"] = conf.prefix + "_control.bam"
                macs2_on_rep_narrow.param["control_opt"] = "-c " + macs2_on_rep_narrow.input["control"]

            else:
                macs2_on_rep_narrow.param["control_opt"] = ""
            macs2_on_rep_narrow.update(param=conf.items("macs2"))
            macs2_on_rep_narrow.allow_dangling = True
            macs2_on_rep_narrow.allow_fail = True


        if conf.get("macs2", "type").lower() in ["both", "broad"]:  # K9, K36, K79 and K27 methylation, both for chromatin regulator, all other histone marks
            macs2_on_rep_broad = attach_back(workflow,
                                             ShellCommand(
                                                 """
                                                 {tool} callpeak --SPMR -B -q {param[fdr]} {param[treat_opt]} {param[control_opt]} --keep-dup {param[keep_dup]} --broad --broad-cutoff {param[fdr]} -g {param[species]} {param[format]} -n {param[description]} && cut -f1,2,3,4,9 {output[peaks]} > {output[bedtmp]}
                                                 ## remove weird path characters
                                                 cp {output[peaks]} {output[peakstmp]}
                                                 awk \'{{OFS="\\t";n+=1;$4="peak"n;print $0}}\' {output[peakstmp]} > {output[peaks]}
                                                 awk \'{{OFS="\\t";n+=1;$4=n;print $1,$2,$3,"peak"$4,$5}}\' {output[bedtmp]} > {output[bed]}
                                                 """,
                                                 tool=macs2_bin,
                                                 input = {"treat": target + ".bam"},
                                                 output = {"peaks": target + "_b_peaks.broadPeak",
                                                           "peakstmp": target + "_b_peaks.broadPeak.tmp",
                                                           "bed": target + "_b_peaks.bed",
                                                           "bedtmp": target + "_b_peaks.bed.tmp",
                                                           "treat_bdg": target + "_b_treat_pileup.bdg",
                                                           "peaks_xls": target + "_b_peaks.xls",
                                                           "control_bdg": target + "_b_control_lambda.bdg"},
                                                 param = {"description": target + "_b",
                                                          "species": "hs",
                                                          "format": format,
                                                          "fdr": 0.01},
                                                 name = " macs2 broad peaks"))
            macs2_on_rep_broad.param["treat_opt"] = " -t " + macs2_on_rep_broad.input["treat"]
            macs2_on_rep_broad.update(param=conf.items("macs2"))
            macs2_on_rep_broad.allow_dangling = True
            macs2_on_rep_broad.allow_fail=True

            if len(conf.control_targets) >= 1:
                macs2_on_rep_broad.input["control"] = conf.prefix + "_control.bam"
                macs2_on_rep_broad.param["control_opt"] = "-c " + macs2_on_rep_broad.input["control"]
            else:
                macs2_on_rep_broad.param["control_opt"] = ""
            ## some broad peaks cannot be called
            # NOTE(review): this second update() repeats the one a few lines
            # above — redundant but harmless (same values merged twice).
            macs2_on_rep_broad.update(param=conf.items("macs2"))

            sort = attach_back(workflow,
                               ShellCommand(
                                   "{tool} -r -g -k 9 {input} > {output}",
                                   tool = "sort",
                                   input = target + "_b_peaks.broadPeak",
                                   output = target + "_b_sort_peaks.broadPeak",
                                   name = "sort broad peaks"))
            sort.allow_dangling = True
            sort.allow_fail=True

        ## For bedGraphToBigwiggle bugs, we need to remove coordinates outlier
        # when type is "both", the narrow branch wins (its check runs last)
        if conf.get("macs2", "type").lower() in ["both", "broad"]:
            cont_bdg = target + "_b_control_lambda.bdg"
            treat_bdg = target + "_b_treat_pileup.bdg"
        if conf.get("macs2", "type").lower() in ["both", "narrow"]:
            cont_bdg = target + "_control_lambda.bdg"
            treat_bdg = target + "_treat_pileup.bdg"
        import os
        bdg_trim_controlrep = attach_back(workflow,
                                          ShellCommand(
                                              '{tool} intersect -a {input} -b {param[chrom_bed]} -wa -f 1.00 > {output}',
                                              tool="bedtools",
                                              input=cont_bdg,
                                              output=cont_bdg + ".tmp",
                                              param={"chrom_bed": os.path.join(conf.target_dir, "chrom.bed")},
                                              name="bedGraph control replicate filtering"))

        bdg_trim_controlrep.allow_dangling = True
        bdg_trim_controlrep.allow_fail=True

        # clone the control-trim command and repoint it at the treatment bdg
        bdg_trim_treatrep = bdg_trim_controlrep.clone
        bdg_trim_treatrep.input = treat_bdg
        bdg_trim_treatrep.output = treat_bdg + ".tmp"

        bdg_trim_treatrep.allow_dangling = True
        bdg_trim_treatrep.allow_fail=True

        attach_back(workflow, bdg_trim_treatrep)

        bdg2bw_treatrep = attach_back(workflow,
                                      ShellCommand(
                                          "{tool} {input} {param[chrom_len]} {output}",
                                          tool="bedGraphToBigWig",
                                          input=treat_bdg+".tmp",
                                          output=target + "_treat.bw",
                                          param={"chrom_len": conf.get_path(conf.get("basics", "species"), "chrom_len")},
                                          name="bdg_to_bw treat"))
        ## in case broad peaks calling failed
        bdg2bw_treatrep.allow_dangling = True
        bdg2bw_treatrep.allow_fail=True

        # prototype used here to do the similar thing on treatment file
        bdg2bw_controlrep = bdg2bw_treatrep.clone
        bdg2bw_controlrep.input = cont_bdg + ".tmp"
        bdg2bw_controlrep.output = target + "_control.bw"
        attach_back(workflow, bdg2bw_controlrep)

        ## in case broad peaks calling failed
        bdg2bw_controlrep.allow_dangling = True
        bdg2bw_controlrep.allow_fail=True

    stat_macs2_on_rep(workflow, conf)
Пример #34
0
def macs2(workflow, conf):
    """Attach MACS2 peak calling on the merged treatment (and optional
    control) BAMs to *workflow*.

    Depending on the ``macs2.type`` config value ("narrow", "broad" or
    "both"), narrow and/or broad peak-calling commands are attached,
    followed by peak renaming/sorting, bedGraph trimming against the
    chromosome bounds, and bedGraph-to-bigWig conversion, ending with
    ``stat_macs2`` summary statistics.

    :param workflow: samflow workflow object that ShellCommands are attached to
    :param conf: parsed configuration object (sections: tool, macs2, basics, ...)
    :return: None; *workflow* is mutated in place
    """
    # Prefer an explicitly configured macs2 binary; fall back to PATH lookup.
    if conf.get("tool", "macs2"):
        macs2_bin = conf.get("tool", "macs2")
    else:
        macs2_bin = "macs2"

    # Paired-end data needs an explicit BAMPE format flag.
    # (local renamed from `format` to avoid shadowing the builtin)
    fmt = "-f BAMPE" if conf.pe else " "

    if conf.get("macs2", "type").lower() in ["both", "narrow"]: ## for DNase, H3K4, H2AZ, all acetyl marks, or TF
        macs2_on_merged_narrow = attach_back(workflow, ShellCommand(
            """
            {tool} callpeak --SPMR -B -q {param[fdr]} --keep-dup {param[keep_dup]} --extsize={param[extsize]} --nomodel -g {param[species]} {param[format]} {param[treat_opt]} {param[control_opt]} -n {param[description]} && cut -f1,2,3,4,9 {output[peaks]} > {output[bedtmp]}
            ## remove weird path characters
            cp {output[peaks]} {output[peakstmp]}
            cp {output[summits]} {output[summitstmp]}
            awk \'{{OFS="\\t";n+=1;$4="peak"n;print $0}}\' {output[peakstmp]} > {output[peaks]}
            awk \'{{OFS="\\t";n+=1;$4=n;print $1,$2,$3,"peak"$4,$5}}\' {output[bedtmp]} > {output[bed]}
            awk \'{{OFS="\\t";n+=1;$4=n;print $1,$2,$3,"peak"$4,$5}}\' {output[summitstmp]} > {output[summits]}
            """,
            tool=macs2_bin,
            input={"treat": conf.prefix + "_treatment.bam"},
            output={"peaks": conf.prefix + "_peaks.narrowPeak",
                    "peakstmp": conf.prefix + "_peaks.narrowPeak.tmp",
                    "bed": conf.prefix + "_peaks.bed",
                    "bedtmp": conf.prefix + "_peaks.bed.tmp",
                    "summits": conf.prefix + "_summits.bed",
                    "summitstmp": conf.prefix + "_summits.bed.tmp",
                    "treat_bdg": conf.prefix + "_treat_pileup.bdg",
                    "peaks_xls": conf.prefix + "_peaks.xls",
                    "control_bdg": conf.prefix + "_control_lambda.bdg"},
            param={"description": conf.prefix,
                   "keep_dup": 1,
                   "format": fmt,
                   "extsize": 73 * 2, # extsize=2*shiftsize
                   "fdr": 0.01,
                   "species": "hs"},
            name="macs2_callpeak_merged"))

        macs2_on_merged_narrow.param["treat_opt"] = "-t " + macs2_on_merged_narrow.input["treat"]
        # control option is skipped if control samples does not exist
        if len(conf.control_targets) >= 1:
            macs2_on_merged_narrow.input["control"] = conf.prefix + "_control.bam"
            macs2_on_merged_narrow.param["control_opt"] = "-c " + macs2_on_merged_narrow.input["control"]
        else:
            macs2_on_merged_narrow.param["control_opt"] = ""

        # let the [macs2] config section override the defaults above
        macs2_on_merged_narrow.update(param=conf.items("macs2"))

        # sort by the score columns (col 9 of narrowPeak, col 5 of summits)
        sort = attach_back(workflow,
                           ShellCommand(
                               """{tool} -r -g -k 9 {input[peaks]} > {output[p]}
                               {tool} -r -g -k 5 {input[summits]} > {output[s]}
                               """,
                               tool = "sort",
                               input = {"peaks": conf.prefix + "_peaks.narrowPeak",
                                        "summits": conf.prefix + "_summits.bed"},
                               output = {"p":conf.prefix + "_sort_peaks.narrowPeak",
                                         "s":conf.prefix + "_sort_summits.bed"},
                               name = "sort peaks"))
        macs2_on_merged_narrow.allow_fail = True
        macs2_on_merged_narrow.allow_dangling = True
        sort.allow_fail = True
        sort.allow_dangling = True

    if conf.get("macs2", "type").lower() in ["both", "broad"]:  # K9, K36, K79 and K27 methylation, both for chromatin regulator, all other histone marks
        macs2_on_merged_broad = attach_back(workflow,
                                            ShellCommand(
                                                """
                                                {tool} callpeak --SPMR -B -q {param[fdr]} {param[treat_opt]} {param[control_opt]} --keep-dup {param[keep_dup]} --broad --broad-cutoff {param[fdr]} -g {param[species]} {param[format]} -n {param[description]} && cut -f1,2,3,4,9 {output[peaks]} > {output[bedtmp]}
                                                ## remove weird path characters
                                                cp {output[peaks]} {output[peakstmp]}
                                                awk \'{{OFS="\\t";n+=1;$4="peak"n;print $0}}\' {output[peakstmp]} > {output[peaks]}
                                                awk \'{{OFS="\\t";n+=1;$4=n;print $1,$2,$3,"peak"$4,$5}}\' {output[bedtmp]} > {output[bed]}
                                                """,
                                                tool=macs2_bin,
                                                input = {"treat": conf.prefix + "_treatment.bam"},
                                                output = {"peaks": conf.prefix + "_b_peaks.broadPeak",
                                                          "peakstmp": conf.prefix + "_b_peaks.broadPeak.tmp",
                                                          "bed": conf.prefix + "_b_peaks.bed",
                                                          "bedtmp": conf.prefix + "_b_peaks.bed.tmp",
                                                          "treat_bdg": conf.prefix + "_b_treat_pileup.bdg",
                                                          "peaks_xls": conf.prefix + "_b_peaks.xls",
                                                          "control_bdg": conf.prefix + "_b_control_lambda.bdg"},
                                                param = {"description": conf.prefix + "_b",
                                                         "species": "hs",
                                                         "format": fmt,
                                                         "keep_dup": 1, "fdr": 0.01},
                                                name = "broad peaks calling"))
        macs2_on_merged_broad.param["treat_opt"] = " -t " + macs2_on_merged_broad.input["treat"]
        macs2_on_merged_broad.allow_fail = True
        macs2_on_merged_broad.allow_dangling = True

        # control option is skipped if control samples does not exist
        if len(conf.control_targets) >= 1:
            macs2_on_merged_broad.input["control"] = conf.prefix + "_control.bam"
            macs2_on_merged_broad.param["control_opt"] = "-c " + macs2_on_merged_broad.input["control"]
        else:
            macs2_on_merged_broad.param["control_opt"] = ""
        macs2_on_merged_broad.update(param=conf.items("macs2"))

        sort = attach_back(workflow,
                    ShellCommand(
                        """{tool} -r -g -k 9 {input[peaks]} > {output[p]}
                        """,
                        tool = "sort",
                        input = {"peaks": conf.prefix + "_b_peaks.broadPeak"},
                        output = {"p": conf.prefix + "_b_sort_peaks.broadPeak"},
                        name = "sort peaks files"))
        sort.allow_dangling = True
        sort.allow_fail = True

    # For bedGraphToBigwiggle bugs, we need to remove coordinates over-border coordinates
    # As _control_lambda.bdg always exist. There are no need to check whether there are control samples.
    # NOTE: when type is "both", the narrow branch wins below — only the
    # narrow bedGraphs are converted to bigWig.
    if conf.get("macs2", "type").lower() in ["both", "broad"]:
        cont_bdg = conf.prefix + "_b_control_lambda.bdg"
        treat_bdg = conf.prefix + "_b_treat_pileup.bdg"
    if conf.get("macs2", "type").lower() in ["both", "narrow"]:
        cont_bdg = conf.prefix + "_control_lambda.bdg"
        treat_bdg = conf.prefix + "_treat_pileup.bdg"
    import os
    # keep only bedGraph intervals fully (-f 1.00) inside chromosome bounds
    bdg_trim_control = attach_back(workflow,
                                   ShellCommand(
                                       '{tool} intersect -a {input[bdg]} -b {param[chrom_bed]} -wa -f 1.00 > {output}',
                                       tool="bedtools",
                                       input={"bdg": cont_bdg},
                                       param = {"chrom_bed": os.path.join(conf.target_dir, "chrom.bed")},
                                       output=cont_bdg+".tmp",
                                       name="bedGraph filtering control"))
    # bugfix: was `bdg_trim_control.fail = True`, which sets an unused
    # attribute; every other command in this module uses `allow_fail`
    bdg_trim_control.allow_fail = True
    bdg_trim_control.allow_dangling = True

    bdg_trim_treat = bdg_trim_control.clone
    bdg_trim_treat.input["bdg"] = treat_bdg
    bdg_trim_treat.output = treat_bdg + ".tmp"
    bdg_trim_treat.allow_fail = True  # bugfix: was the no-op `.fail = True`
    bdg_trim_treat.allow_dangling = True

    attach_back(workflow, bdg_trim_treat)

    # control bedGraph -> bigWig (local renamed from `bdg2bw_treat`, which
    # was misleading: this command processes the *control* track)
    bdg2bw_control = attach_back(workflow,
                               ShellCommand(
                                   "{tool} {input[bdg]} {input[chrom_len]} {output[bw]}",
                                   tool="bedGraphToBigWig",
                                   input={"bdg": cont_bdg+".tmp",
                                          "chrom_len": conf.get_path(conf.get("basics", "species"), "chrom_len")},
                                   output={"bw": conf.prefix + "_control.bw"},
                                   name="bdg_to_bw control"))
    ## in case broad peaks failed
    bdg2bw_control.allow_fail = True
    bdg2bw_control.allow_dangling = True

    # prototype used here to do the similar thing on treatment file
    bdg2bw_treat = bdg2bw_control.clone
    bdg2bw_treat.input["bdg"] = treat_bdg+".tmp"
    bdg2bw_treat.output["bw"] = conf.prefix + "_treat.bw"

    ## in case broad peaks failed
    bdg2bw_treat.allow_fail = True
    bdg2bw_treat.allow_dangling = True

    attach_back(workflow, bdg2bw_treat)

    # summary statistics on the merged peak calls
    stat_macs2(workflow, conf)
Пример #35
0
 def test_invoke_non_exist_output(self):
     """invoke() must report failure when a declared output file is never produced."""
     # "echo tempfile3" prints to stdout but creates no file, so the
     # declared output "tempfile3" stays missing after the command runs.
     cmd = ShellCommand("echo tempfile3", output="tempfile3")
     self.assertFalse(cmd.invoke())
Пример #36
0
 def test_invoke_dangling_tool(self):
     """invoke() must report failure when the configured tool does not exist."""
     # "wolfyp" is deliberately not an executable on PATH (a dangling tool).
     cmd = ShellCommand("{tool} fun", tool="wolfyp")
     self.assertFalse(cmd.invoke())