def _star_sam2bam(workflow, conf): # SAM -> BAM
    """ convert SAM to BAM and use mapping quality as cutoff

    STAR marks uniquely-mapped reads with MAPQ 255, so `-q 255` keeps
    unique alignments only; a chrM-free BAM/SAM pair is then derived
    for downstream QC.

    :param workflow: samflow defined class
    :param conf: parsed config file
    :return: void
    """
    import os
    for target in conf.sample_targets:
        sam2bam = attach_back(
            workflow,
            ShellCommand("""
ln -s {input[sam]} {output[sam]}
{tool} view -q 255 -bt {param[genome]} {input[sam]} -o {output[bam]}
""",
                         tool="samtools",
                         input={"sam": target + "Aligned.out.sam"},
                         output={
                             "bam": target + ".bam",
                             "sam": target + ".sam"
                         },
                         param={
                             "genome": conf.get(conf.get("basics", "species"), "chrom_len"),
                         },
                         name="star sam2bam"))  # BUGFIX: was misspelled "star sam2dam"
        # BUGFIX: the [sam2bam] config section must be applied to this command,
        # not to the workflow object (mirrors sam2bamnochrm.update below).
        sam2bam.update(param=conf.items("sam2bam"))
        # From bwa/dc.py
        sam2bamnochrm = attach_back(
            workflow,
            ## use mapping quality 1 defined by samtools official FAQ
            ShellCommand(
                """
awk \'BEGIN{{OFS="\\t"}} {{print $1,0,$2}}\' {param[genome]} > {param[chrom_bed]}
grep -v chrM {param[chrom_bed]} > {output[nochrmbed]}
{tool} view -h -b -L {output[nochrmbed]} {input[bam]} > {output[nochrmbam]}
{tool} view -h {output[nochrmbam]} > {output[nochrmsam]}
{tool} view -h {input[bam]} > {output[usam]}
""",
                tool="samtools",
                input={"bam": target + ".bam"},
                output={
                    "nochrmbed": target + ".nochrM",
                    "nochrmbam": target + "_nochrM.bam",
                    "usam": target + "_u.sam",  ## uniquely mapping sam for sampling
                    "nochrmsam": target + "_nochrM.sam"
                },
                param={
                    "tmp_bam": target + ".tmp.bam",
                    "output_prefix": target,
                    "chrom_bed": os.path.join(conf.target_dir, "chrom.bed"),
                    "mapq": 1,
                    "genome": conf.get(conf.get("basics", "species"), "chrom_len")
                },
                name="filtering mapping and convert"))
        # Use 5G memory as default
        sam2bamnochrm.update(param=conf.items("sam2bam"))
def merge_bams(workflow, conf): ## merge input and chip bam
    """
    input multiple input and multiple control to merge into one file separately

    Merges all treatment BAMs into `<prefix>_treatment.bam` and, when
    controls exist, all control BAMs into `<prefix>_control.bam`.
    Single-sample groups are copied (via make_link_command) instead of merged.

    NOTE(review): an essentially identical `merge_bams` is defined again
    later in this file; the later definition shadows this one at import
    time — confirm which is intended to survive.

    :return:
    """
    # merge all treatments into one
    merge_bams_treat = ShellCommand(
        "{tool} merge {output[merged]} {param[bams]}",
        tool="samtools",
        input=[target + ".bam" for target in conf.treatment_targets],
        output={"merged": conf.prefix + "_treatment.bam"})
    # the merge command needs the bam list as one space-separated string
    merge_bams_treat.param = {"bams": " ".join(merge_bams_treat.input)}
    merge_bams_treat.allow_fail = True
    merge_bams_treat.allow_dangling = True
    if len(conf.treatment_targets) > 1:
        attach_back(workflow, merge_bams_treat)
    else:
        # when there's only one treatment sample, use copying instead of merging
        attach_back(workflow,
                    make_link_command(merge_bams_treat.input[0],
                                      merge_bams_treat.output["merged"]))
    # merging step will be skipped if control sample does not exist
    # So be careful to check whether there are control samples before using `_control.bam`
    if len(conf.control_targets) > 1:
        # NOTE(review): `.clone` is attribute access, not a call — presumably a
        # samflow property returning a copy; confirm it deep-copies input/output
        merge_bams_control = merge_bams_treat.clone
        merge_bams_control.input = [target + ".bam" for target in conf.control_targets]
        merge_bams_control.output = {"merged": conf.prefix + "_control.bam"}
        merge_bams_control.param = {"bams": " ".join(merge_bams_control.input)}
        attach_back(workflow, merge_bams_control)
    elif len(conf.control_targets) == 1:
        attach_back(workflow,
                    make_link_command(conf.control_targets[0] + ".bam",
                                      conf.prefix + "_control.bam"))
def sample_bam_stat(workflow, conf, tex):
    """
    sample non chrm bam to 15M for NSC and PBC
    sample non chrm bam to 5M for spot

    Attaches eap_dnase_stats per treatment target; the SPOT step runs only
    for hotspot-style peak callers (skipped when the configured caller
    contains "macs").

    :param workflow: samflow workflow to attach commands to
    :param conf: parsed config file
    :param tex: latex handle (not referenced in this function)
    """
    for i, target in enumerate(conf.treatment_targets):
        ## for PE, use name sorted in order to calculate PBC
        input_bam = target + "_name_sorted.bam" if conf.pe else target + "_final_nochrm.bam"
        attach_back(workflow, ShellCommand(
            "{tool} {input[namesorted]} {param[run_spp]} {output[bamstat]} {output[sppstat]} {param[pe]} {output[pbc]}",
            tool = "eap_dnase_stats",
            input = {"namesorted": input_bam},
            output = {"bamstat": target + "_bam_stat.qc", ## 15M
                      "sppstat": target + "_spp.qc",
                      "pbc": target + "_final_nochrm_15M_pbc.qc"},
            param = {"pe": "pe" if conf.pe else "se",
                     "run_spp": conf.get("tool", "spp")}))
        # idiom fix: `x not in y` instead of `not x in y`
        if "macs" not in conf.get("tool", "peak_calling"):
            attach_back(workflow, ShellCommand(
                "{tool} {input[bamwithoutchrm]} {param[genome]} {param[readsize]} {output[spot]} {param[hotspot_dir]} {param[hotspot_output]} {param[hotspot_tmp]} {param[spot_tmp]}",
                tool = "dac_spot",
                ## 5M
                input = {"bamwithoutchrm": target + "_final_nochrm.bam"},
                output = {"spot": target + "_spot_nochrm_5M.qc"},
                param = {"genome": conf.species,
                         "spot_tmp": conf.hotspot_reps_tmp_prefix[i] + "_final_nochrm.bam.5000000.spot.out",
                         "readsize": conf.readsize,
                         "hotspot_dir": conf.get("tool", "peak_calling"),
                         "hotspot_output": target + "_hotspot",
                         "hotspot_tmp": target + "_hotspot_tmp"}))
def read_quality(workflow, conf, tex):
    """Attach per-sample read-quality QC commands.

    Pair-end samples run dac_pe_read_quality over both mates (two .qc
    outputs); single-end samples run dac_se_read_quality per target.
    `tex` is only referenced in the commented-out reporting steps below.
    """
    if conf.pe:
        for raw, target in conf.treatment_pairs_pe:
            attach_back(
                workflow,
                ShellCommand(
                    "{tool} {input[fastq][0]} {input[fastq][1]} {output[stat][0]} {output[stat][1]}",
                    tool="dac_pe_read_quality",
                    input={"fastq": raw},
                    output={"stat": [i + "_read_quality.qc" for i in target]}))
        # attach_back(workflow, PythonCommand(stat_fastqStat,
        #     input = {"seq": [ [ p + "_100k.seq" for p in target ] for target in conf.treatment_pair_data ]},
        #     output = {"json": conf.json_prefix + "_seq_quality.json"},
        #     param = {"samples": conf.treatment_bases, "seq_type": conf.pe}))
        # attach_back(workflow, PythonCommand(
        #     seq_quality_doc,
        #     input = {"tex": tex, "json": conf.json_prefix + "_seq_quality.json"},
        #     output = {"seq": conf.latex_prefix + "seq_quality.tex", "len": conf.latex_prefix + "len.tex"},
        #     param = {"seq_type": conf.seq_type, "reps": len(conf.treatment_pairs),
        #              "pe_samples": conf.treatment_bases}))
    else:
        for raw, target in conf.treatment_pairs:
            sample_fq = {"stat": target + "_read_quality.qc"}
            attach_back(
                workflow,
                ShellCommand(
                    "{tool} {input} {output[stat]}",
                    tool="dac_se_read_quality",
                    input=raw,
                    output=sample_fq,
                    name="100k read sequence quality and sequence length"))
def _bwa(workflow, conf):
    """
    incorpate ENCODE ChIP-seq alignment parameters

    Attaches one eap_run_bwa_{pe,se} mapping step per treatment pair,
    then overlays the [bwa] config section onto the command params.
    """
    for raw, target in conf.treatment_pairs:
        shared_param = {"threads": conf.threads,
                        "index": conf.get(conf.species, "genome_index"),
                        "prefix": target + "_raw_sorted",
                        "qc2": target + "_rawbam_stats.qc"}
        mapping_output = {"bam": target + "_raw_sorted.bam",
                          "qc": target + "_rawbam.qc"}
        # pe and se differ only in wrapper tool, fastq arguments, and label
        if conf.pe:
            mapper = "eap_run_bwa_pe"
            step_name = "pair end mapping"
            template = "{tool} {param[threads]} {param[index]} {input[fastq][0]} {input[fastq][1]} {output[bam]} {output[qc]} {param[prefix]} {param[qc2]}"
        else:
            mapper = "eap_run_bwa_se"
            step_name = "single end mapping"
            template = "{tool} {param[threads]} {param[index]} {input[fastq]} {output[bam]} {output[qc]} {param[prefix]} {param[qc2]}"
        bwa = attach_back(workflow, ShellCommand(
            template,
            tool = mapper,
            input = {"fastq": raw},
            output = mapping_output,
            param = shared_param,
            name = step_name))
        bwa.update(param = conf.items("bwa"))
def filter_bam(workflow, conf, tex):
    """
    filter bam file by samtools and sample by ucsc app

    Attaches one post-filter command per treatment target; pair-end and
    single-end use different wrapper tools and — note — different
    positional argument orders in their templates.
    `tex` is unused in this function.
    """
    for target in conf.treatment_targets:
        input = {"raw": target + "_raw_sorted.bam"}
        if conf.pe:
            name = "pair"
            tool = "dac_bam_pe_post_filter"
            param = {
                "mapq": 3,
                # pe additionally produces a name-sorted bam (needed for PBC)
                "namesortedbamprefix": target + "_name_sorted",
                "finalprefix": target + "_final",
                "qc2": target + "_filter_bam_stats.qc"
            }
            output = {
                "finalbam": target + "_final.bam",
                "namesortedbam": target + "_name_sorted.bam",
                "bamwithoutchrm": target + "_final_nochrm.bam",
                "qc": target + "_filter_bam.qc"
            }
            attach_back(
                workflow,
                ShellCommand(
                    "{tool} {input[raw]} {param[namesortedbamprefix]} {output[namesortedbam]} {param[finalprefix]} {output[finalbam]} {param[mapq]} {output[bamwithoutchrm]} {output[qc]} {param[qc2]}",
                    tool=tool,
                    input=input,
                    output=output,
                    param=param,
                    name="%s end filtering" % name))
        else:
            name = "single"
            tool = "dac_bam_se_post_filter"
            param = {
                "mapq": 3,
                "finalprefix": target + "_final",
                "qc2": target + "_filter_bam_stats.qc"
            }
            output = {
                "finalbam": target + "_final.bam",
                "bamwithoutchrm": target + "_final_nochrm.bam",
                "qc": target + "_filter_bam.qc"
            }
            attach_back(
                workflow,
                ShellCommand(
                    "{tool} {input[raw]} {output[finalbam]} {param[mapq]} {output[qc]} {output[bamwithoutchrm]} {param[finalprefix]} {param[qc2]}",
                    tool=tool,
                    input=input,
                    output=output,
                    param=param,
                    name="%s end filtering" % name))
def bowtie(workflow, conf): # Mapping
    """
    Use bowtie to map reads to genome, call _bowtie_sam2bam to convert sam to bam
    :param workflow: samflow defined class
    :param conf: parsed config files
    :return: void
    """
    for target in conf.sample_targets:
        # -m 1 discards reads with more than one reportable alignment
        bowtie = attach_back(workflow, ShellCommand(
            "{tool} -p {param[NUM_THREADS]} -S -m 1 {param[index]} {input[fastq]} {output[sam]}",
            tool = "bowtie",
            input = {"fastq": target + ".fastq"},
            output = {"sam": target + ".sam"},
            param = {"NUM_THREADS": conf.threads,
                     ## judge chosen species from basics section
                     "index": conf.get_path(conf.get("basics", "species"), "genome_index")},
            name = "bowtie aln"))
        bowtie.update(param = conf.items("bowtie"))
        bowtie.allow_dangling = True
        bowtie.allow_fail = True
    _bowtie_sam2bam(workflow, conf)
    ## QC part--NOTE keeping the bwa legacy code!
    stat_bwa(workflow, conf)
    if conf.long:
        tex_bwa(workflow, conf)
def merge_latex(workflow, conf):
    """Concatenate the per-section .tex fragments into the final report .tex.

    The section list always starts with the begin/summary fragments and
    ends with the end fragment; the detailed QC sections are inserted
    only for long-form reports (conf.long).
    """
    ## begin and end of the docs
    sections = ["_begin.tex", "_summary_table.tex"]
    if conf.long:
        sections.extend([
            "_fastqc.tex",
            "_fastqc_gc.tex",
            "_map.tex",
            "_conserv.tex",
            # "_macs2.latex",
            "_macs2_on_sample.latex",
            # "_phan.tex",
            "_motif.tex",
            "_contam.tex",
            "_frip.tex",
        ])
    sections.append("_end.tex")
    tex_files = [conf.latex_prefix + suffix for suffix in sections]
    concat = attach_back(
        workflow,
        ShellCommand("cat {param[tex]} > {output}",
                     output=conf.prefix + "_report.tex"))
    concat.allow_fail = True
    concat.param = {"tex": " ".join(tex_files)}
def replicates_peaks_overlap(workflow, conf): # peaks bed from each replicate
    """Attach a pairwise peak-overlap count for every replicate pair.

    For each unordered pair (i, j) of treatment targets, intersectBed
    counts peaks overlapping by at least 30% (-f 0.3) into
    `<prefix>_<i>_<j>.overlap`.

    :param workflow: class from samflow
    :param conf: external parsed config file
    :return: workflow through attach_back
    """
    for i in range(len(conf.treatment_targets)):
        for j in range(i + 1, len(conf.treatment_targets)):
            replicates_overlap = attach_back(
                workflow,
                ShellCommand(
                    "{tool} -f {param[p]} -a {input[0]} -b {input[1]} | wc -l > {output}",
                    tool="intersectBed",
                    # narrowPeak or broadPeak files depending on the macs2 mode
                    input=[
                        conf.treatment_targets[i] + "_sort_peaks.narrowPeak"
                        if conf.get("macs2", "type").lower() in ["both", "narrow"]
                        else conf.treatment_targets[i] + "_b_sort_peaks.broadPeak",
                        conf.treatment_targets[j] + "_sort_peaks.narrowPeak"
                        if conf.get("macs2", "type").lower() in ["both", "narrow"]
                        else conf.treatment_targets[j] + "_b_sort_peaks.broadPeak"
                    ],
                    output=conf.prefix + "_%s_%s.overlap" % (i, j),
                    param={"p": 0.3},
                    name="Replicates peaks overlap QC"))
            replicates_overlap.allow_fail = True # in case 0 peak in macs2
            replicates_overlap.allow_dangling = True
            ## generate a barplot for meta distribution
            replicates_overlap.update(param=conf.items("replicates"))
    return workflow
def PBC(workflow, conf): # PBC1
    """
    Introduce ENCODE II library complexity assessment methods
    N1 / Nd, N1 is the location with exact one read, Nd is distinct location number
    :param workflow: samflow class
    :param conf: parsed config
    :return: void
    """
    for t in conf.sample_targets:
        # pipeline: bamToBed -> count reads per (chrom,start,end,strand)
        # -> histogram of counts -> N1/Nd ratio written to {output[pbc]}
        pbc1 = attach_back(
            workflow,
            ShellCommand(
                """
                bamToBed -i {input[bam]} | {tool} \'{{l[$1"\\t"$2"\\t"$3"\\t"$6]+=1}} END {{for(i in l) print l[i]}}\' \\
                | awk \'{{n[$1]+=1}} END {{for (i in n) print i"\\t"n[i]}}\' \\
                | sort -k1n - > {output[hist]}
                awk '{{ if (NR==1) {{N1=$2}} Nd+=$2 }} END {{print N1,Nd,N1/Nd}}' {output[hist]} > {output[pbc]}
                """,
                tool="awk",
                # conf.down selects the 4M-read downsampled bam
                input={"bam": t + "_4000000.bam" if conf.down else t + ".bam"},
                output={
                    "pbc": t + ".pbc",
                    "hist": t + ".hist"
                },
                name="PBC"))
        pbc1.allow_fail = True
        pbc1.allow_dangling = True
    ## QC part
    stat_pbc(workflow, conf)
def r_exec(jinja_template_r):
    """Run Rscript on a rendered jinja R template.

    :param jinja_template_r: a command object whose param dict carries
        "render_dump" (path of the rendered R script used as input) and
        "pdf" (the file the script produces, registered as output)

    NOTE(review): `.invoke()` presumably executes the command immediately
    rather than attaching it to a workflow — confirm against samflow.
    """
    ShellCommand("{tool} {input}",
                 tool = "Rscript",
                 name = 'Rscript',
                 input=jinja_template_r.param["render_dump"],
                 param={},
                 output=jinja_template_r.param["pdf"]).invoke()
def Phan(workflow, conf): # NSC, RSC, Qtag
    """
    for calculating NSC, RSC score at 4M level
    http://code.google.com/p/phantompeakqualtools/
    (1) Determine strand cross-correlation peak / predominant fragment length OR print out quality measures
        Rscript run_spp.R -c=<tagAlign/BAMfile> -savp -out=<outFile>
    """
    # peaks calling by SPP needs control, for phantomqc, we do both treat and control independently
    for t in conf.sample_targets:
        if conf.down: ## default, this option
            ibam = t + "_4000000.bam"
        # elif conf.unsc:  ## --total --unsc
        #     ibam = t + "_rmdup.bam"
        else: ## --total
            ibam = t + ".bam"
        # NOTE(review): relies on `os` being imported at module level — confirm;
        # other functions in this file import os inside the function body.
        attach_back(
            workflow,
            ShellCommand(
                "{tool} {param[script]} -c={input[chip]} -rf -savp -out={output[spp]} -odir={param[dir]}",
                tool="Rscript",
                input={"chip": ibam},
                output={
                    "spp": t + ".spp",
                    "pdf": t + "_4000000.pdf" if conf.down else t + ".pdf"
                },
                param={
                    "script": conf.get("tool", "spp"),
                    "dir": os.path.dirname(t + ".spp")
                },
                name="SPP"))
    stat_phan(workflow, conf)
    if conf.long:
        tex_phan(workflow, conf)
def DHS(workflow, conf): # DHS overlap percentage
    """
    get peaks overlapping percentage with union DHS
    :param workflow: uniform pipeline workflow from samflow
    :param conf: parsed config files
    :return: workflow
    """
    # choose narrowPeak or broadPeak output depending on the macs2 mode
    peaks = conf.prefix + "_sort_peaks.narrowPeak" if conf.get(
        "macs2", "type") in ["both", "narrow"
                             ] else conf.prefix + "_b_sort_peaks.broadPeak"
    # top {param[p]} peaks only; writes "total,overlapping" counts to {output}
    DHS = attach_back(
        workflow,
        ShellCommand("""
        n=$(head -n {param[p]} {input[MACS2_bed]} | wc -l)
        dhs=$(head -n {param[p]} {input[MACS2_bed]} | {tool} -wa -u -a - -b {input[DHS_peaks_bed]}|wc -l)
        ##dhs=$(echo \"scale=5;$dhs/$n\" | bc)
        echo $n,$dhs > {output}
        """,
                     tool="intersectBed",
                     input={
                         "MACS2_bed": peaks,
                         "DHS_peaks_bed": conf.get(conf.get("basics", "species"), "dhs")
                     },
                     output=conf.prefix + ".dhs",
                     param={"p": 5000},
                     name="intersect DHS"))
    DHS.allow_dangling = True
    DHS.allow_fail = True
def star(workflow, conf): # Mapping
    """
    Use star to map reads to genome, call _star_sam2bam to convert sam to bam
    :param workflow: samflow defined class
    :param conf: parsed config files
    :return: void
    """
    for target in conf.sample_targets:
        star = attach_back(
            workflow,
            ShellCommand(
                "{tool} --genomeDir {param[index]} --runThreadN {param[NUM_THREADS]} --readFilesIn {input[fastq]} --outFileNamePrefix {param[prefix]}",
                tool="STAR",
                input={"fastq": target + ".fastq"},
                output={"sam": target + "Aligned.out.sam"},
                param={
                    "NUM_THREADS": conf.threads,
                    "prefix": target,
                    ## judge chosen species from basics section
                    "index": conf.get_path(conf.get("basics", "species"), "genome_index")
                },
                name="star aln"))
        # NOTE(review): overlays the [bowtie] config section onto a STAR command —
        # presumably a copy-paste from the bowtie function; confirm whether a
        # [star] section was intended.
        star.update(param=conf.items("bowtie"))
    _star_sam2bam(workflow, conf)
    ## QC part--NOTE keeping the bwa legacy code!
    stat_bwa(workflow, conf)
    if conf.long:
        tex_bwa(workflow, conf)
def fastqc(workflow, conf):
    """
    fastqc to extract gc contents(not yet) and median sequence quality
    :param workflow: samflow workflow to attach commands to
    :param conf: parsed config file
    :return: void
    """
    for raw, target in conf.sample_pairs:
        if conf.pe:
            fastqc_run = attach_back(
                workflow,
                ShellCommand(
                    "{tool} {input} --extract -t {param[threads]} -o {output[target_dir]}",
                    ## only check one pair
                    input=target[0] + "_100k.fastq",
                    output={
                        "target_dir": conf.target_dir,
                        "fastqc_summary": target[0] + "_100k_fastqc/fastqc_data.txt"
                    },
                    tool="fastqc",
                    param={"threads": conf.threads},
                    name="fastqc"))
        else:
            fastqc_run = attach_back(
                workflow,
                ShellCommand(
                    "{tool} {input} --extract -t {param[threads]} -o {output[target_dir]}",
                    input=target + "_100k.fastq",
                    output={
                        "target_dir": conf.target_dir,
                        "fastqc_summary": target + "_100k_fastqc/fastqc_data.txt"
                    },
                    tool="fastqc",
                    param={"threads": conf.threads},
                    name="fastqc"))
        fastqc_run.update(param=conf.items("fastqc"))
        # BUGFIX: these flags were previously assigned to the module-level
        # function object `fastqc` (a no-op for the command) instead of the
        # attached command `fastqc_run`.
        fastqc_run.allow_fail = True
        fastqc_run.allow_dangling = True
    ## QC part of chilin
    ## use conf property
    conf.long = True
    stat_fastqc(workflow, conf)
    if conf.long:
        tex_fastqc(workflow, conf)
def hotspotv4(workflow, conf, tex):
    """Attach hotspot v4 peak calling per treatment target, plus a pooled run.

    Each target's chrM-free final bam goes through eap_run_hotspot producing
    narrow/broad peaks (plain and bigBed) and a bigWig. With >= 2 replicates,
    replicate concordance QC runs and the final bams are pooled for one
    merged hotspot call.

    NOTE(review): `narrow` and `broad` are free names (module-level .as
    autosql paths, presumably) — confirm they are defined at import time.
    """
    for target in conf.treatment_targets:
        hotspot = attach_back(workflow, ShellCommand(
            "{tool} {param[hotspot_dir]} {param[genome]} {input[bam]} {param[readsize]} {output[narrowbb]} {output[broadbb]} {output[bigwig]} {param[tmp]} {output[hotspot_output]} {input[narrowas]} {input[broadas]} {param[chromsize]} {output[narrow]} {output[broad]}",
            tool = "eap_run_hotspot",
            input = {"bam": target + "_final_nochrm.bam",
                     "narrowas": narrow,
                     "broadas": broad},
            output = {"narrowbb": target + ".narrowPeak.bigBed",
                      "broadbb": target + ".broadPeak.bigBed",
                      "narrow": target + ".narrowPeak",
                      # "qc1": target + ".narrowPeak.qc",
                      # "qc2": target + ".broadPeak.qc",
                      "broad": target + ".broadPeak",
                      "bigwig": target + ".bigWig",
                      "hotspot_output": target + "_hotspot"},
            # readsize is hard-coded to 36 here (conf.readsize is not used)
            param = {"hotspot_dir": conf.get("tool", "peak_calling"),
                     "genome": conf.species,
                     "chromsize": conf.get(conf.species, "chrom_len"),
                     "tmp": target + "_hotspot_peak_call_tmp",
                     "readsize": 36}))
    have_treat_reps = len(conf.treatment_pairs) >= 2 ## replicates
    if have_treat_reps:
        eval_reps(workflow, conf, tex)
        catsam = attach_back(workflow, ShellCommand(
            "{tool} cat {param[bams]} > {output[bam]}",
            tool = "samtools",
            input = {"bams": [ target + "_final.bam" for target in conf.treatment_targets]},
            output = {"bam": conf.prefix + "_pool.bam"}))
        catsam.param.update(bams=' '.join(catsam.input["bams"]))
        # clone of the LAST per-target hotspot command (loop variable survives);
        # NOTE(review): confirm `.clone` deep-copies param/input so the
        # .update calls below don't mutate the original command
        hotspot_merge = hotspot.clone
        hotspot_merge.param.update(tmp=conf.prefix + "_hotspot_peak_call_tmp")
        hotspot_merge.input.update(bam = conf.prefix + "_pool.bam")
        hotspot_merge.output = {"narrowbb": conf.prefix + ".narrowPeak.bigBed",
                                "broadbb": conf.prefix + ".broadPeak.bigBed",
                                "narrow": conf.prefix + ".narrowPeak",
                                # "qc1": conf.prefix + ".narrowPeak.qc",
                                # "qc2": conf.prefix + ".broadPeak.qc",
                                "broad": conf.prefix + ".broadPeak",
                                "bigwig": conf.prefix + ".bigWig",
                                "hotspot_output": conf.prefix + "_hotspot"}
        attach_back(workflow, hotspot_merge)
def make_link_command(orig, dest):
    """
    link original input to destination files
    :param orig: input
    :param dest: link symbol
    :return: ShellCommand Class
    ln has machine type problem
    """
    ## not use symbol link: symlinks proved unportable across machines,
    ## so the "link" is really a forced recursive copy
    copy_step = ShellCommand("cp -fr {input} {output}",
                             input=orig,
                             output=dest,
                             name="copy")
    return copy_step
def merge_bams(workflow, conf): ## merge input and chip bam
    """
    input multiple input and multiple control to merge into one file separately

    NOTE(review): this is a near-verbatim duplicate of an earlier
    `merge_bams` definition in this file; being later, this one wins at
    import time. Consider deleting one of the two.

    :return:
    """
    # merge all treatments into one
    merge_bams_treat = ShellCommand(
        "{tool} merge {output[merged]} {param[bams]}",
        tool="samtools",
        input=[target + ".bam" for target in conf.treatment_targets],
        output={"merged": conf.prefix + "_treatment.bam"})
    # samtools merge wants the bam list as one space-separated string
    merge_bams_treat.param = {"bams": " ".join(merge_bams_treat.input)}
    merge_bams_treat.allow_fail = True
    merge_bams_treat.allow_dangling = True
    if len(conf.treatment_targets) > 1:
        attach_back(workflow, merge_bams_treat)
    else:
        # when there's only one treatment sample, use copying instead of merging
        attach_back(
            workflow,
            make_link_command(merge_bams_treat.input[0],
                              merge_bams_treat.output["merged"]))
    # merging step will be skipped if control sample does not exist
    # So be careful to check whether there are control samples before using `_control.bam`
    if len(conf.control_targets) > 1:
        # NOTE(review): `.clone` is attribute access — presumably a samflow
        # property returning a copy; confirm it deep-copies input/output
        merge_bams_control = merge_bams_treat.clone
        merge_bams_control.input = [
            target + ".bam" for target in conf.control_targets
        ]
        merge_bams_control.output = {"merged": conf.prefix + "_control.bam"}
        merge_bams_control.param = {"bams": " ".join(merge_bams_control.input)}
        attach_back(workflow, merge_bams_control)
    elif len(conf.control_targets) == 1:
        attach_back(
            workflow,
            make_link_command(conf.control_targets[0] + ".bam",
                              conf.prefix + "_control.bam"))
def sampling(orig, dest, rand, format, conf): # call fastq_sampling
    """
    prepare sampling fastq files for library contamination and fastqc

    rand: the number of random selected fastq reads

    use lh3's https://github.com/lh3/seqtk/ to sample fastq and fastq.gz

    :param orig: input file(s), wired into the command's `input`
    :param dest: output file(s), wired into the command's `output`
    :param rand: number of reads to sample
    :param format: "fastq" or "sam"
    :param conf: parsed config (supplies the pair-end flag for the sam branch)
    :return: a ShellCommand ready to be attached to a workflow
    :raises ValueError: if `format` is not "fastq" or "sam"
    """
    if format == "fastq":
        ## faster and support fastq.gz
        ## if paired end, we must use same -s
        # BUGFIX: honor the `rand` argument — it was previously ignored in
        # favor of a hard-coded 100000.
        return ShellCommand("{tool} sample -s 11 {input[fastq]} {param[rand]} > {output[fastq_sample]}",
                            tool = "seqtk",
                            input = orig,
                            output = dest,
                            param = {"rand": rand})
    elif format == "sam":
        ## samtools sampling
        ## if mapped reads <= requested sample size, just hard-link the sam;
        ## otherwise down-sample with sampling_pe_sam.py first
        return ShellCommand("""
        count=$({tool} view -Sc {input[sam]})
        ## judge mapped reads number less than sampling number
        if [ $count -le {param[random_number]} ]
        then
            ln -f {input[sam]} {input[sam]}.{param[random_number]}
            {tool} view -bS {input[sam]}.{param[random_number]} > {output[samp]}
        else
            sampling_pe_sam.py {input[sam]} {param[random_number]} {param[pair]}
            {tool} view -bS {input[sam]}.{param[random_number]} > {output[samp]}
        fi
        """,
                            tool = "samtools",
                            input={"sam": orig},
                            output={"samp": dest},
                            param={"random_number": rand,
                                   "pair": str(conf.pe)},
                            name = "sampling bam")
    # robustness: previously an unknown format silently returned None
    raise ValueError("unsupported sampling format: %s" % format)
def fragment(workflow, conf): ## this is done after FRiP
    """Estimate fragment size per treatment target with `macs2 predictd`,
    then parse the generated model.R scripts into a fragment-size QC json.
    """
    # fall back to a plain `macs2` on PATH if no binary is configured
    if conf.get("tool", "macs2"):
        macs2_bin = conf.get("tool", "macs2")
    else:
        macs2_bin = "macs2"
    for target in conf.treatment_targets:
        # NOTE(review): species is hard-coded to 'hs' here regardless of
        # the configured species — confirm this is intentional
        fragment_size = attach_back(
            workflow,
            ShellCommand(
                "{tool} predictd -i {input[bam]} --rfile {param[prefix]} -g {param[species]}",
                tool=macs2_bin,
                input={"bam": target + ".bam"},
                output={"R": target + "_model.R"},
                param={
                    "prefix": target + "_model.R",
                    "species": 'hs'
                }))
        fragment_size.update(param=conf.items("macs2"))
        ## except too few peaks for modeling
        fragment_size.allow_fail = True
        fragment_size.allow_dangling = True
    ## extract standard deviation from MACS2 model.R,
    ## use m, p, and pileup value for standard deviation; mean fragment size is provided (choose the one with highest correlation)
    frag_qc = attach_back(
        workflow,
        PythonCommand(
            stat_frag_std,
            input={
                "r": [target + "_model.R" for target in conf.treatment_targets]
            },
            output={
                "json": conf.json_prefix + "_frag.json",
                "r": [target + "_frag_sd.R" for target in conf.treatment_targets]
            },
            param={
                "samples": conf.treatment_bases,
                "frag_tool": "BAMSE"
            },
            name="macs2 model R script parser"))
    frag_qc.allow_fail = True
    frag_qc.allow_dangling = True
def bedAnnotate_ceas(workflow, conf):
    """
    Calls bedAnnotate to get the genome distribution of the summits

    Also chains DHS / velcro overlap QC when the species config provides
    those tracks, then collects everything via stat_bedAnnotate.
    """
    import os
    summits = conf.prefix + "_sort_summits.bed" if conf.get(
        "macs2", "type") in ["both", "narrow"
                             ] else conf.prefix + "_b_sort_peaks.broadPeak"
    ceas = attach_back(
        workflow,
        ShellCommand(
            """{tool} -g {param[geneTable]} -b {input} -e {output[exon]} -t {output[gene]}> {output[meta]}
            meta_info.sh {output[gene]} {output[exon]} 2000 {param[chrominfo]}
            """,
            tool="bedAnnotate.py",
            input=summits,
            output={
                "meta": conf.prefix + ".meta",
                "gene": os.path.join(conf.target_dir, "gene.bed"),
                "promoter": os.path.join(conf.target_dir, "gene.bed_promoter"),
                # BUGFIX(cleanup): "exon" appeared twice in this dict
                # (exon.bed, then gene.bed_exon); Python keeps only the last
                # entry, so {output[exon]} always resolved to gene.bed_exon.
                # The shadowed, dead exon.bed entry is removed — behavior is
                # unchanged.
                "exon": os.path.join(conf.target_dir, "gene.bed_exon")
            },
            param={
                "geneTable": conf.get_path(conf.get("basics", "species"), "geneTable"),
                "chrominfo": conf.get_path(conf.get("basics", "species"), "chrom_len")
            },
            name="bedAnnotate (ceas)"))
    # missing optional species tracks simply disable the corresponding QC
    try:
        has_velcro = conf.get(conf.get("basics", "species"), "velcro")
        has_dhs = conf.get(conf.get("basics", "species"), "dhs")
    except Exception:  # narrowed from a bare except:
        has_velcro = ""
        has_dhs = ""
    ceas.allow_fail = True
    ceas.allow_dangling = True
    if has_dhs:
        DHS(workflow, conf)
    if has_velcro:
        velcro(workflow, conf)
    stat_bedAnnotate(workflow, conf, has_dhs, has_velcro)
def FRiP(workflow, conf): # FRiP
    """
    Fraction of Reads in Peaks regions at 4M reads level
    For example: 2 treat, 2 control
    modify: without down sampling read peaks calling, use merged peaks for comparison
    """
    ## use merged peaks for evaluation after removing chrM reads
    for t in conf.sample_targets:
        if conf.frip: ## sampling 5M reads
            reads = t + "_5000000_nochrM.bam"
        else:
            reads = t + "_4000000_nochrM.bam"
        # writes "reads_in_peaks,total_reads" to {output[frip]}
        # NOTE(review): tool is labeled intersectBed but the script calls
        # `bedtools intersect` directly — presumably only the label is stale
        frip = attach_back(
            workflow,
            ShellCommand("""
            fr=$(bedtools intersect -f {param[p]} -wa -u -abam {input[reads]} -b {input[peaks]} -bed | wc -l)
            total=$(samtools flagstat {input[reads]} | head -1 | cut -d" " -f1)
            echo $fr,$total > {output[frip]}
            """,
                         tool="intersectBed",
                         input={
                             "reads": reads if conf.down else t + "_nochrM.bam",
                             "peaks": conf.prefix + "_sort_peaks.narrowPeak"
                             if conf.get("macs2", "type") in ["both", "narrow"]
                             else conf.prefix + "_b_sort_peaks.broadPeak"
                         },
                         output={"frip": t + ".frip"},
                         param={"p": "1E-9"},
                         name="FRiP score"))
        ## in case that peaks calling on 4M reads may be very poor,
        ## no peaks generated, allow fail and dangling
        frip.allow_fail = True
        frip.allow_dangling = True
        frip.update(param=conf.items("bedtools"))
    ## QC part
    stat_frip(workflow, conf)
    if conf.long:
        tex_frip(workflow, conf)
def bowtie(workflow, conf, target, output, index): # Mapping
    """
    Use bowtie to map reads to genome,
    Attaches a unique-hit (-m 1) bowtie alignment of the 100k-sampled
    fastq to the workflow and returns the workflow for chaining.
    """
    aln_param = {"NUM_THREADS": conf.threads,
                 ## judge chosen species from basics section
                 "index": index}
    aln = attach_back(
        workflow,
        ShellCommand(
            "{tool} -p {param[NUM_THREADS]} -S -m 1 {param[index]} {input[fastq]} {output[sam]}",
            tool="bowtie",
            input={"fastq": target + "_100k.fastq"},
            output={"sam": output},
            param=aln_param,
            name="bowtie aln"))
    aln.update(param=conf.items("bowtie"))
    aln.allow_fail = True
    aln.allow_dangling = True
    return workflow
def eval_reps(workflow, conf, tex):
    """Replicate-concordance QC: merge replicate narrowPeaks, correlate
    replicate bigWigs over the merged regions, and compare peak bigBeds.

    Outputs `<prefix>_cor.qc` (bigWigCorrelate) and `<prefix>_overlap.qc`
    (edwComparePeaks). `tex` is not referenced in this function.
    """
    peaks = [ target + ".narrowPeak" for target in conf.treatment_targets ]
    attach_back(workflow, ShellCommand(
        """
        cat {param[narrowPeaks]} | sort -k1,1 -k2,2n - | bedtools merge -i - > {output[mergedPeak]}
        bedToBigBed {output[mergedPeak]} {param[chromsize]} {output[mergedPeakbb]}
        bigWigCorrelate -restrict={output[mergedPeakbb]} {param[bigwigs]} 1>{output[qc1]}
        {tool} {param[narrowPeaksbb]} {output[qc2]}
        """,
        tool = "edwComparePeaks",
        input = {"narrowPeaks": peaks,
                 "bigwigs": [ target + ".bigWig" for target in conf.treatment_targets ],
                 "narrowPeakbbs": [ target + ".narrowPeak.bigBed" for target in conf.treatment_targets ]},
        output = {"mergedPeak": conf.prefix + "_merge.bed",
                  "mergedPeakbb": conf.prefix + "_merged.bigBed",
                  "qc1": conf.prefix + "_cor.qc",
                  "qc2": conf.prefix + "_overlap.qc"},
        # the same file lists reappear here as space-joined strings because
        # the shell template needs flat arguments
        param = {"narrowPeaksbb": " ".join([ target + ".narrowPeak.bigBed" for target in conf.treatment_targets ]),
                 "narrowPeaks": " ".join([ target + ".narrowPeak" for target in conf.treatment_targets ]),
                 "bigwigs": " ".join([ target + ".bigWig" for target in conf.treatment_targets ]),
                 "chromsize": conf.get(conf.species, "chrom_len")}))
def star(workflow, conf, target, output, index): # Mapping
    """
    Use star to map reads to genome,
    Attaches a STAR alignment of the 100k-sampled fastq and returns the
    workflow for chaining.
    """
    star = attach_back(
        workflow,
        ShellCommand(
            "{tool} --genomeDir {param[index]} --runThreadN {param[NUM_THREADS]} --readFilesIn {input[fastq]} --outFileNamePrefix {param[prefix]}",
            tool="STAR",
            input={"fastq": target + "_100k.fastq"},
            output={"sam": output},
            param={
                "NUM_THREADS": conf.threads,
                "prefix": target,
                ## judge chosen species from basics section
                "index": index
            },
            name="star aln"))
    # NOTE(review): overlays the [bowtie] config section onto a STAR command —
    # presumably copy-paste from the bowtie helper; confirm intent.
    star.update(param=conf.items("bowtie"))
    star.allow_dangling = True
    star.allow_fail = True
    return workflow
def replicates_bw_correlation(workflow, conf): ## correlation among different replicates
    """
    Use UCSC binary bigWigCorrelate to calculate reads density correlation
    collections in json files from qc
    :param workflow: samflow class
    :param conf: parsed config files
    :return: void
    """
    bigwigs = [t + "_treat.bw" for t in conf.treatment_targets]
    cor_cmd = attach_back(
        workflow,
        ShellCommand(
            "{tool} {param[input_list]} > {output}",
            tool="wigCorrelate",
            input=bigwigs,
            output=conf.prefix + ".cor",
            # filled in right below once the command owns its input list
            param={"input_list": []},
            name="correlation between bigwiggle"))
    cor_cmd.update(param={"input_list": " ".join(cor_cmd.input)})
    cor_cmd.allow_fail = True # in case 0 peak in macs2
    cor_cmd.allow_dangling = True
def render_pdf(workflow, conf, long=True):
    """Assemble the .tex report sections and render them to PDF.

    Note: the `long` parameter is not referenced in this function; the
    section selection happens inside merge_latex via conf.long.
    """
    latex_environ(workflow, conf)
    summary_table_latex(workflow, conf)
    merge_latex(workflow, conf)
    render = attach_back(
        workflow,
        ShellCommand(
            # Somehow the pdflatex has to be invoked twice..
            "{tool} -output-directory {output[dir]} -jobname={param[name]} {input} \
            && {tool} -output-directory {output[dir]} -jobname={param[name]} {input}",
            tool="pdflatex",
            input=conf.prefix + "_report.tex",
            # output[pdf] should use "conf.prefix" to have the absolute path
            output={
                "dir": conf.target_dir,
                "pdf": conf.prefix + "_report.pdf"
            },
            # param[name] should use "conf.id" to avoid using absolute path
            param={"name": conf.id + "_report"},
            name="report"))
    render.allow_fail = True
def stat_bwa(workflow, conf): ## use samtools to parse mappable reads from bwa
    """
    bam files are filtered by samtools -q 1, so mapped reads
    are considered to be unique

    Per target: count total reads from the sam and mapped reads from the
    bam, then collect both into `<json_prefix>_map.json`; long-form runs
    additionally render a comparison figure.
    """
    for t in conf.sample_targets:
        stat = attach_back(workflow, ShellCommand(
            """
            {tool} view -Sc {input[sam]} > {output[total]}
            {tool} flagstat {input[bam]} > {output[stat]}
            """,
            tool = "samtools",
            input = {"bam": t + ".bam", "sam": t + ".sam"},
            output = {"stat": t + "_mapped.bwa", "total": t + "_total.bwa"}))
        stat.allow_fail = True
        stat.allow_dangling = True
    collect = attach_back(workflow, PythonCommand(json_bwa,
        input={"bwa_mapped": [ t + "_mapped.bwa" for t in conf.sample_targets ],
               "bwa_total": [ t + "_total.bwa" for t in conf.sample_targets ]},
        output={"json": conf.json_prefix + "_map.json"},
        param={"sample": conf.sample_bases},
        name="bwa qc"))
    collect.allow_dangling = True
    collect.allow_fail = True
    if conf.long:
        long_collect = attach_back(workflow, PythonCommand(bwa_figures,
            input = {"dbaccessor": resource_filename("chilin2.modules.dbaccessor", "ChiLinQC.db"),
                     "json": conf.json_prefix + "_map.json",
                     "template": resource_filename("chilin2.modules.summary", "R_culmulative_plot.R")},
            output = {"pdf": conf.prefix + "_bwa_compare.pdf",
                      "R": conf.prefix + "_bwa_compare.R"},
            param = {"sample": conf.sample_bases}))
        long_collect.allow_fail = True
        # BUGFIX: allow_fail was assigned twice; the second assignment was
        # clearly meant to set allow_dangling (matching `stat` and `collect`).
        long_collect.allow_dangling = True
def velcro(workflow, conf):
    """Compute the fraction of top peaks overlapping the species' velcro
    (blacklist) regions; writes the ratio to `<prefix>.velcro`.
    """
    vel = attach_back(
        workflow,
        ShellCommand(
            """
            n=$(head -n {param[p]} {input[MACS2_bed]} | wc -l)
            velcro=$(head -n {param[p]} {input[MACS2_bed]} | {tool} -wa -u -a - -b {input[velcro_peaks_bed]} | wc -l)
            velcro=$(echo \"scale=5;$velcro/$n\" | bc)
            echo $velcro > {output}
            """,
            tool="intersectBed",
            input={
                # narrowPeak or broadPeak depending on the macs2 mode
                "MACS2_bed": conf.prefix + "_sort_peaks.narrowPeak"
                if conf.get("macs2", "type") in ["both", "narrow"]
                else conf.prefix + "_b_sort_peaks.broadPeak",
                "velcro_peaks_bed": conf.get(conf.get("basics", "species"), "velcro")
            },
            output=conf.prefix + ".velcro",
            # only the top 5000 peaks are evaluated
            param={"p": 5000},
            name="velcro overlap"))
    vel.allow_fail = True
    vel.allow_dangling = True
def _macs2(workflow, conf):
    """Attach merged-sample MACS2 peak calling plus bedGraph -> bigWig steps.

    Merges all treatment (and, when present, control) BAMs, calls peaks on
    the merged files, trims the resulting bedGraphs to chromosome bounds,
    converts them to bigWig, and finally attaches a MACS2 summary step.

    :param workflow: samflow workflow to attach steps to
    :param conf: parsed config file
    :return: void
    """
    # merge all treatments into one
    merge_bams_treat = ShellCommand(
        "{tool} merge {output[merged]} {param[bams]}",
        tool="samtools",
        input=[target + ".bam" for target in conf.treatment_targets],
        output={"merged": conf.prefix + "_treatment.bam"})
    merge_bams_treat.param = {"bams": " ".join(merge_bams_treat.input)}
    if len(conf.treatment_targets) > 1:
        attach_back(workflow, merge_bams_treat)
    else:
        # when there's only one treatment sample, use copying instead of merging
        attach_back(workflow, make_copy_command(merge_bams_treat.input[0], merge_bams_treat.output["merged"]))

    # merging step will be skipped if control sample does not exist
    # So be careful to check whether there are control samples before using `_control.bam`
    if len(conf.control_targets) > 1:
        # NOTE(review): `clone` is used as an attribute, consistently with the
        # rest of this file — presumably a samflow property; verify it returns
        # a deep enough copy before mutating input/output below.
        merge_bams_control = merge_bams_treat.clone
        merge_bams_control.input = [target + ".bam" for target in conf.control_targets]
        merge_bams_control.output = {"merged": conf.prefix + "_control.bam"}
        merge_bams_control.param = {"bams": " ".join(merge_bams_control.input)}
        attach_back(workflow, merge_bams_control)
    elif len(conf.control_targets) == 1:
        attach_back(workflow, make_copy_command(conf.control_targets[0] + ".bam", conf.prefix + "_control.bam"))

    # NOTE(review): `--shiftsize` is the pre-2.1 MACS2 flag (newer releases use
    # --extsize, as the other callers in this file do) — confirm the pinned
    # MACS2 version before changing.
    macs2_on_merged = attach_back(workflow, ShellCommand(
        "{tool} callpeak -B -q 0.01 --keep-dup {param[keep_dup]} --shiftsize={param[shiftsize]} --nomodel \
        {param[treat_opt]} {param[control_opt]} -n {param[description]}",
        tool="macs2",
        input={"treat": conf.prefix + "_treatment.bam"},
        output={"peaks": conf.prefix + "_peaks.bed",
                "summit": conf.prefix + "_summits.bed",
                "treat_bdg": conf.prefix + "_treat_pileup.bdg",
                "ENCODE": conf.prefix + "_peaks.encodePeak",
                "peaks_xls": conf.prefix + "_peaks.xls",
                "control_bdg": conf.prefix + "_control_lambda.bdg"},
        param={"description": conf.prefix, "keep_dup": 1, "shiftsize": 73},
        name="macs2_callpeak_merged"))
    macs2_on_merged.param["treat_opt"] = "-t " + macs2_on_merged.input["treat"]
    # control option is skipped if control samples does not exist
    if len(conf.control_targets) >= 1:
        macs2_on_merged.input["control"] = conf.prefix + "_control.bam"
        macs2_on_merged.param["control_opt"] = "-c " + macs2_on_merged.input["control"]
    else:
        macs2_on_merged.param["control_opt"] = ""
    # config [macs2] section overrides the defaults above
    macs2_on_merged.update(param=conf.items("macs2"))

    # For bedGraphToBigwiggle bugs, we need to remove coordinates over-border coordinates
    # As _control_lambda.bdg always exist. There are no need to check whether there are control samples.
    bdg_trim_control = attach_back(workflow, ShellCommand(
        '{tool} intersect -a {input[bdg]} -b {input[chrom_bed]} -wa -f 1.00 > {output}',
        tool="bedtools",
        input={"bdg": conf.prefix + "_control_lambda.bdg",
               'chrom_bed': conf.get_path("lib", "chrom_bed")},
        output=conf.prefix + "_control_lambda.bdg.tmp",
        name="bedGraph filtering"))
    bdg_trim_treat = bdg_trim_control.clone
    bdg_trim_treat.input["bdg"] = conf.prefix + "_treat_pileup.bdg"
    bdg_trim_treat.output = conf.prefix + "_treat_pileup.bdg.tmp"
    attach_back(workflow, bdg_trim_treat)

    # NOTE(review): the two variable names below are swapped relative to their
    # contents — `bdg2bw_treat` converts the *control* bedGraph and
    # `bdg2bw_control` converts the *treatment* bedGraph. The produced files
    # are correct; only the local names are misleading.
    bdg2bw_treat = attach_back(workflow, ShellCommand(
        "{tool} {input[bdg]} {input[chrom_len]} {output[bw]}",
        tool="bedGraphToBigWig",
        input={"bdg": conf.prefix + "_control_lambda.bdg.tmp",
               "chrom_len": conf.get("lib", "chrom_len")},
        output={"bw": conf.prefix + "_control.bw"},
        name="bdg_to_bw"))
    # prototype used here to do the similar thing on treatment file
    bdg2bw_control = bdg2bw_treat.clone
    bdg2bw_control.input["bdg"] = conf.prefix + "_treat_pileup.bdg.tmp"
    bdg2bw_control.output["bw"] = conf.prefix + "_treat.bw"
    attach_back(workflow, bdg2bw_control)

    # Summarize peak calling results (counts, FRiP etc. — see stat_macs2).
    attach_back(workflow, PythonCommand(
        stat_macs2,
        input={"macs2_peaks_xls": conf.prefix + "_peaks.xls",
               "db": ChiLinQC_db,
               "template": rlang_template},
        output={"json": conf.json_prefix + "_macs2.json",
                "R": conf.prefix + "_macs2.R",
                "pdf": conf.prefix + "_macs2.pdf"},
        param={"id": conf.id},
        name="MACS2 summary"))
def test_invoke_collect_output(self):
    # A command created with set_stdout_collecting() should capture its
    # stdout into `result` after a successful invoke().
    cmd = ShellCommand("echo test_collect").set_stdout_collecting()
    invoked_ok = cmd.invoke()
    self.assertTrue(invoked_ok)
    self.assertEqual(cmd.result, "test_collect\n")
def test_invoke_non_exist_input(self):
    # invoke() should report failure when a declared input file is missing.
    cmd = ShellCommand("cat < {input}", input="non_exist_file")
    self.assertFalse(cmd.invoke())
def macs2_rep(workflow, conf):
    """Attach per-replicate MACS2 peak calling (narrow and/or broad) plus
    bedGraph trimming and bigWig conversion for each treatment target.

    :param workflow: samflow workflow to attach steps to
    :param conf: parsed config file
    :return: void
    """
    # Though macs command already exists, I choose not to use prototype here
    # Because the prototype definition and usage might be far from each other, making codes not readable
    if conf.get("tool", "macs2"):
        macs2_bin = conf.get("tool", "macs2")
    else:
        macs2_bin = "macs2"
    # paired-end data needs the BAMPE input format flag
    format = " -f BAMPE " if conf.pe else " "
    for target in conf.treatment_targets:
        ## DNase, H3K4, H2AZ, all acetyl marks, or TF
        if conf.get("macs2", "type").lower() in ["both", "narrow"]:
            ## for DNase, H3K4, H2AZ, all acetyl marks, or TF
            # The awk post-processing renames every peak/summit to "peak<N>"
            # to remove weird path characters from the peak names.
            macs2_on_rep_narrow = attach_back(workflow, ShellCommand(
                """
                {tool} callpeak --SPMR -B -q {param[fdr]} --keep-dup {param[keep_dup]} --extsize={param[extsize]} --nomodel -g {param[species]} {param[format]} {param[treat_opt]} {param[control_opt]} -n {param[description]} && cut -f1,2,3,4,9 {output[peaks]} > {output[bedtmp]}
                ## remove weird path characters
                cp {output[peaks]} {output[peakstmp]}
                cp {output[summits]} {output[summitstmp]}
                awk \'{{OFS="\\t";n+=1;$4="peak"n;print $0}}\' {output[peakstmp]} > {output[peaks]}
                awk \'{{OFS="\\t";n+=1;$4=n;print $1,$2,$3,"peak"$4,$5}}\' {output[bedtmp]} > {output[bed]}
                awk \'{{OFS="\\t";n+=1;$4=n;print $1,$2,$3,"peak"$4,$5}}\' {output[summitstmp]} > {output[summits]}
                """,
                tool=macs2_bin,
                input={"treat": target + ".bam"},
                output={"peaks": target + "_peaks.narrowPeak",
                        "peakstmp": target + "_peaks.narrowPeak.tmp",
                        "summits": target + "_summits.bed",
                        "summitstmp": target + "_summits.bed.tmp",
                        "bed": target + "_peaks.bed",
                        "bedtmp": target + "_peaks.bed.tmp",
                        "treat_bdg": target + "_treat_pileup.bdg",
                        "peaks_xls": target + "_peaks.xls",
                        "control_bdg": target + "_control_lambda.bdg"},
                param={"description": target, "keep_dup": 1, "extsize": 73*2, "species": "hs", "fdr":0.01, "format": format},
                name="macs2_callpeak_rep"))
            macs2_on_rep_narrow.param["treat_opt"] = "-t " + macs2_on_rep_narrow.input["treat"]
            # sort narrowPeak by column 9 (q-value) descending
            # NOTE(review): unlike the broad branch, this sort step gets no
            # allow_fail/allow_dangling flags — confirm that is intentional.
            sort = attach_back(workflow, ShellCommand(
                "{tool} -r -g -k 9 {input} > {output}",
                tool = "sort",
                input = target + "_peaks.narrowPeak",
                output = target + "_sort_peaks.narrowPeak"))
            # control option is skipped if control samples does not exist
            if len(conf.control_targets) >= 1:
                macs2_on_rep_narrow.input["control"] = conf.prefix + "_control.bam"
                macs2_on_rep_narrow.param["control_opt"] = "-c " + macs2_on_rep_narrow.input["control"]
            else:
                macs2_on_rep_narrow.param["control_opt"] = ""
            # config [macs2] section overrides the defaults above
            macs2_on_rep_narrow.update(param=conf.items("macs2"))
            macs2_on_rep_narrow.allow_dangling = True
            macs2_on_rep_narrow.allow_fail = True
        if conf.get("macs2", "type").lower() in ["both", "broad"]:
            # K9, K36, K79 and K27 methylation, both for chromatin regulator, all other histone marks
            macs2_on_rep_broad = attach_back(workflow, ShellCommand(
                """
                {tool} callpeak --SPMR -B -q {param[fdr]} {param[treat_opt]} {param[control_opt]} --keep-dup {param[keep_dup]} --broad --broad-cutoff {param[fdr]} -g {param[species]} {param[format]} -n {param[description]} && cut -f1,2,3,4,9 {output[peaks]} > {output[bedtmp]}
                ## remove weird path characters
                cp {output[peaks]} {output[peakstmp]}
                awk \'{{OFS="\\t";n+=1;$4="peak"n;print $0}}\' {output[peakstmp]} > {output[peaks]}
                awk \'{{OFS="\\t";n+=1;$4=n;print $1,$2,$3,"peak"$4,$5}}\' {output[bedtmp]} > {output[bed]}
                """,
                tool=macs2_bin,
                input = {"treat": target + ".bam"},
                output = {"peaks": target + "_b_peaks.broadPeak",
                          "peakstmp": target + "_b_peaks.broadPeak.tmp",
                          "bed": target + "_b_peaks.bed",
                          "bedtmp": target + "_b_peaks.bed.tmp",
                          "treat_bdg": target + "_b_treat_pileup.bdg",
                          "peaks_xls": target + "_b_peaks.xls",
                          "control_bdg": target + "_b_control_lambda.bdg"},
                param = {"description": target + "_b", "species": "hs", "format": format, "fdr": 0.01},
                name = " macs2 broad peaks"))
            macs2_on_rep_broad.param["treat_opt"] = " -t " + macs2_on_rep_broad.input["treat"]
            macs2_on_rep_broad.update(param=conf.items("macs2"))
            macs2_on_rep_broad.allow_dangling = True
            macs2_on_rep_broad.allow_fail=True
            if len(conf.control_targets) >= 1:
                macs2_on_rep_broad.input["control"] = conf.prefix + "_control.bam"
                macs2_on_rep_broad.param["control_opt"] = "-c " + macs2_on_rep_broad.input["control"]
            else:
                macs2_on_rep_broad.param["control_opt"] = ""
            ## some broad peaks cannot be called
            # NOTE(review): update(param=conf.items("macs2")) is applied twice
            # in this branch (also above) — presumably harmless but redundant.
            macs2_on_rep_broad.update(param=conf.items("macs2"))
            sort = attach_back(workflow, ShellCommand(
                "{tool} -r -g -k 9 {input} > {output}",
                tool = "sort",
                input = target + "_b_peaks.broadPeak",
                output = target + "_b_sort_peaks.broadPeak",
                name = "sort broad peaks"))
            sort.allow_dangling = True
            sort.allow_fail=True
        ## For bedGraphToBigwiggle bugs, we need to remove coordinates outlier
        # When type is "both" the narrow branch's assignment wins (it runs second).
        if conf.get("macs2", "type").lower() in ["both", "broad"]:
            cont_bdg = target + "_b_control_lambda.bdg"
            treat_bdg = target + "_b_treat_pileup.bdg"
        if conf.get("macs2", "type").lower() in ["both", "narrow"]:
            cont_bdg = target + "_control_lambda.bdg"
            treat_bdg = target + "_treat_pileup.bdg"
        import os
        # Keep only bedGraph intervals fully inside chromosome bounds.
        bdg_trim_controlrep = attach_back(workflow, ShellCommand(
            '{tool} intersect -a {input} -b {param[chrom_bed]} -wa -f 1.00 > {output}',
            tool="bedtools",
            input=cont_bdg,
            output=cont_bdg + ".tmp",
            param={"chrom_bed": os.path.join(conf.target_dir, "chrom.bed")},
            name="bedGraph control replicate filtering"))
        bdg_trim_controlrep.allow_dangling = True
        bdg_trim_controlrep.allow_fail=True
        bdg_trim_treatrep = bdg_trim_controlrep.clone
        bdg_trim_treatrep.input = treat_bdg
        bdg_trim_treatrep.output = treat_bdg + ".tmp"
        bdg_trim_treatrep.allow_dangling = True
        bdg_trim_treatrep.allow_fail=True
        attach_back(workflow, bdg_trim_treatrep)
        bdg2bw_treatrep = attach_back(workflow, ShellCommand(
            "{tool} {input} {param[chrom_len]} {output}",
            tool="bedGraphToBigWig",
            input=treat_bdg+".tmp",
            output=target + "_treat.bw",
            param={"chrom_len": conf.get_path(conf.get("basics", "species"), "chrom_len")},
            name="bdg_to_bw treat"))
        ## in case broad peaks calling failed
        bdg2bw_treatrep.allow_dangling = True
        bdg2bw_treatrep.allow_fail=True
        # prototype used here to do the similar thing on treatment file
        bdg2bw_controlrep = bdg2bw_treatrep.clone
        bdg2bw_controlrep.input = cont_bdg + ".tmp"
        bdg2bw_controlrep.output = target + "_control.bw"
        attach_back(workflow, bdg2bw_controlrep)
        ## in case broad peaks calling failed
        bdg2bw_controlrep.allow_dangling = True
        bdg2bw_controlrep.allow_fail=True
    # Per-replicate MACS2 summary statistics.
    stat_macs2_on_rep(workflow, conf)
def macs2(workflow, conf):
    """Attach merged-sample MACS2 peak calling (narrow and/or broad),
    peak sorting, bedGraph trimming and bigWig conversion.

    :param workflow: samflow workflow to attach steps to
    :param conf: parsed config file
    :return: void
    """
    if conf.get("tool", "macs2"):
        macs2_bin = conf.get("tool", "macs2")
    else:
        macs2_bin = "macs2"
    # paired-end data needs the BAMPE input format flag
    # (renamed from `format` to avoid shadowing the builtin)
    bam_format = "-f BAMPE" if conf.pe else " "

    if conf.get("macs2", "type").lower() in ["both", "narrow"]:
        ## for DNase, H3K4, H2AZ, all acetyl marks, or TF
        # The awk post-processing renames peaks/summits to "peak<N>" to
        # remove weird path characters from the peak names.
        macs2_on_merged_narrow = attach_back(workflow, ShellCommand(
            """
            {tool} callpeak --SPMR -B -q {param[fdr]} --keep-dup {param[keep_dup]} --extsize={param[extsize]} --nomodel -g {param[species]} {param[format]} {param[treat_opt]} {param[control_opt]} -n {param[description]} && cut -f1,2,3,4,9 {output[peaks]} > {output[bedtmp]}
            ## remove weird path characters
            cp {output[peaks]} {output[peakstmp]}
            cp {output[summits]} {output[summitstmp]}
            awk \'{{OFS="\\t";n+=1;$4="peak"n;print $0}}\' {output[peakstmp]} > {output[peaks]}
            awk \'{{OFS="\\t";n+=1;$4=n;print $1,$2,$3,"peak"$4,$5}}\' {output[bedtmp]} > {output[bed]}
            awk \'{{OFS="\\t";n+=1;$4=n;print $1,$2,$3,"peak"$4,$5}}\' {output[summitstmp]} > {output[summits]}
            """,
            tool=macs2_bin,
            input={"treat": conf.prefix + "_treatment.bam"},
            output={"peaks": conf.prefix + "_peaks.narrowPeak",
                    "peakstmp": conf.prefix + "_peaks.narrowPeak.tmp",
                    "bed": conf.prefix + "_peaks.bed",
                    "bedtmp": conf.prefix + "_peaks.bed.tmp",
                    "summits": conf.prefix + "_summits.bed",
                    "summitstmp": conf.prefix + "_summits.bed.tmp",
                    "treat_bdg": conf.prefix + "_treat_pileup.bdg",
                    "peaks_xls": conf.prefix + "_peaks.xls",
                    "control_bdg": conf.prefix + "_control_lambda.bdg"},
            param={"description": conf.prefix, "keep_dup": 1, "format": bam_format,
                   "extsize": 73 * 2,  # extsize=2*shiftsize
                   "fdr": 0.01,
                   "species": "hs"},
            name="macs2_callpeak_merged"))
        macs2_on_merged_narrow.param["treat_opt"] = "-t " + macs2_on_merged_narrow.input["treat"]
        # control option is skipped if control samples does not exist
        if len(conf.control_targets) >= 1:
            macs2_on_merged_narrow.input["control"] = conf.prefix + "_control.bam"
            macs2_on_merged_narrow.param["control_opt"] = "-c " + macs2_on_merged_narrow.input["control"]
        else:
            macs2_on_merged_narrow.param["control_opt"] = ""
        # config [macs2] section overrides the defaults above
        macs2_on_merged_narrow.update(param=conf.items("macs2"))
        # sort peaks by q-value (col 9) and summits by score (col 5), descending
        sort = attach_back(workflow, ShellCommand(
            """{tool} -r -g -k 9 {input[peaks]} > {output[p]}
            {tool} -r -g -k 5 {input[summits]} > {output[s]}
            """,
            tool = "sort",
            input = {"peaks": conf.prefix + "_peaks.narrowPeak",
                     "summits": conf.prefix + "_summits.bed"},
            output = {"p": conf.prefix + "_sort_peaks.narrowPeak",
                      "s": conf.prefix + "_sort_summits.bed"},
            name = "sort peaks"))
        macs2_on_merged_narrow.allow_fail = True
        macs2_on_merged_narrow.allow_dangling = True
        sort.allow_fail = True
        sort.allow_dangling = True

    if conf.get("macs2", "type").lower() in ["both", "broad"]:
        # K9, K36, K79 and K27 methylation, both for chromatin regulator, all other histone marks
        macs2_on_merged_broad = attach_back(workflow, ShellCommand(
            """
            {tool} callpeak --SPMR -B -q {param[fdr]} {param[treat_opt]} {param[control_opt]} --keep-dup {param[keep_dup]} --broad --broad-cutoff {param[fdr]} -g {param[species]} {param[format]} -n {param[description]} && cut -f1,2,3,4,9 {output[peaks]} > {output[bedtmp]}
            ## remove weird path characters
            cp {output[peaks]} {output[peakstmp]}
            awk \'{{OFS="\\t";n+=1;$4="peak"n;print $0}}\' {output[peakstmp]} > {output[peaks]}
            awk \'{{OFS="\\t";n+=1;$4=n;print $1,$2,$3,"peak"$4,$5}}\' {output[bedtmp]} > {output[bed]}
            """,
            tool=macs2_bin,
            input = {"treat": conf.prefix + "_treatment.bam"},
            output = {"peaks": conf.prefix + "_b_peaks.broadPeak",
                      "peakstmp": conf.prefix + "_b_peaks.broadPeak.tmp",
                      "bed": conf.prefix + "_b_peaks.bed",
                      "bedtmp": conf.prefix + "_b_peaks.bed.tmp",
                      "treat_bdg": conf.prefix + "_b_treat_pileup.bdg",
                      "peaks_xls": conf.prefix + "_b_peaks.xls",
                      "control_bdg": conf.prefix + "_b_control_lambda.bdg"},
            param = {"description": conf.prefix + "_b", "species": "hs",
                     "format": bam_format, "keep_dup": 1, "fdr": 0.01},
            name = "broad peaks calling"))
        macs2_on_merged_broad.param["treat_opt"] = " -t " + macs2_on_merged_broad.input["treat"]
        macs2_on_merged_broad.allow_fail = True
        macs2_on_merged_broad.allow_dangling = True
        if len(conf.control_targets) >= 1:
            macs2_on_merged_broad.input["control"] = conf.prefix + "_control.bam"
            macs2_on_merged_broad.param["control_opt"] = "-c " + macs2_on_merged_broad.input["control"]
        else:
            macs2_on_merged_broad.param["control_opt"] = ""
        macs2_on_merged_broad.update(param=conf.items("macs2"))
        sort = attach_back(workflow, ShellCommand(
            """{tool} -r -g -k 9 {input[peaks]} > {output[p]}
            """,
            tool = "sort",
            input = {"peaks": conf.prefix + "_b_peaks.broadPeak"},
            output = {"p": conf.prefix + "_b_sort_peaks.broadPeak"},
            name = "sort peaks files"))
        sort.allow_dangling = True
        sort.allow_fail = True

    # For bedGraphToBigwiggle bugs, we need to remove coordinates over-border coordinates
    # As _control_lambda.bdg always exist. There are no need to check whether there are control samples.
    # When type is "both" the narrow assignment wins (it runs second).
    # NOTE(review): if "type" is neither narrow/broad/both, cont_bdg is
    # undefined here — presumably the config is validated upstream.
    if conf.get("macs2", "type").lower() in ["both", "broad"]:
        cont_bdg = conf.prefix + "_b_control_lambda.bdg"
        treat_bdg = conf.prefix + "_b_treat_pileup.bdg"
    if conf.get("macs2", "type").lower() in ["both", "narrow"]:
        cont_bdg = conf.prefix + "_control_lambda.bdg"
        treat_bdg = conf.prefix + "_treat_pileup.bdg"
    import os
    # Keep only bedGraph intervals fully inside chromosome bounds.
    bdg_trim_control = attach_back(workflow, ShellCommand(
        '{tool} intersect -a {input[bdg]} -b {param[chrom_bed]} -wa -f 1.00 > {output}',
        tool="bedtools",
        input={"bdg": cont_bdg},
        param = {"chrom_bed": os.path.join(conf.target_dir, "chrom.bed")},
        output=cont_bdg+".tmp",
        name="bedGraph filtering control"))
    # BUGFIX: original set a meaningless `.fail` attribute; the flag used
    # everywhere else in this module is `allow_fail`.
    bdg_trim_control.allow_fail = True
    bdg_trim_control.allow_dangling = True
    bdg_trim_treat = bdg_trim_control.clone
    bdg_trim_treat.input["bdg"] = treat_bdg
    bdg_trim_treat.output = treat_bdg + ".tmp"
    bdg_trim_treat.allow_fail = True  # BUGFIX: was `.fail` (no effect)
    bdg_trim_treat.allow_dangling = True
    attach_back(workflow, bdg_trim_treat)

    # Convert trimmed bedGraphs to bigWig. (Renamed so the variable names
    # match what each step actually produces — the original had them swapped.)
    bdg2bw_control = attach_back(workflow, ShellCommand(
        "{tool} {input[bdg]} {input[chrom_len]} {output[bw]}",
        tool="bedGraphToBigWig",
        input={"bdg": cont_bdg+".tmp",
               "chrom_len": conf.get_path(conf.get("basics", "species"), "chrom_len")},
        output={"bw": conf.prefix + "_control.bw"},
        name="bdg_to_bw control"))
    ## in case broad peaks failed
    bdg2bw_control.allow_fail = True
    bdg2bw_control.allow_dangling = True
    # prototype used here to do the similar thing on treatment file
    bdg2bw_treat = bdg2bw_control.clone
    bdg2bw_treat.input["bdg"] = treat_bdg+".tmp"
    bdg2bw_treat.output["bw"] = conf.prefix + "_treat.bw"
    ## in case broad peaks failed
    bdg2bw_treat.allow_fail = True
    bdg2bw_treat.allow_dangling = True
    attach_back(workflow, bdg2bw_treat)

    # Summarize peak calling results.
    stat_macs2(workflow, conf)
def test_invoke_non_exist_output(self):
    # The command runs but never creates its declared output file,
    # so invoke() should report failure.
    cmd = ShellCommand("echo tempfile3", output="tempfile3")
    self.assertFalse(cmd.invoke())
def test_invoke_dangling_tool(self):
    # A tool binary that does not exist on PATH should make invoke() fail.
    cmd = ShellCommand("{tool} fun", tool="wolfyp")
    self.assertFalse(cmd.invoke())