def latex_environ(workflow, conf):
    """Attach the LaTeX document preamble and closing parts to the workflow."""
    begin_tex = conf.latex_prefix + "_begin.tex"
    end_tex = conf.latex_prefix + "_end.tex"
    # document header: run id, version, user and the bmcart class path
    # NOTE(review): rstrip('.cls') strips a character set, not a suffix;
    # it happens to work for "bmcart.cls" because "bmcart" ends in 't'
    attach_back(
        workflow,
        PythonCommand(
            latex_start,
            input={"template": resource_filename("chilin2.modules.summary", "begin.tex")},
            output={"latex": begin_tex},
            param={"id": conf.id,
                   "version": conf.get("basics", "version"),
                   "user": conf.get('basics', 'user'),
                   "bmcard": resource_filename("chilin2.modules.summary", "bmcart.cls").rstrip('.cls')}))
    # document footer
    attach_back(
        workflow,
        PythonCommand(
            latex_end,
            input={"template": resource_filename("chilin2.modules.summary", "end.tex")},
            output={"latex": end_tex}))
def stat_fastqc(workflow, conf):  # collect raw reads quality and GC contents
    """Attach collection of FastQC raw-read quality/GC results.

    long: generate long pages or not
    """
    sums = []
    # FIX: the raw filename from each (raw, target) pair was bound but unused
    for _, target in conf.sample_pairs:
        if conf.pe:
            # paired-end: FastQC output exists for the first mate only
            sums.append(target[0] + "_100k_fastqc/fastqc_data.txt")
        else:
            sums.append(target + "_100k_fastqc/fastqc_data.txt")
    collect = attach_back(
        workflow,
        PythonCommand(json_fastqc,
                      input={"fastqc_summaries": sums},
                      output={"json": conf.json_prefix + "_fastqc.json"},
                      param={"ids": conf.sample_bases,
                             "id": conf.id},
                      name="collect fastqc results"))
    collect.allow_fail = True
    collect.allow_dangling = True
    if conf.long:
        ## prepare long document images and tex
        long_collect = attach_back(
            workflow,
            PythonCommand(fastqc_detailed_figure,
                          name='fastqc',
                          input={"dbaccessor": resource_filename("chilin2.modules.dbaccessor", "ChiLinQC.db"),
                                 "template": resource_filename("chilin2.modules.summary", "R_culmulative_plot.R"),
                                 "json": conf.json_prefix + "_fastqc.json"},
                          output={"R": conf.prefix + "_raw_sequence_qc.R",
                                  "pdf": conf.prefix + "_raw_sequence_qc.pdf"},
                          param={"ids": conf.sample_bases}))
        long_collect.allow_fail = True
        long_collect.allow_dangling = True
def stat_pbc(workflow, conf):  # collect pbc value
    """Gather the per-sample PBC statistics (*.pbc files) into one JSON."""
    pbc_files = [t + ".pbc" for t in conf.sample_targets]
    attach_back(
        workflow,
        PythonCommand(json_pbc,
                      input={"pbc": pbc_files},
                      output={"json": conf.json_prefix + "_pbc.json"},
                      param={"samples": conf.sample_bases}))
def tex_fastqc(workflow, conf):
    """Render the FastQC report sections: sequence quality and GC content."""
    quality = attach_back(
        workflow,
        PythonCommand(load_latex,
                      input={"json": conf.json_prefix + "_fastqc.json",
                             "template": resource_filename("chilin2.modules.fastqc", "fastqc.tex"),
                             "pdf": conf.prefix + "_raw_sequence_qc.pdf"},
                      output={"latex": conf.latex_prefix + "_fastqc.tex"}))
    quality.allow_dangling = True
    quality.allow_fail = True
    # (display name, GC-content png) pairings, one per sample; the FastQC
    # output directory name differs between single- and paired-end runs
    if conf.pe:
        dir_pattern = "%spair1_100k_fastqc"
    else:
        dir_pattern = "%s_100k_fastqc"
    gccontent_graphs = [
        (nm.replace("_", " "),
         os.path.join(conf.target_dir, dir_pattern % nm, "Images",
                      "per_sequence_gc_content.png"))
        for nm in conf.sample_bases]
    gc = attach_back(
        workflow,
        PythonCommand(load_gc_latex,
                      input={"template": resource_filename("chilin2.modules.fastqc", "fastqc_gc.tex"),
                             "gccontent_graphs": gccontent_graphs},
                      output={"latex": conf.latex_prefix + "_fastqc_gc.tex"}))
    gc.allow_dangling = True
    gc.allow_fail = True
def stat_phan(workflow, conf):
    """Collect NSC/RSC/Qtag values and the cross-correlation figure data."""
    spp_files = [t + ".spp" for t in conf.sample_targets]
    attach_back(
        workflow,
        PythonCommand(json_phan,
                      input={"spp": spp_files},
                      output={"json": conf.json_prefix + "_phan.json"},
                      param={"sample": conf.sample_bases}))
def tex_conserv(workflow, conf):
    """Emit the conservation section of the LaTeX report."""
    section = attach_back(
        workflow,
        PythonCommand(latex_conservation,
                      input={"template": resource_filename("chilin2.modules.conservation", "conservation.tex")},
                      output={"latex": conf.latex_prefix + "_conserv.tex"},
                      param={"prefix": conf.prefix}))
    # best-effort: the report is still generated if conservation failed
    section.allow_fail = True
    section.allow_dangling = True
def tex_bwa(workflow, conf):
    """Emit the read-mapping (long report) section of the LaTeX document."""
    section = attach_back(
        workflow,
        PythonCommand(long_tex,
                      input={"template": resource_filename("chilin2.modules.bwa", "bwa.tex"),
                             "figure": conf.prefix + "_bwa_compare.pdf"},
                      output={"latex": conf.latex_prefix + "_map.tex"}))
    section.allow_dangling = True
    section.allow_fail = True
def stat_frip(workflow, conf):  # collect frip score
    """Collect FRiP: informative tag counts and effective peak numbers."""
    frip_files = [t + ".frip" for t in conf.sample_targets]
    collect = attach_back(
        workflow,
        PythonCommand(json_frip,
                      input={"frip": frip_files},
                      output={"json": conf.json_prefix + "_frip.json"},
                      param={"samples": conf.sample_bases}))
    collect.allow_dangling = True
    collect.allow_fail = True
def stat_bedAnnotate(workflow, conf, has_dhs, has_velcro):
    """Summarize peak distribution: meta gene plus optional DHS/velcro overlap."""
    meta_step = attach_back(
        workflow,
        PythonCommand(json_meta2,
                      input={"meta": conf.prefix + ".meta"},
                      output={"json": conf.json_prefix + "_meta.json"},
                      param={"id": conf.id},
                      name="bedAnnotate summary"))
    meta_step.allow_fail = True
    meta_step.allow_dangling = True
    if has_dhs:
        dhs_step = attach_back(
            workflow,
            PythonCommand(json_dhs,
                          input={"dhs": conf.prefix + ".dhs",
                                 "top_peaks": 5000},
                          output={"json": conf.json_prefix + "_dhs.json"},
                          name="DHS summary"))
        dhs_step.allow_fail = True
        dhs_step.allow_dangling = True
    if has_velcro:
        velcro_step = attach_back(
            workflow,
            PythonCommand(json_velcro,
                          input={"velcro": conf.prefix + ".velcro",
                                 "top_peaks": 5000},
                          output={"json": conf.json_prefix + "_velcro.json"},
                          name="Velcro summary"))
        velcro_step.allow_fail = True
        velcro_step.allow_dangling = True
def stat_ceas(workflow, conf, has_dhs, has_velcro):  # collect meta gene distribution info
    """Describe peaks' distribution.

    ###########################################################################
    DEPRECATED!!!!--see stat_bedAnnotate below
    ###########################################################################
    """
    attach_back(
        workflow,
        PythonCommand(json_meta,
                      input={"meta": conf.prefix + ".meta",
                             "top_peaks": 5000},
                      output={"json": conf.json_prefix + "_meta.json"},
                      param={"id": conf.id},
                      # FIX: was mislabeled "DHS summary" (copy-paste from the
                      # DHS step below); this step summarizes meta gene info
                      name="Meta summary"))
    if has_dhs:
        attach_back(
            workflow,
            PythonCommand(json_dhs,
                          input={"dhs": conf.prefix + ".dhs",
                                 "top_peaks": 5000},
                          output={"json": conf.json_prefix + "_dhs.json"},
                          name="DHS summary"))
    if has_velcro:
        attach_back(
            workflow,
            PythonCommand(json_velcro,
                          input={"velcro": conf.prefix + ".velcro",
                                 "top_peaks": 5000},
                          output={"json": conf.json_prefix + "_velcro.json"},
                          name="Velcro summary"))
def stat_conservation(workflow, conf):
    """Collect the conservation-score JSON; optionally the legacy cluster figure."""
    collect = attach_back(
        workflow,
        PythonCommand(json_conservation,
                      input={"score": conf.prefix + "_conserv.txt"},
                      output={"json": conf.json_prefix + "_conserv.json"},
                      param={"atype": conf.get("basics", "factor", "TF"),
                             "id": conf.id},
                      name="conservation score"))
    collect.allow_fail = True
    collect.allow_dangling = True
    if conf.long:
        ## cluster figures, obsolete, kept for compatibility
        fig = attach_back(
            workflow,
            PythonCommand(conservation_figures,
                          input={"conservationR": conf.prefix + "_conserv.R",
                                 "historical_conservation_cluster_text": resource_filename("chilin2.modules.dbaccessor", "Histone_centers.txt")},
                          output={"R": conf.prefix + "_conserv_cluster.R",
                                  "compare_pdf": conf.prefix + "_conserv_compare.pdf"},
                          param={"id": conf.id}))
        fig.allow_dangling = True
        fig.allow_fail = True
def stat_motif(workflow, conf):
    """Collect motif enrichment info from the seqpos output list."""
    collect = attach_back(
        workflow,
        PythonCommand(
            stat_seqpos,
            input={"seqpos": conf.prefix + "_seqpos/" + "motif_list.json"},
            output={"json": conf.json_prefix + "_seqpos.json"},
            param={"prefix": conf.prefix + "_seqpos/seqLogo/",
                   "z_score_cutoff": -1},
            name="collect motif info"))
    collect.allow_dangling = True
    collect.allow_fail = True
def tex_frip(workflow, conf):
    """Emit the FRiP section of the LaTeX report."""
    section = attach_back(
        workflow,
        PythonCommand(load_latex,
                      input={"json": conf.json_prefix + "_frip.json",
                             "template": resource_filename("chilin2.modules.frip", "frip.tex")},
                      output={"latex": conf.latex_prefix + "_frip.tex"}))
    section.allow_fail = True
    section.allow_dangling = True
def stat_bwa(workflow, conf):  ## use samtools to parse mappable reads from bwa
    """Count total/mapped reads per sample and collect them into JSON.

    bam files are filtered by samtools -q 1, so mapped reads are considered
    to be unique.
    """
    for t in conf.sample_targets:
        stat = attach_back(workflow, ShellCommand(
            """
            {tool} view -Sc {input[sam]} > {output[total]}
            {tool} flagstat {input[bam]} > {output[stat]}
            """,
            tool="samtools",
            input={"bam": t + ".bam", "sam": t + ".sam"},
            output={"stat": t + "_mapped.bwa", "total": t + "_total.bwa"}))
        stat.allow_fail = True
        stat.allow_dangling = True
    collect = attach_back(
        workflow,
        PythonCommand(
            json_bwa,
            input={"bwa_mapped": [t + "_mapped.bwa" for t in conf.sample_targets],
                   "bwa_total": [t + "_total.bwa" for t in conf.sample_targets]},
            output={"json": conf.json_prefix + "_map.json"},
            param={"sample": conf.sample_bases},
            name="bwa qc"))
    collect.allow_dangling = True
    collect.allow_fail = True
    if conf.long:
        long_collect = attach_back(
            workflow,
            PythonCommand(
                bwa_figures,
                input={"dbaccessor": resource_filename("chilin2.modules.dbaccessor", "ChiLinQC.db"),
                       "json": conf.json_prefix + "_map.json",
                       "template": resource_filename("chilin2.modules.summary", "R_culmulative_plot.R")},
                output={"pdf": conf.prefix + "_bwa_compare.pdf",
                        "R": conf.prefix + "_bwa_compare.R"},
                param={"sample": conf.sample_bases}))
        long_collect.allow_fail = True
        # FIX: allow_fail was assigned twice; the second assignment should set
        # allow_dangling, matching every other long-report step in this module
        long_collect.allow_dangling = True
def stat_macs2_on_rep(workflow, conf):
    """Collect per-replicate MACS2 peak stats when at least two replicates exist."""
    is_narrow = conf.get("macs2", "type") in ["both", "narrow"]
    suffix = "_peaks.xls" if is_narrow else "_b_peaks.xls"
    xls = [t + suffix for t in conf.treatment_targets]
    if len(conf.treatment_targets) > 1:
        collect = attach_back(
            workflow,
            PythonCommand(json_macs2_on_reps,
                          input={"all_peak_xls": xls},
                          output={"json": conf.json_prefix + "_macs2_rep.json"},
                          param={"samples": conf.treatment_bases}))
        collect.allow_dangling = True
        collect.allow_fail = True
def tex_motif(workflow, conf):
    """Render the motif section of the LaTeX report from the seqpos JSON."""
    section = attach_back(
        workflow,
        PythonCommand(latex_seqpos,
                      input={"template": resource_filename("chilin2.modules", "mdseqpos/motif.tex"),
                             "json": conf.json_prefix + "_seqpos.json"},
                      output={"tex": conf.latex_prefix + "_motif.tex"},
                      param={"id": conf.id},
                      name="generating latex of motif info"))
    section.allow_dangling = True
    section.allow_fail = True
def fragment(workflow, conf):
    """Estimate fragment size via `macs2 predictd` and parse the model R files.

    This runs after FRiP.
    """
    # fall back to plain "macs2" when no explicit binary is configured
    macs2_bin = conf.get("tool", "macs2") or "macs2"
    for target in conf.treatment_targets:
        model_step = attach_back(
            workflow,
            ShellCommand(
                "{tool} predictd -i {input[bam]} --rfile {param[prefix]} -g {param[species]}",
                tool=macs2_bin,
                input={"bam": target + ".bam"},
                output={"R": target + "_model.R"},
                param={"prefix": target + "_model.R",
                       "species": 'hs'}))
        model_step.update(param=conf.items("macs2"))
        ## modeling can fail when too few peaks are available
        model_step.allow_fail = True
        model_step.allow_dangling = True
    ## extract standard deviation from MACS2 model.R:
    ## use m, p, and pileup value for standard deviation; mean fragment size
    ## is provided (choose the one with highest correlation)
    frag_qc = attach_back(
        workflow,
        PythonCommand(
            stat_frag_std,
            input={"r": [target + "_model.R" for target in conf.treatment_targets]},
            output={"json": conf.json_prefix + "_frag.json",
                    "r": [target + "_frag_sd.R" for target in conf.treatment_targets]},
            param={"samples": conf.treatment_bases,
                   "frag_tool": "BAMSE"},
            name="macs2 model R script parser"))
    frag_qc.allow_fail = True
    frag_qc.allow_dangling = True
def tex_phan(workflow, conf):
    """Emit the phantompeak (cross-correlation) section of the report."""
    # downsampled runs produce *_4000000.pdf figures instead of *.pdf
    pdf_suffix = "_4000000.pdf" if conf.down else ".pdf"
    figures = [t + pdf_suffix for t in conf.sample_targets]
    attach_back(
        workflow,
        PythonCommand(long_tex,
                      input={"template": resource_filename("chilin2.modules.phantompeak", "phan.tex"),
                             "figure": figures},
                      output={"latex": conf.latex_prefix + "_phan.tex"}))
def stat_macs2(workflow, conf):  # collect peaks
    """Collect peak-calling statistics from the merged MACS2 xls.

    merged peaks and replicates peaks
    high confident peaks
    duplicates level
    """
    if conf.get("macs2", "type") in ["both", "narrow"]:
        xls = conf.prefix + "_peaks.xls"
    else:
        xls = conf.prefix + "_b_peaks.xls"
    collect = attach_back(
        workflow,
        PythonCommand(json_macs2,
                      input={"macs2_peaks_xls": xls},
                      output={"json": conf.json_prefix + "_macs2.json"},
                      param={"id": conf.id}))
    collect.allow_fail = True
    collect.allow_dangling = True
def tex_contamination(workflow, conf):
    """Render the contamination table section of the LaTeX report."""
    species = [name for name, _ in conf.items("contamination")]
    # one centered column per species plus one for the sample label
    layout = 'c' * (len(species) + 1)
    section = attach_back(
        workflow,
        PythonCommand(latex_contamination,
                      input={"template": resource_filename("chilin2.modules", "contamination/contamination.tex"),
                             "json": conf.json_prefix + "_contam.json"},
                      output={"latex": conf.latex_prefix + "_contam.tex"},
                      param={'id': conf.id,
                             'layout': layout}))
    section.allow_dangling = True
    section.allow_fail = True
def stat_contamination(workflow, conf):
    """Collect per-sample contamination mapping counts into one JSON."""
    species_list = [name for name, _ in conf.items("contamination")]
    # per sample: a list of (mapped, total) count-file pairs, one per species
    summaries = []
    for target in conf.sample_targets:
        summaries.append([(target + sp + "_mapped." + conf.mapper,
                           target + sp + "_total." + conf.mapper)
                          for sp in species_list])
    collect = attach_back(
        workflow,
        PythonCommand(json_contamination,
                      input={"summaries": summaries},
                      output={"json": conf.json_prefix + "_contam.json"},
                      param={"samples": conf.sample_bases,
                             "id": conf.id,
                             "species": species_list},
                      name="stat contamination"))
    collect.allow_fail = True
    collect.allow_dangling = True
def summary_table_latex(workflow, conf):
    """Emit the LaTeX summary table with evenly sized, centered columns."""
    n_samples = len(conf.sample_bases)
    # split the line width across samples plus one label column,
    # with a 0.05 margin subtracted per column
    col_width = 1 / float(n_samples + 1) - 0.05
    label_col = "m{%s\\linewidth}" % (col_width)
    sample_col = ">{\\centering\\arraybackslash}m{%s\\linewidth}" % (col_width)
    table = attach_back(
        workflow,
        PythonCommand(
            latex_summary_table,
            input={"template": resource_filename("chilin2.modules.summary", "summary_table.tex")},
            output={"latex": conf.latex_prefix + "_summary_table.tex"},
            param={"conf": conf,
                   "layout": label_col + sample_col * n_samples}))
    table.allow_dangling = True
    table.allow_fail = True
def stat_replicates(workflow, conf):  ## replicates peaks and bigwiggle
    """Collect replicate consistency metrics.

    input: wigCorrelate of multiple replicates results
           replicates peaks overlap number (percentage: 0.3)
    output: *replicates.json
    """
    n_reps = len(conf.treatment_targets)
    # one overlap file per unordered replicate pair (i < j)
    overlaps = [conf.prefix + "_%s_%s.overlap" % (i, j)
                for i in range(n_reps)
                for j in range(i + 1, n_reps)]
    collect = attach_back(
        workflow,
        PythonCommand(json_reps,
                      input={"cor": conf.prefix + ".cor",
                             "overlap": overlaps},
                      output={"json": conf.json_prefix + "_rep.json"},
                      param={"param": conf.id}))
    collect.allow_dangling = True
    collect.allow_fail = True
def write_conf(workflow, conf):
    """Persist the parsed configuration file alongside the other outputs."""
    attach_back(workflow,
                PythonCommand(WriteConf,
                              output=conf.prefix + ".conf",
                              param={"conf": conf}))
def read_enrichment_on_meta(workflow, conf):
    """Compute total-read enrichment in exon, promoter and union DHS regions.

    For each sample, counts reads overlapping exon/promoter regions (and DHS
    when the species config provides one), then collects the counts into JSON.
    """
    import os
    try:
        has_dhs = conf.get(conf.get("basics", "species"), "dhs")
    # FIX: narrowed from a bare `except:` (which also swallowed
    # KeyboardInterrupt/SystemExit); a missing section/option means no DHS
    except Exception:
        has_dhs = ""
    for t in conf.sample_targets:
        # downsampled runs use the 4M-read bam
        bam = t + "_4000000.bam" if conf.down else t + ".bam"
        enrich = attach_back(
            workflow,
            ShellCommand(
                """
                exon=$(bedtools intersect -f {param[p]} -wa -u -abam {input[bam]} -b {param[exon]} -bed | wc -l)
                promoter=$(bedtools intersect -f {param[p]} -wa -u -abam {input[bam]} -b {param[promoter]} -bed | wc -l)
                total=$(samtools flagstat {input[bam]} | head -1 | cut -d" " -f1)
                echo $exon,$promoter,$total > {output[meta]}
                """,
                # NOTE(review): the commands actually invoke bedtools/samtools;
                # `tool` looks vestigial here — confirm before changing
                tool="coverageBed",
                input={"bam": bam},
                output={"meta": t + ".enrich.meta"},
                param={"promoter": os.path.join(conf.target_dir, "gene.bed_promoter"),
                       "p": "1E-9",
                       "exon": os.path.join(conf.target_dir, "gene.bed_exon")}))
        enrich.allow_dangling = True
        enrich.allow_fail = True
        if has_dhs:
            dhs = attach_back(
                workflow,
                ShellCommand(
                    """
                    dhs=$(bedtools intersect -f {param[p]} -wa -u -abam {input[bam]} -b {param[dhs]} -bed | wc -l)
                    total=$(samtools flagstat {input[bam]} | head -1 | cut -d" " -f1)
                    echo $dhs,$total > {output[dhs]}
                    """,
                    tool="coverageBed",
                    input={"bam": bam,
                           "dhs": conf.get_path(conf.get("basics", "species"), "dhs")},
                    output={"dhs": t + ".enrich.dhs"},
                    param={"p": "1E-9",
                           "dhs": conf.get_path(conf.get("basics", "species"), "dhs")},
                ))
            dhs.allow_fail = True
            dhs.allow_dangling = True
    em = attach_back(
        workflow,
        PythonCommand(
            enrich_in_meta,
            input={"meta": [t + ".enrich.meta" for t in conf.sample_targets],
                   "mapped": [t + "_mapped.bwa" for t in conf.sample_targets]},
            ## use 4M reads for down sampling ones, and all reads instead
            output={"json": conf.json_prefix + "_enrich_meta.json"},
            param={"samples": conf.sample_bases,
                   "id": conf.id,
                   "has_dhs": has_dhs,
                   "down": conf.down,
                   "dhs": [t + ".enrich.dhs" for t in conf.sample_targets]}))
    em.allow_fail = True
    em.allow_dangling = True