# Plot the length distribution of predicted amplicons (histogram page + density page)
# for one primer pair / database into a PDF. Python %-formatting fills in file names
# and primer settings first; R braces are doubled because R() (presumably
# snakemake.utils.R) runs the string through str.format afterwards.
R("""
library(ggplot2)
library(scales)
library(gridExtra)

# read.table() fails on an empty input file; any read failure falls back to the
# placeholder page, but the reason is reported instead of being discarded.
data <- tryCatch(
    read.table("%(infile)s", header=FALSE),
    error=function(e) {{
        message("Could not read %(infile)s: ", conditionMessage(e))
        data.frame()
    }}
)

if (nrow(data) == 0) {{
    # Placeholder page so that the output file is still produced.
    pdf("%(outfile)s")
    plot(0:10, type = "n", xaxt="n", yaxt="n", bty="n", xlab = "", ylab = "")
    text(5, 8, "No data", cex=2)
    dev.off()
}} else {{
    colnames(data) <- c("seqId", "len")
    # Title shared by both pages; plotmath atop() stacks the lines.
    plotTitle <- expression(atop("Length distribution of predicted amplicons", atop("with primer pair: %(primers)s on database: %(db)s", atop("min. Temp.: %(minT).2f, max. Temp.: %(maxT).2f, max. Len.: %(len)i"))))
    plotList <- list()
    plotList[[1]] <- ggplot(data, aes(x=len)) + geom_histogram(binwidth=1, color="black") + ggtitle(plotTitle) + xlab("length")
    plotList[[2]] <- ggplot(data, aes(x=len)) + geom_density() + ggtitle(plotTitle) + xlab("length")
    pdf("%(outfile)s")
    print(plotList)
    dev.off()
}}
""" % {"infile": input[0],
       "outfile": output[0],
       "primers": wildcards.pair_name,
       "db": wildcards.dbname,
       "len": config["primer_pairs"][wildcards.pair_name]["max_len"],
       "maxT": config["primer_pairs"][wildcards.pair_name]["max_temp"],
       "minT": config["primer_pairs"][wildcards.pair_name]["min_temp"],
       }
  )
# For at most the first 16 rows of the association table, draw one PNG per SNP set:
# a phenotype-by-haplotype boxplot with the individual points overlaid and a table of
# haplotype counts. R braces are doubled because the string is str.format-ed.
R("""
library(SKAT)
library(stringr)
library(plotrix)

assoc <- read.delim('{input.assoc}', sep=" ")
n <- min(nrow(assoc), 16)
# seq_len() yields an empty loop when the association table has no rows.
for (i in seq_len(n)) {{
    row <- assoc[i, ]
    ssd <- paste0('ssd/{wildcards.dataset}.chr', row$CHR, '.SSD')
    fam <- paste0('bed/{wildcards.dataset}.chr', row$CHR, '.fam')
    info <- paste0('ssd/{wildcards.dataset}.chr', row$CHR, '.Info')
    pheno <- '{input.pheno}'
    FAM <- Read_Plink_FAM_Cov(fam, pheno, Is.binary=FALSE)
    y <- FAM${wildcards.pheno}

    SSD.INFO <- Open_SSD(File.SSD=ssd, File.Info=info)
    SetIndex <- which(SSD.INFO$SetInfo$SetID == row$SetID)
    # SKAT documents that an SSD opened with Open_SSD() must be closed with
    # Close_SSD(); close it even when reading the genotypes fails.
    genos <- tryCatch(
        Get_Genotypes_SSD(SSD.INFO, SetIndex, TRUE),
        finally = Close_SSD()
    )
    genos <- as.data.frame(genos)
    # Remember the SNP columns before the phenotype and haplotype columns are added.
    snp_cols <- colnames(genos)
    genos${wildcards.pheno} <- y
    genos$haplotypes <- as.factor(do.call(paste0, genos[snp_cols]))

    png(paste0('{output}', str_pad(i, 2, pad="0"), '_', row$SetID, '.png'))
    boxplot(genos${wildcards.pheno} ~ genos$haplotypes,
            main=paste0('{wildcards.dataset}', ' chr', row$CHR),
            ylab='{wildcards.pheno}', xlab=row$SetID)
    colors <- ifelse(FAM$braak > 3, 'red', 'green')
    points(genos${wildcards.pheno} ~ genos$haplotypes, col=colors)
    mtext(paste(snp_cols, collapse=" "), side=3, outer=F)

    # count genotypes
    # Built with cbind() so the table stays a two-column matrix even when only a
    # single haplotype is present.
    hap_counts <- summary(genos$haplotypes)
    pad_width <- nchar(names(hap_counts)[1])
    f <- cbind(Hap=str_pad(names(hap_counts), width=pad_width, pad=" "),
               n=str_pad(as.character(hap_counts), width=pad_width, pad=" "))
    addtable2plot(x="bottomright", table=f)
    dev.off()
}}
""")
workdir: "workflow4"

from snakemake.utils import R

SAMPLES = ["GSM521934", "GSM521935"]

# First rule in the file: requests one sorted BAM per sample.
rule all:
    input: expand("{sample}_sorted.bam", sample = SAMPLES)

# Convert a SAM file to BAM with samtools.
rule sam_to_bam:
    input: "{file}.sam"
    output: "{file}.bam"
    params: threads = 2
    log: "{file}.log"
    benchmark: "{file}.json"
    # stdout of samtools is the BAM stream and already goes to {output}; only stderr
    # is left to capture, so the log must be fed with 2> (a plain > left it empty).
    shell: "(samtools view -bS --threads {params.threads} {input} > {output}) 2> {log}"

# Sort a BAM file with Rsamtools.
rule bam_sorted:
    input: "{file}.bam"
    output: "{file}_sorted.bam"
    run:
        # The result is written to "<output>.bam" and then renamed back to the
        # declared output name (file_path_sans_ext strips the extra ".bam").
        R("""
        library(Rsamtools)
        library(tools)
        sortBam("{input}", "{output}")
        file.rename("{output}.bam", file_path_sans_ext("{output}.bam"))
        """)
def boxplot(input, output, x, y, x_log_scale_base=None, y_log_scale_base=None,
            remove_na=False, remove_outlier=False):
    """Assemble a ggplot2 boxplot script, print it, and hand it to R().

    Args:
        input: Path of the delimited table read with read.delim().
        output: Path of the image written by ggsave().
        x: Column name used for the x aesthetic and for grouping (inserted verbatim).
        y: Column name used for the y aesthetic (inserted verbatim).
        x_log_scale_base: If not None, the x axis uses the transformation
            named 'log<base>'.
        y_log_scale_base: Same for the y axis.
        remove_na: If true, rows containing any NA are dropped before plotting.
        remove_outlier: If true, outlier points are hidden and the y range is
            cropped around the whisker ends returned by boxplot.stats().
    """
    command = """
    library(ggplot2)
    library(scales)
    d <- read.delim("{input}", header=TRUE)
    """

    if remove_na:
        command += """
    d <- d[complete.cases(d),]
    """

    command += """
    g <- ggplot(d, aes(x={x}, y={y}, group={x}))
    """

    if remove_outlier:
        # NOTE(review): ylim1 is computed over all groups pooled, and the later
        # "* 1.05" moves a positive lower limit upwards - confirm this is intended.
        command += """
    g <- g + geom_boxplot(outlier.shape = NA)
    ylim1 <- boxplot.stats(d${y})$stats[c(1, 5)]
    # g <- g + geom_point()
    """
    else:
        command += """
    g <- g + geom_boxplot()
    # g <- g + geom_point()
    """

    if x_log_scale_base is not None:
        command += """
    g <- g + scale_x_continuous(
        trans = 'log{x_log_scale_base}',
        breaks = pretty_breaks()
        # labels = trans_format('log{x_log_scale_base}', math_format({x_log_scale_base}^.x))
    )
    """

    if y_log_scale_base is not None:
        command += """
    g <- g + scale_y_continuous(
        trans = 'log{y_log_scale_base}',
        breaks = pretty_breaks()
        # labels = trans_format('log{y_log_scale_base}', math_format({y_log_scale_base}^.x))
    )
    """

    if remove_outlier:
        command += """
    g <- g + coord_cartesian(ylim = ylim1*1.05)
    """

    # The plot is saved from inside the output directory; "finally" puts the
    # working directory back even when ggsave() fails.
    command += """
    pwd <- getwd()
    setwd(dirname("{output}"))
    tryCatch(
        ggsave(file=basename("{output}"), plot=g),
        finally = setwd(pwd)
    )
    """

    command = command.format(input=input, output=output, x=x, y=y,
                             x_log_scale_base=x_log_scale_base,
                             y_log_scale_base=y_log_scale_base)
    print(command)
    R(command)
def summmarized_plot(input, output, x, y1, y2, y1_op="sd", y2_op="mean",
                     x_log_scale_base=None, y_log_scale_base=None, remove_na=False):
    """Assemble a ggplot2 script plotting two per-x summaries, print it, and run it via R().

    The table is grouped by ``x`` with plyr::ddply; ``y1_op(y1)`` and ``y2_op(y2)``
    are computed per group and drawn as two coloured point/line series.

    Args:
        input: Path of the delimited table read with read.delim().
        output: Path of the image written by ggsave().
        x: Grouping column name (inserted verbatim into the R code).
        y1, y2: Names of the columns to summarise.
        y1_op, y2_op: Names of the R summary functions applied to y1 and y2.
        x_log_scale_base: If not None, the x axis uses the transformation
            named 'log<base>'.
        y_log_scale_base: Same for the y axis.
        remove_na: If true, rows containing any NA are dropped before summarising.
    """
    command = """
    library(ggplot2)
    library(scales)
    library(plyr)
    d <- read.delim("{input}", header=TRUE)
    """

    if remove_na:
        command += """
    d <- d[complete.cases(d),]
    """

    command += """
    d_stat <- ddply(d, .({x}), summarize, "{y1_op}_{y1}"={y1_op}({y1}), "{y2_op}_{y2}"={y2_op}({y2}))
    d1 <- data.frame(d_stat${x}, d_stat${y1_op}_{y1}, "{y1_op}_{y1}")
    colnames(d1) <- c("{x}", "{y1}", "group")
    d2 <- data.frame(d_stat${x}, d_stat${y2_op}_{y2}, "{y2_op}_{y2}")
    # d2 reuses the y1 column name so rbind() can stack both summaries in one column.
    colnames(d2) <- c("{x}", "{y1}", "group")
    d_merged <- rbind(d1, d2)
    g <- ggplot(d_merged, aes(x={x}, y={y1}, group=group, color=group))
    g <- g + geom_point()
    g <- g + geom_line()
    """

    if x_log_scale_base is not None:
        command += """
    g <- g + scale_x_continuous(
        trans = 'log{x_log_scale_base}',
        breaks = pretty_breaks()
        # breaks = trans_breaks('log{x_log_scale_base}', function(x) {x_log_scale_base}^(x/2)),
        # labels = trans_format('log{x_log_scale_base}', math_format({x_log_scale_base}^.x))
    )
    """

    if y_log_scale_base is not None:
        command += """
    g <- g + scale_y_continuous(
        trans = 'log{y_log_scale_base}',
        breaks = pretty_breaks()
        # breaks = trans_breaks('log{y_log_scale_base}', function(x) {y_log_scale_base}^(x/2)),
        # labels = trans_format('log{y_log_scale_base}', math_format({y_log_scale_base}^.x))
    )
    """

    command += """
    g <- g + ylab(NULL)
    g <- g + scale_color_hue(name="", labels=c({y1_op}_{y1}="{y1_op}({y1})", {y2_op}_{y2}="{y2_op}({y2})"))
    """

    # The plot is saved from inside the output directory; "finally" puts the
    # working directory back even when ggsave() fails.
    command += """
    pwd <- getwd()
    setwd(dirname("{output}"))
    tryCatch(
        ggsave(file=basename("{output}"), plot=g),
        finally = setwd(pwd)
    )
    """

    command = command.format(input=input, output=output, x=x, y1=y1, y2=y2,
                             y1_op=y1_op, y2_op=y2_op,
                             x_log_scale_base=x_log_scale_base,
                             y_log_scale_base=y_log_scale_base)
    print(command)
    R(command)
# Merge the per-sample count tables on their gene column, write the raw and the
# filtered count matrices, plot the filtered count densities, and write TMM-normalised
# CPM values. {DIR}, {MINCOUNT} and {MINSAMPLES} are filled in by the str.format step
# applied to this string; R braces are doubled for the same reason.
R("""
library('reshape')
library('ggplot2')
library('edgeR')
setwd("{DIR}")
myfiles=as.character(unlist(strsplit("{input.files}", split=" ")))
res=read.delim(myfiles[1],header=TRUE)
colnames(res)[1]="gene"
colnames(res)[2]=as.character(myfiles[1])
# remove the last 5 statistics lines ...
# nr=dim(res)[1]
# res=res[-c((nr-4):nr),]
#
# seq_along(myfiles)[-1] is empty when there is a single count file, whereas
# seq(2, 1, by = 1) would raise an error.
for(i in seq_along(myfiles)[-1]) {{
    temp=read.delim(myfiles[i],header=TRUE)
    colnames(temp)[1]="gene"
    colnames(temp)[2]=as.character(myfiles[i])
    res=merge(res,temp)
}}
write.table(as.data.frame(res),file="RawCountFile.txt",sep="\t",row.names=FALSE)
#
mydata=read.delim("RawCountFile.txt",row.names=1)
val1=as.numeric("{MINCOUNT}")
val2=as.numeric("{MINSAMPLES}")
cat(val1," ", val2, "checking..\n",file="check.txt")
# Keep genes whose count exceeds val1 in at least val2 samples.
filter <- apply(mydata, 1, function(x) length(x[x>val1])>=val2)
res=mydata[filter,]
write.table(as.data.frame(res),file="RawCountFile_filtered.txt",sep="\t",col.names=NA)
png("HistBeforenormFilter.png")
df.m <- melt(as.data.frame(res))
print(ggplot(df.m) + geom_density(aes(x = value, colour = variable)) + labs(x = NULL) + theme(legend.position='top') + scale_x_log10())
dev.off()
y = DGEList(counts=res)
## Normalization TMM ------------------------------------------------------------
## method = =c("TMM","RLE","upperquartile","none")
y <- calcNormFactors(y,method="TMM")
ndata= cpm(y,log=FALSE,normalized.lib.sizes=TRUE)
## save it
write.table(ndata,file="CPM_TMM_counts.txt",sep="\t",col.names=NA)
""")
out.write("\t".join([",".join(seqList) for seqList in listTab.values()])) with open(output.counts, "w") as out: for name, seqList in listTab.items(): out.write("%s\t%i\n" % (name, len(seqList))) with open(output.readOtuMap, "w") as out: for otu, seqList in listTab.items(): for seq in seqList: out.write("%s\t%s\n" % (seq, otu)) rule plotOtuSizeDist: input: "{sample}.full.good.unique.abund.otus.counts.tsv" output: "{sample}.full.good.unique.abund.otus.sizeDist.pdf" run: R("library(ggplot2)\nd=read.table(\"%s\")\npd = as.data.frame(cbind(seq(1,length(d$V2)), d$V2[order(d$V2, decreasing=T)]))\np=ggplot(pd) + geom_segment(aes(x=V1, xend=V1, y=V2, yend=0)) + xlab(\"OTU rank\") + ylab(\"size\") + ggtitle(\"%s - OTU size distribution\")\nggsave(\"%s\", p)" % (input[0], "{wildcards.sample}", output[0])) rule computRarefaction: input: "{sample}.full.good.unique.abund.otus.list" output: "{sample}.full.good.unique.abund.otus.rarefaction" shell: "%(mothur)s \"#rarefaction.single(list={input})\"" % config rule plotRarefaction: input: "{sample}.full.good.unique.abund.otus.rarefaction" output: "{sample}.full.good.unique.abund.otus.rarefaction.pdf" run: R("""library(ggplot2) d=read.table("%s", header=T) p = ggplot(d, aes(numsampled)) + geom_point(aes(y=X0.03), colour="blue", size=1) + geom_ribbon(aes(ymin=lci, ymax=hci), alpha=0.2) + xlab("number of reads sampled") + ylab("number of OTUs observed") + ggtitle("Rarefaction curve - %s")
# Count read pairs per gene over all sample BAMs, then derive a compact table that
# keeps the gene id and the per-sample count columns only.
rule quantification_with_featureCounts:
    input:
        novel="samples/new_annotation/all_transcripts.gtf",
        bam=expand("samples/bam/{smp}.bam", smp=SAMPLES)
    output:
        "results/counts/gene_counts.txt",
        "results/counts/gene_counts_mini.txt"
    # Declared so the scheduler accounts for the cores featureCounts uses
    # (previously a hard-coded -T 15 that Snakemake did not know about).
    threads: 15
    shell: """
    featureCounts -p -s 2 -T {threads} -t exon -g gene_id -a {input.novel} -o {output[0]} {input.bam} &> {output[0]}.log
    cut -f 1,7- {output[0]}| awk 'NR > 1' | awk '{{gsub("samples/bam/","",$0); print}}' > {output[1]}
    """

# Diagnostic PDF on log2(count + 1): per-sample histograms, boxplots and a pairs plot.
rule diagnostic_plot:
    input: "results/counts/gene_counts_mini.txt"
    output: "results/diagnostic_plot/diagnostic.pdf"
    run:
        R("""
        # No warning when the directory is already there.
        dir.create("results/diagnostic_plot", showWarnings=FALSE, recursive=TRUE)
        data <- read.table("{input}", sep="\t", header=TRUE, row.names=1)
        data <- data[rowSums(data) > 0, ]
        data <- log2(data + 1)
        pdf("{output}")
        dev.null <- apply(data, 2, hist, border="white", col="blue")
        # boxplot() takes the fill colour as col; color= is not a graphical
        # parameter and was ignored with a warning.
        boxplot(data, col="blue", pch=16)
        pairs(data, pch=".", col="blue")
        dev.off()
        cat("etc...")
        """)
# Keep only the header ('count') and 'TotWrong' lines of an evaluation file, dropping
# lines that contain "ssue" or "earing".
rule limit_for_plot:
    input:  WORK + '{method}.eval_{dat}.{cols}.eval'
    output: WORK + '{method}.eval_{dat}.{cols}.totWrong'
    # grep -E avoids the '\|' sequence, which is an invalid escape in a Python string.
    shell:  "grep -E 'TotWrong|count' {input} | grep -v ssue | grep -v earing > {output}"

# Concatenate the per-dataset totWrong tables of one method/column set.
rule cat:
    input:
        expand(
            WORK + '{{method}}.eval_{dat}.{{cols}}.totWrong',
            dat=('clinvar', 'denovo', 'clinvar_mult', 'clinvar_single', 'clinvar_exp')
        )
    output:
        o = WORK + 'totWrong/{method}.{cols}'
    run:
        pd.concat(
            [pd.read_csv(x, sep='\t') for x in list(input)]
        ).to_csv(output.o, index=False, sep='\t')

# Bar chart of wrong-prediction counts per score type, one facet row per clinvar_type.
rule plot:
    input:  WORK + 'totWrong/{method}.{cols}'
    output: DOCS + 'plot/{method}.other.{cols}.totWrong.png'
    run:
        R("""
        # library() stops with an error when ggplot2 is missing; require() would
        # only return FALSE and fail later.
        library(ggplot2)
        d = read.delim("{input}", sep='\t', header=TRUE)
        p = ggplot(data=d) +
            geom_col(aes(y=var_count, x=score_type, fill=score_type)) +
            facet_grid(clinvar_type~., scales='free') +
            theme_bw() +
            ylab('Wrong Predictions') +
            theme(axis.text.x = element_text(angle=45, hjust=1)) +
            xlab('') +
            theme(legend.position="none")
        ggsave("{output}", p)
        """)

# Request the plots for every column combination of the 'global' method.
rule all_eval:
    input:
        expand(
            DOCS + 'plot/{method}.other.{cols}.totWrong.png',
            method=('global',),
            cols=('mpc', 'revel', 'mpc-revel', 'ccr', 'mpc-revel-ccr', 'mpc-ccr', 'revel-ccr')
        )
out.write("%s\t%i\tssu\t%s\t%i\n" % (oId, size, "\t".join([ssuCls[r] for r in ranks]), ssuDepth)) out.write("%s\t%i\t**s\t%s\t%i\n" % (oId, size, "\t".join([itsCls[r] for r in ranks]), itsDepth)) out.write("%s\t%i\tlsu\t%s\t%i\n" % (oId, size, "\t".join([lsuCls[r] for r in ranks]), lsuDepth)) rule clsSummary: """Collect some summary stats of how many OTUs were assigend to taxonomic ranks for the paper abstract""" input: "taxonomy/{sampleSet}_97_comb.stats.tsv" output: "taxonomy/{sampleSet}_97_clsStats.tsv" run: R(""" d=read.table("{input}", sep="\t") colnames(d) = c("oId", "size", "mrk", "domain", "kingdom", "phylum", "class", "order", "family", "genus", "species", "depth") a=aggregate(depth ~ oId, subset(d, size>1), max) ranks=c("kingdom", "phylum", "class", "order", "family", "genus", "species") s = data.frame(rank=numeric(0), nubmer=numeric(0)) for (i in 1:7) {{ s = rbind(s, data.frame(rank=ranks[i], number=sum(a$depth>=i))) }} write.table(s, "{output}", sep="\t", row.names=F) """) rule plotClsComp: """Create plots of classifications depth""" input: all="taxonomy/{sampleSet}_97_comb.stats.tsv" output: depth="{sampleSet}_clsComp_depth.svg", depthFungi="{sampleSet}_clsComp_depth_fungi.svg", block="{sampleSet}_clsComp_basic.svg" run: R(""" library(reshape2) library(ggplot2)