Python R示例，snakemake.utils.R Python示例

示例#1

0

显示文件

文件： inSilicoPcr_snakefile.py 项目： f-heeger/bioinfomatics-python-scripts

        # Settings of the primer pair being plotted; they appear in the plot title.
        pair_cfg = config["primer_pairs"][wildcards.pair_name]
        # The R template is filled by Python %-formatting first. Braces are
        # doubled because R() is expected to run a str.format-style pass
        # afterwards (assumption about snakemake.utils.R -- confirm).
        R("""
        library(ggplot2)
        library(scales)
        library(gridExtra)

        # Fall back to an empty data frame when the input cannot be read.
        # No return() around read.table: this expression is not inside a
        # function, so the value of tryCatch is simply the value of its body.
        data = tryCatch(read.table("%(infile)s", header=F), error=function(e) data.frame())

        if (dim(data)[1] == 0) {{
            # Nothing to plot: write a placeholder page instead.
            pdf("%(outfile)s")
            plot(0:10, type = "n", xaxt="n", yaxt="n", bty="n", xlab = "", ylab = "")
            text(5, 8, "No data", cex=2)
            dev.off()
        }} else {{

            colnames(data) = c("seqId", "len")

            # Shared title for both pages.
            plotTitle = expression(atop("Length distribution of predicted amplicons", atop("with primer pair: %(primers)s on database: %(db)s", atop("min. Temp.: %(minT).2f, max. Temp.: %(maxT).2f, max. Len.: %(len)i"))))

            plotList = list()
            plotList[[1]] = ggplot(data, aes(x=len)) + geom_histogram(binwidth=1, color="black") + ggtitle(plotTitle) + xlab("length")
            plotList[[2]] = ggplot(data, aes(x=len)) + geom_density() + ggtitle(plotTitle) + xlab("length")
            pdf("%(outfile)s")
            print(plotList)
            dev.off()
        }}
        """ % {
            "infile": input[0],
            "outfile": output[0],
            "primers": wildcards.pair_name,
            "db": wildcards.dbname,
            "len": pair_cfg["max_len"],
            "maxT": pair_cfg["max_temp"],
            "minT": pair_cfg["min_temp"],
        })

示例#2

0

显示文件

文件： Snakefile.skat.py 项目： crs/skat-pipeline

		# For at most the first 16 rows of the association table, write one PNG
		# with a boxplot of the phenotype per haplotype of the row's SNP set.
		R("""

		library(SKAT)
		library(stringr)
		library(plotrix)

		assoc <- read.delim('{input.assoc}', sep=" ")

		n <- min(nrow(assoc),16)

		if (n > 0) {{
		for (i in 1:n) {{

			row <- assoc[i, ]
			ssd = paste('ssd/{wildcards.dataset}.chr',row$CHR,'.SSD', sep="")
			fam = paste('bed/{wildcards.dataset}.chr',row$CHR,'.fam', sep="")
			info = paste('ssd/{wildcards.dataset}.chr',row$CHR,'.Info', sep="")
			pheno <- '{input.pheno}'

			FAM<- Read_Plink_FAM_Cov(fam, pheno, Is.binary=FALSE)
			y <- FAM${wildcards.pheno}

			SSD.INFO = Open_SSD(File.SSD=ssd, File.Info=info)

			SetIndex <- which(SSD.INFO$SetInfo$SetID == row$SetID)

			genos <- Get_Genotypes_SSD(SSD.INFO, SetIndex, TRUE)

			genos <- as.data.frame(genos)

			genos${wildcards.pheno} <- y

			# Paste all genotype columns, i.e. every column except the phenotype
			# column appended last. seq_len states this directly; the former
			# 1:length(...)-1 evaluated as (1:n)-1 and only worked because R
			# silently drops index 0.
			genos$haplotypes <- do.call(paste0, genos[colnames(genos)[seq_len(ncol(genos) - 1)]])

			genos$haplotypes <- as.factor(genos$haplotypes)

			png(paste('{output}',str_pad(i, 2, pad="0"),'_',row$SetID,'.png',sep=""))
			boxplot(genos${wildcards.pheno} ~ genos$haplotypes, main=paste('{wildcards.dataset}',' chr', row$CHR, sep=""), ylab='{wildcards.pheno}', xlab=row$SetID)
			colors <- ifelse(FAM$braak > 3, 'red', 'green')
			points(genos${wildcards.pheno} ~ genos$haplotypes, col=colors)
			# Column names minus the two appended columns, phenotype and haplotypes.
			mtext(paste(colnames(genos)[1:max(1, ncol(genos)-2)], collapse=" "),side=3,outer=F)


			# count genotypes
			f <- summary(genos$haplotypes)
			f <- as.data.frame(f)
			f <- cbind(rownames(f),f$f)
			colnames(f) <- c('Hap','n')
			f <- apply(f, 2, str_pad, width=nchar(f[1,1]), pad=" ")
			addtable2plot(x="bottomright",table=f)

			dev.off()
		}}

		}}


		""")

示例#3

0

显示文件

文件： workflow4.py 项目： Alexenhancer/gene-regulation

# All relative paths in the workflow are resolved against this directory.
workdir: "workflow4"

from snakemake.utils import R

# Sample identifiers the workflow is run for.
SAMPLES = [
    "GSM521934",
    "GSM521935",
]

# Default target: request one sorted BAM file per sample.
rule all:
    input:
        expand("{sample}_sorted.bam", sample=SAMPLES)

# Convert a SAM file into a BAM file with `samtools view`.
rule sam_to_bam:
    input: "{file}.sam"
    output: "{file}.bam"
    params: threads = 2
    log: "{file}.log"
    benchmark: "{file}.json"
    # stdout of samtools carries the BAM data and is written to {output}, so
    # the only stream left to log is stderr. The previous `> {log}` captured
    # the (already redirected, hence empty) stdout of the subshell.
    shell: "(samtools view -bS --threads {params.threads} {input} > {output}) 2> {log}"

# Sort a BAM file from R using Rsamtools::sortBam.
rule bam_sorted:
    input: "{file}.bam"
    output: "{file}_sorted.bam"
    run:
        # The R script sorts {input} and then renames "{output}.bam" to the same
        # path with its last extension removed, i.e. to {output}.
        # NOTE(review): this presumably compensates for sortBam appending ".bam"
        # to the destination it is given -- confirm against the Rsamtools docs.
        R("""
        library(Rsamtools)
        library(tools)

        sortBam("{input}", "{output}")
        file.rename("{output}.bam", file_path_sans_ext("{output}.bam"))
        """)

示例#4

0

显示文件

def boxplot(input, output, x, y, x_log_scale_base=None, y_log_scale_base=None, remove_na=False, remove_outlier=False):
    """Build and run an R/ggplot2 script that draws a boxplot of ``y`` per ``x``.

    The script reads ``input`` with ``read.delim``, groups by column ``x`` and
    saves the figure to ``output`` with ``ggsave`` (changing into the output
    directory while saving and back afterwards).

    Args:
        input: path of the table to read.
        output: path of the image to write.
        x: name of the column used for the x axis and for grouping.
        y: name of the column used for the y axis.
        x_log_scale_base: when not None, the x axis uses a ``log<base>`` transform.
        y_log_scale_base: when not None, the y axis uses a ``log<base>`` transform.
        remove_na: drop incomplete rows before plotting.
        remove_outlier: hide outlier points and restrict the y range to the
            whisker limits reported by ``boxplot.stats`` (scaled by 1.05).

    The generated script is printed before it is handed to ``R``.
    """
    # One log-scaled axis; %(axis)s becomes "x" or "y" here, the base itself
    # is filled in by the final str.format pass.
    log_axis_template = """
        g <- g + scale_%(axis)s_continuous(
            trans = 'log{%(axis)s_log_scale_base}',
            breaks = pretty_breaks()
            # labels = trans_format('log{%(axis)s_log_scale_base}', math_format({%(axis)s_log_scale_base}^.x))
            )
        """

    fragments = ["""
    library(ggplot2)
    library(scales)
    d <- read.delim("{input}", header=T)
    """]

    if remove_na:
        fragments.append("""
        d <- d[complete.cases(d),]
        """)

    fragments.append("""
    g <- ggplot(d, aes(x={x}, y={y}, group={x}))
    """)

    if remove_outlier:
        fragments.append("""
        g <- g + geom_boxplot(outlier.shape = NA)
        ylim1 <- boxplot.stats(d${y})$stats[c(1, 5)]
        # g <- g + geom_point()
        """)
    else:
        fragments.append("""
        g <- g + geom_boxplot()
        # g <- g + geom_point()
        """)

    # x axis first, then y axis, each only when a base was requested.
    for axis, base in (("x", x_log_scale_base), ("y", y_log_scale_base)):
        if base is not None:
            fragments.append(log_axis_template % {"axis": axis})

    if remove_outlier:
        fragments.append("""
        g <- g + coord_cartesian(ylim = ylim1*1.05)
        """)

    fragments.append("""
    pwd <- getwd()
    setwd(dirname("{output}"))
    ggsave(file=basename("{output}"), plot=g)
    setwd(pwd)
    """)

    command = "".join(fragments).format(
        input=input,
        output=output,
        x=x,
        y=y,
        x_log_scale_base=x_log_scale_base,
        y_log_scale_base=y_log_scale_base,
    )
    print(command)
    R(command)

示例#5

0

显示文件

def summmarized_plot(input, output, x, y1, y2, y1_op="sd", y2_op="mean", x_log_scale_base=None, y_log_scale_base=None, remove_na=False):
    """Build and run an R/ggplot2 script plotting two per-group summaries.

    The script reads ``input`` with ``read.delim``, summarizes it per value of
    ``x`` with ``plyr::ddply`` (``y1_op`` applied to ``y1`` and ``y2_op``
    applied to ``y2``), stacks both summaries into one long data frame and
    draws them as two colored point-and-line series. The figure is saved to
    ``output`` with ``ggsave``.

    Args:
        input: path of the table to read.
        output: path of the image to write.
        x: name of the grouping column, shown on the x axis.
        y1: column summarized with ``y1_op``.
        y2: column summarized with ``y2_op``.
        y1_op: name of the R function applied to ``y1``.
        y2_op: name of the R function applied to ``y2``.
        x_log_scale_base: when not None, the x axis uses a ``log<base>`` transform.
        y_log_scale_base: when not None, the y axis uses a ``log<base>`` transform.
        remove_na: drop incomplete rows before summarizing.

    The generated script is printed before it is handed to ``R``.
    """
    # One log-scaled axis; %(axis)s becomes "x" or "y" here, the base itself
    # is filled in by the final str.format pass.
    log_axis_template = """
        g <- g + scale_%(axis)s_continuous(
            trans = 'log{%(axis)s_log_scale_base}',
            breaks = pretty_breaks()
            # breaks = trans_breaks('log{%(axis)s_log_scale_base}', function(x) {%(axis)s_log_scale_base}^(x/2)),
            # labels = trans_format('log{%(axis)s_log_scale_base}', math_format({%(axis)s_log_scale_base}^.x))
            )
        """

    fragments = ["""
    library(ggplot2)
    library(scales)
    library(plyr)
    d <- read.delim("{input}", header=T)
    """]

    if remove_na:
        fragments.append("""
        d <- d[complete.cases(d),]
        """)

    # Both long-format frames (d1, d2) use the y1 column name for their value
    # column so that they can be row-bound into d_merged.
    fragments.append("""
    d_stat <- ddply(d, .({x}), summarize, "{y1_op}_{y1}"={y1_op}({y1}), "{y2_op}_{y2}"={y2_op}({y2}))
    d1 <- data.frame(d_stat${x}, d_stat${y1_op}_{y1}, "{y1_op}_{y1}")
    colnames(d1) <- c("{x}", "{y1}", "group")
    d2 <- data.frame(d_stat${x}, d_stat${y2_op}_{y2}, "{y2_op}_{y2}")
    colnames(d2) <- c("{x}", "{y1}", "group")
    d_merged <- rbind(d1, d2)
    g <- ggplot(d_merged, aes(x={x}, y={y1}, group=group, color=group))
    g <- g + geom_point()
    g <- g + geom_line()
    """)

    # x axis first, then y axis, each only when a base was requested.
    for axis, base in (("x", x_log_scale_base), ("y", y_log_scale_base)):
        if base is not None:
            fragments.append(log_axis_template % {"axis": axis})

    fragments.append("""
    g <- g + ylab(NULL)
    g <- g + scale_color_hue(name="", labels=c({y1_op}_{y1}="{y1_op}({y1})", {y2_op}_{y2}="{y2_op}({y2})"))
    """)

    fragments.append("""
    pwd <- getwd()
    setwd(dirname("{output}"))
    ggsave(file=basename("{output}"), plot=g)
    setwd(pwd)
    """)

    command = "".join(fragments).format(
        input=input,
        output=output,
        x=x,
        y1=y1,
        y2=y2,
        y1_op=y1_op,
        y2_op=y2_op,
        x_log_scale_base=x_log_scale_base,
        y_log_scale_base=y_log_scale_base,
    )
    print(command)
    R(command)

示例#6

0

显示文件

文件： rnaseq_wf_v2.py 项目： snijesh/RNASeqPipeline

 # Merge the per-sample count tables into one matrix, filter rows by count,
 # plot the density of the filtered counts and write TMM-normalized CPM values.
 R("""
 library('reshape')
 library('ggplot2')
 library('edgeR')
 setwd("{DIR}")
 myfiles=as.character(unlist(strsplit("{input.files}", split=" ")))
 res=read.delim(myfiles[1],header=T)
 colnames(res)[1]="gene"
 colnames(res)[2]=as.character(myfiles[1])
 # remove the last 5 statistics lines ...
 # nr=dim(res)[1]
 # res=res[-c((nr-4):nr),]
 #
 # Merge the remaining files, if any. seq_len(n)[-1] is empty for a single
 # file, whereas seq(2, 1, by = 1) stops with a wrong-sign error.
 for(i in seq_len(length(myfiles))[-1])
 {{
 temp=read.delim(myfiles[i],header=T)
 colnames(temp)[1]="gene"
 colnames(temp)[2]=as.character(myfiles[i])
 res=merge(res,temp)
 }}
 write.table(as.data.frame(res),file="RawCountFile.txt",sep="\t",row.names=F)
 #
 mydata=read.delim("RawCountFile.txt",row.names=1)
 val1=as.numeric("{MINCOUNT}")
 val2=as.numeric("{MINSAMPLES}")
 cat(val1," ", val2, "checking..\n",file="check.txt")
 # keep rows with more than val1 counts in at least val2 columns
 filter <- apply(mydata, 1, function(x) length(x[x>val1])>=val2)
 res=mydata[filter,]
 write.table(as.data.frame(res),file="RawCountFile_filtered.txt",sep="\t",col.names=NA)
 png("HistBeforenormFilter.png")
 df.m <- melt(as.data.frame(res))
 print(ggplot(df.m) + geom_density(aes(x = value, colour = variable)) + labs(x = NULL) + theme(legend.position='top') + scale_x_log10())
 dev.off()
 y = DGEList(counts=res)
 ## Normalization TMM ------------------------------------------------------------
 ## method = =c("TMM","RLE","upperquartile","none")
 y <- calcNormFactors(y,method="TMM")
 ndata= cpm(y,log=FALSE,normalized.lib.sizes=TRUE)
 ## save it
 write.table(ndata,file="CPM_TMM_counts.txt",sep="\t",col.names=NA)
 """)

示例#7

0

显示文件

            out.write("\t".join([",".join(seqList) for seqList in listTab.values()]))
            
        with open(output.counts, "w") as out:
            for name, seqList in listTab.items():
                out.write("%s\t%i\n" % (name, len(seqList)))
                
        with open(output.readOtuMap, "w") as out:
            for otu, seqList in listTab.items():
                for seq in seqList:
                    out.write("%s\t%s\n" % (seq, otu))

# Plot the OTU sizes in decreasing order (rank vs. size) as vertical segments.
rule plotOtuSizeDist:
    input:
        "{sample}.full.good.unique.abund.otus.counts.tsv"
    output:
        "{sample}.full.good.unique.abund.otus.sizeDist.pdf"
    run:
        # The three %s slots are: table to read, plot title prefix, file to save.
        r_script = (
            'library(ggplot2)\n'
            'd=read.table("%s")\n'
            'pd = as.data.frame(cbind(seq(1,length(d$V2)), d$V2[order(d$V2, decreasing=T)]))\n'
            'p=ggplot(pd) + geom_segment(aes(x=V1, xend=V1, y=V2, yend=0))'
            ' + xlab("OTU rank") + ylab("size")'
            ' + ggtitle("%s - OTU size distribution")\n'
            'ggsave("%s", p)'
        )
        R(r_script % (input[0], "{wildcards.sample}", output[0]))

# Run mothur's rarefaction.single on the OTU list file; the mothur executable
# is taken from config["mothur"].
rule computRarefaction:
    input:
        "{sample}.full.good.unique.abund.otus.list"
    output:
        "{sample}.full.good.unique.abund.otus.rarefaction"
    shell:
        '%(mothur)s "#rarefaction.single(list={input})"' % config
        
rule plotRarefaction:
    input: "{sample}.full.good.unique.abund.otus.rarefaction"
    output: "{sample}.full.good.unique.abund.otus.rarefaction.pdf"
    run:
        R("""library(ggplot2)
d=read.table("%s", header=T)

p = ggplot(d, aes(numsampled)) + geom_point(aes(y=X0.03), colour="blue", size=1) + geom_ribbon(aes(ymin=lci, ymax=hci), alpha=0.2) + xlab("number of reads sampled") + ylab("number of OTUs observed") + ggtitle("Rarefaction curve - %s")

示例#8

0

显示文件

# Count reads per gene with featureCounts, then derive a reduced table that
# keeps column 1 and columns 7 onwards, drops the first line and strips the
# "samples/bam/" prefix.
rule quantification_with_featureCounts:
        input:
                novel="samples/new_annotation/all_transcripts.gtf",
                bam=expand("samples/bam/{smp}.bam", smp=SAMPLES)
        output:
                "results/counts/gene_counts.txt",
                "results/counts/gene_counts_mini.txt"
        shell: """
        featureCounts -p -s 2 -T 15 -t exon -g gene_id -a {input.novel} -o {output[0]} {input.bam} &> {output[0]}.log
        cut -f 1,7- {output[0]}| awk 'NR > 1' | awk '{{gsub("samples/bam/","",$0); print}}'  > {output[1]}
        """

# Draw diagnostic plots (per-column histograms, a boxplot and a scatter-plot
# matrix) of the log2-transformed count table into a single PDF.
rule diagnostic_plot:
        input: "results/counts/gene_counts_mini.txt"
        output: "results/diagnostic_plot/diagnostic.pdf"
        run: R("""
            # Derive the directory from the output path instead of repeating it,
            # and stay quiet when it already exists.
            dir.create(dirname("{output}"), showWarnings=FALSE, recursive=TRUE)
            data <- read.table("{input}",
                                sep="\t",
                                header=T,
                                row.names=1)
            # drop rows that are zero everywhere, then log-transform
            data <- data[rowSums(data) > 0, ]
            data <- log2(data + 1)
            pdf("{output}")
            dev.null <- apply(data, 2, hist, border="white", col="blue")
            boxplot(data, color="blue", pch=16)
            pairs(data, pch=".", col="blue")
            dev.off()
            cat("etc...")

        """)

示例#9

0

显示文件

# Keep only the lines matching "TotWrong" or "count", minus those containing
# "ssue" or "earing".
rule limit_for_plot:
    input:  WORK + '{method}.eval_{dat}.{cols}.eval'
    output: WORK + '{method}.eval_{dat}.{cols}.totWrong'
    # The backslash is doubled so that grep still receives `\|` (alternation)
    # without relying on Python passing an invalid escape sequence through.
    shell:  "grep 'TotWrong\\|count' {input} | grep -v ssue | grep -v earing > {output}"

# Concatenate the per-dataset totWrong tables of one method/column set into a
# single tab-separated file.
rule cat:
    input:
        expand(
            WORK + '{{method}}.eval_{dat}.{{cols}}.totWrong',
            dat=('clinvar', 'denovo', 'clinvar_mult', 'clinvar_single', 'clinvar_exp')
        )
    output:
        o = WORK + 'totWrong/{method}.{cols}'
    run:
        frames = [pd.read_csv(path, sep='\t') for path in list(input)]
        merged = pd.concat(frames)
        merged.to_csv(output.o, index=False, sep='\t')
        
# Plot the concatenated totWrong table as a bar chart and save it as PNG.
rule plot:
    input:  WORK + 'totWrong/{method}.{cols}'
    output: DOCS + 'plot/{method}.other.{cols}.totWrong.png'
    run:
        # The R script draws var_count per score_type (one fill color per
        # score_type), one facet row per clinvar_type with free scales, x labels
        # rotated by 45 degrees and the legend hidden.
        R("""
          require(ggplot2)
          d = read.delim("{input}", sep='\t', header=TRUE)
          p = ggplot(data=d) +
          geom_col(aes(y=var_count,x=score_type, fill=score_type)) +
          facet_grid(clinvar_type~., scale='free') + theme_bw() +
          ylab('Wrong Predictions') +
          theme(axis.text.x = element_text(angle=45, hjust=1)) +
          xlab('') + theme(legend.position="none")
          ggsave("{output}", p)
          """)

# Aggregate target: request the totWrong plot of the 'global' method for every
# listed score-column combination.
rule all_eval:
    input:
        expand(
            DOCS + 'plot/{method}.other.{cols}.totWrong.png',
            method=('global',),
            cols=('mpc', 'revel', 'mpc-revel', 'ccr', 'mpc-revel-ccr', 'mpc-ccr', 'revel-ccr')
        )

示例#10

0

显示文件

                out.write("%s\t%i\tssu\t%s\t%i\n" % (oId, size, "\t".join([ssuCls[r] for r in ranks]), ssuDepth))
                out.write("%s\t%i\t**s\t%s\t%i\n" % (oId, size, "\t".join([itsCls[r] for r in ranks]), itsDepth))
                out.write("%s\t%i\tlsu\t%s\t%i\n" % (oId, size, "\t".join([lsuCls[r] for r in ranks]), lsuDepth))

rule clsSummary:
    """Collect summary stats of how many OTUs were assigned to each taxonomic
    rank, for the paper abstract."""
    input: "taxonomy/{sampleSet}_97_comb.stats.tsv"
    output: "taxonomy/{sampleSet}_97_clsStats.tsv"
    run:
        R("""
        d=read.table("{input}", sep="\t")
        colnames(d) = c("oId", "size", "mrk", "domain", "kingdom", "phylum", "class", "order", "family", "genus", "species", "depth")
        # maximum depth per OTU over all its rows, only for OTUs with size > 1
        a=aggregate(depth ~ oId, subset(d, size>1), max)
        ranks=c("kingdom", "phylum", "class", "order", "family", "genus", "species")
        # Empty accumulator with the same column names as the rows appended
        # below; the second column used to be misspelled as nubmer.
        s = data.frame(rank=character(0), number=numeric(0))
        for (i in seq_along(ranks)) {{
            # number of OTUs whose depth reaches at least the i-th rank
            s = rbind(s, data.frame(rank=ranks[i], number=sum(a$depth>=i)))
        }}
        write.table(s, "{output}", sep="\t", row.names=F)
        """)

rule plotClsComp:
    """Create plots of classifications depth"""
    input: all="taxonomy/{sampleSet}_97_comb.stats.tsv"
    output: depth="{sampleSet}_clsComp_depth.svg", depthFungi="{sampleSet}_clsComp_depth_fungi.svg", block="{sampleSet}_clsComp_basic.svg"
    run:
        R("""
        library(reshape2)
        library(ggplot2)