def plotFalsePositiveRates(infile, outfile):
    '''
    Barplot false positive and true positive rates across taxonomic levels.

    For each cutoff (0 and 1) two plots are saved next to ``outfile``:
    ``<prefix>.<cutoff>.specificity.pdf`` (fp_rate on the y axis) and
    ``<prefix>.<cutoff>.sensitivity.pdf`` (tp_rate on the y axis).
    ``outfile`` itself is only touched at the end.
    '''
    R('''library(ggplot2)''')
    R('''dat <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t")''' % infile)

    # (label used in the file name, column plotted on the y axis).
    # Both plots order the bars by fp_rate.
    panels = (("specificity", "fp_rate"), ("sensitivity", "tp_rate"))

    for cutoff in (0, 1):
        for label, column in panels:
            target = P.snip(outfile, ".pdf") + ".%i.%s.pdf" % (cutoff, label)
            R('''plot1 <- ggplot(dat[dat$cutoff == %i,], aes(x=reorder(level, fp_rate), y = %s, fill = track, stat = "identity"))''' % (cutoff, column))
            R('''plot2 <- plot1 + geom_bar(position = "dodge", stat="identity")''')
            R('''plot2 + scale_fill_manual(values = c("cadetblue", "slategray", "lightblue"))''')
            # ggsave() without a plot argument saves ggplot2's last plot
            R('''ggsave("%s")''' % target)

    P.touch(outfile)
def computeExpressionLevels(infiles, outfiles):
    '''normalize data using the gcrma library.

    Writes two files: the R objects (``gcrma.eset`` and ``raw.data``)
    saved with R's ``save`` and a human readable tab-separated table
    with one row per probeset and one column per array.

    ``infiles`` is not read here; ``ReadAffy()`` is called without
    arguments.  ``outfiles`` is a pair ``(outfile_r, outfile_table)``.
    The normalization method is taken from
    ``PARAMS["normalization_method"]``.
    '''
    outfile_r, outfile_table = outfiles

    R.library("simpleaffy")
    R.library("gcrma")

    E.info("reading data")
    R('''raw.data = ReadAffy()''')

    E.info("normalization")
    R('''gcrma.eset = call.exprs( raw.data, "%(normalization_method)s" )''' % PARAMS)

    E.info("saving data")
    R('''save( gcrma.eset, raw.data, file = "%s") ''' % outfile_r)

    expression = R('''as.list(assayData(gcrma.eset))''')['exprs']
    probesets, headers = R('''dimnames( assayData(gcrma.eset)$exprs )''')
    # strip the literal ".CEL" suffix; the dot is escaped so that it
    # does not match an arbitrary character
    headers = [re.sub(r"\.CEL", "", x) for x in headers]

    # the with-block guarantees the table is closed even if a write fails
    with open(outfile_table, "w") as outf:
        outf.write("probeset\t%s\n" % "\t".join(headers))
        for probeset, values in zip(probesets, expression):
            outf.write("%s\t%s\n" % (probeset, "\t".join(map(str, values))))
def buildPCAVarianceExplained(infile, outfile):
    '''
    Output the PCA variance explained.

    Sources ``microarray_utils.R`` from the directory configured as
    ``PARAMS["rdir"]``, runs the R function ``runPCA`` on ``infile`` and
    hands the result to the R function ``buildPCAVarianceExplained``
    together with ``outfile``.
    '''
    rdir = PARAMS.get("rdir")
    commands = (
        '''source("%s/microarray_utils.R")''' % rdir,
        '''pc.dat <- runPCA("%s")''' % infile,
        '''buildPCAVarianceExplained(pc.dat, "%s")''' % outfile,
    )
    for command in commands:
        R(command)
def getCorrelations(self, dataframe):
    '''
    Compute pair-wise correlations between samples from a dataframe
    of expression values.

    Arguments
    ---------
    dataframe: pandas.Core.DataFrame
      a dataframe with the columns ``sample_name``, ``transcript_id``
      and ``TPM``

    Returns
    -------
    corr_frame: pandas.Core.DataFrame
      the pair-wise correlation matrix across samples, as computed by
      R's ``cor`` with its default settings
    '''
    # one row per transcript, one column per sample
    by_transcript = dataframe.pivot(index="sample_name",
                                    columns="transcript_id",
                                    values="TPM").T

    # the correlation itself is delegated to R
    R.assign("p.df", py2ri.py2ri_pandasdataframe(by_transcript))
    R('''p.mat <- apply(p.df, 2, as.numeric)''')
    R('''cor.df <- cor(p.mat)''')

    return py2ri.ri2py_dataframe(R["cor.df"])
def buildPCAScores(infile, outfile):
    '''
    Output PCA scores - mainly for reporting.

    Sources ``microarray_utils.R`` from ``PARAMS["rdir"]``, runs the R
    function ``runPCA`` on ``infile`` and passes the result to the R
    function ``buildPCAScores`` together with ``outfile``.
    '''
    utils_script = '''source("%s/microarray_utils.R")''' % PARAMS.get("rdir")
    R(utils_script)

    run_pca = '''pc.dat <- runPCA("%s")''' % infile
    R(run_pca)

    write_scores = '''buildPCAScores(pc.dat, "%s")''' % outfile
    R(write_scores)
def plot_JS_EN_scatter_by_pairs(stats, output_file=None, pair=None, **kw):
    '''
    Scatter plot of the JS value of ``pair`` against the summed EN values
    of the members of ``pair``, one point per entry of ``stats``.

    ``stats`` maps a triad to a list of results; only ``r[0]['js']`` and
    ``r[0]['EN']`` of each result are read.  ``pair`` is used both as a
    key into ``r[0]['js']`` and as an iterable of keys into
    ``r[0]['EN']``, so the default of None will not work.

    The plot is saved to ``output_file`` when given; otherwise it is
    printed and the function waits for Enter.
    '''
    # An earlier revision derived ``pair`` from the two-element key with
    # the largest value in r[0]['js']; it now has to be supplied.
    results = [r for triad in stats for r in stats[triad]]
    x = [r[0]['js'][pair] for r in results]
    y = [sum(r[0]['EN'][t] for t in pair) for r in results]

    title = str(len(x)) + ' samples'
    if output_file:
        title = output_file + ', ' + title
    print(title)

    globalenv['df'] = qcrop([x], [y])

    pair_label = ' to '.join(pair)
    cmd = ''.join([
        'gg <- ggplot(df, aes(x,y)) + ',
        'geom_point(aes(xcrop, ycrop), alpha=0.2) + ',
        'stat_smooth(method="loess", color="white", size=1.5, alpha=0.2, se=FALSE) + ',
        'stat_smooth(method="loess", color="black") + ',
        'xlab("' + pair_label + ' JSD") + ',
        'ylab(bquote(.("' + pair_label + '") ~ d[ENS])) + coord_flip()',
    ])
    R(cmd)

    if not output_file:
        print(R['gg'])
        raw_input('Press Enter to continue...')
    else:
        R('ggsave("' + output_file + '", gg, width=5, height=5)')
def nmiConservationFisherTest(infile, outfile):
    '''Run the R function ``nmi_conservation`` on ``infile``.

    The function is defined in ``R/proj007/proj007.R`` below
    ``PARAMS["scriptsdir"]``; it receives ``infile`` and ``outfile`` as
    string arguments.
    '''
    values = {
        "scriptsdir": PARAMS["scriptsdir"],
        "infile": infile,
        "outfile": outfile,
    }
    R('''source("%(scriptsdir)s/R/proj007/proj007.R")''' % values)
    R('''nmi_conservation(infile="%(infile)s", outfile="%(outfile)s") ''' % values)
def plotMDS(infile, outfile):
    '''
    Perform multidimensional scaling of normalised counts.

    The table in ``infile`` is read into R, its ``taxa`` column becomes
    the row names, samples are sorted with ``mixedsort`` and the
    condition of each sample is parsed from its column name.  Classical
    MDS (``cmdscale``) is run on the distances between samples and the
    first two dimensions are plotted, coloured by condition, and saved
    to ``outfile``.
    '''
    # result is unused; the call is kept from the original implementation
    outname_matrix = P.snip(outfile, ".pdf") + ".tsv"

    for package in ("gtools", "ggplot2"):
        R('''library(%s)''' % package)

    R('''dat <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t")''' % infile)

    mds_script = '''rownames(dat) <- dat$taxa
    dat <- dat[,1:ncol(dat)-1]
    dat <- dat[, mixedsort(colnames(dat))]
    conds <- unlist(strsplit(colnames(dat), ".R[0-9].*"))[seq(1, ncol(dat)*2, 2)]
    conds <- unlist(strsplit(conds, ".", fixed = T))[seq(2, length(conds)*2, 2)]
    dat <- as.matrix(t(dat))
    dist <- dist(dat)
    ord1 <- cmdscale(dist)
    ord2 <- as.data.frame(ord1)
    ord2$cond <- conds
    plot1 <- ggplot(ord2, aes(x = V1, y = V2, colour = cond))
    plot2 <- plot1 + geom_point(size = 3)
    cols <- rainbow(length(unique(conds)))
    plot3 <- plot2 + scale_colour_manual(values = c(cols))
    ggsave("%s")'''
    R(mds_script % outfile)
def plot_JS_EN_scatter_by_pairs(stats, output_file=None, pair=None, **kw):
    '''
    Scatter plot of the summed EN values of ``pair`` against the
    paralinear distance of ``pair``, one point per entry of ``stats``.

    For every result ``r`` the paralinear distances are obtained from
    ``get_paralinear_distances(r[0]['gene'], **kw)``.  Results whose
    paralinear distance for ``pair`` is falsy are skipped.  Summary
    statistics are printed for EN from ``r[0]`` against the paralinear
    distance and for EN from ``r[0]`` against EN from ``r[1]``.

    The plot is saved to ``output_file`` when given; otherwise it is
    printed and the function waits for Enter.
    '''
    first_en = []
    paralinear = []
    second_en = []
    for triad in stats:
        for result in stats[triad]:
            distances = get_paralinear_distances(result[0]['gene'], **kw)
            en_first = sum(result[0]['EN'][t] for t in pair)
            en_second = sum(result[1]['EN'][t] for t in pair)
            distance = distances[pair]
            if not distance:
                continue
            first_en.append(en_first)
            paralinear.append(distance)
            second_en.append(en_second)

    print('paralinear stats')
    print_stats(first_en, paralinear)
    print('GTR stats')
    print_stats(first_en, second_en)

    globalenv['df'] = DataFrame({'x': FloatVector(first_en),
                                 'y': FloatVector(paralinear)})

    pair_label = ' to '.join(pair)
    cmd = ''.join([
        'gg <- ggplot(df, aes(x, y)) + geom_point(alpha=0.2) + ',
        'geom_abline(intercept=0, slope=1, color="white") + ',
        'xlab(bquote(.("' + pair_label + '") ~ d[ENS])) + ',
        'ylab(bquote(.("' + pair_label + '") ~ d[para])) + ',
        'coord_cartesian(xlim=c(0,1), ylim=c(0,1))',
    ])
    R(cmd)

    if not output_file:
        print(R['gg'])
        raw_input('Press Enter to continue...')
    else:
        R('ggsave("' + output_file + '", gg, width=5, height=5)')
def get_exons(mart):
    """Queries a Mart object to find all exons of its dataset attribute.

    Forms a specific getBM query that is sent to the BioMart API to retrieve
    information about the exons (and their exonic coordinates) of a specific
    Dataset. The output is then transformed via the GRanges Bioconductor
    package and seqnames converted to UCSC standard.

    Args:
        mart: an rpy2-converted biomaRt Mart object.

    Returns:
        An rpy2 DataFrame containing a table of relevant exon information.
        DataFrame column headers are:
        ["seqnames", "start", "end", "width", "strand"]
    """
    exons = R.getBM(attributes=StrVector(("chromosome_name",
                                          "exon_chrom_start",
                                          "exon_chrom_end",
                                          "strand")),
                    mart=mart)

    # Translate the strand column element by element.  Comparing the whole
    # R vector to a string is always False and labelled every exon '-'.
    # NOTE(review): assumes BioMart encodes strand as 1 / -1; anything
    # else (e.g. NA) is mapped to '*', GRanges' "unknown strand".
    strand_symbols = {1: '+', -1: '-'}
    strand = StrVector([strand_symbols.get(int(value), '*')
                        for value in exons.rx2('strand')])

    exons_ranges = R.GRanges(
        seqnames=exons.rx2('chromosome_name'),
        ranges=R.IRanges(start=exons.rx2('exon_chrom_start'),
                         end=exons.rx2('exon_chrom_end')),
        strand=strand)

    # Replacement functions have to be fetched by their backticked name.
    # https://stackoverflow.com/questions/38806898/
    set_method = R("`seqlevelsStyle<-`")
    exons_ranges = set_method(exons_ranges, "UCSC")

    as_data_frame = R("function(x) as.data.frame(x)")
    return as_data_frame(exons_ranges)
def buildProbe2GeneMap(infile, outfile, PARAMS, platform="affy"):
    '''
    Build a file mapping probe id to gene id.

    For ``platform == "affy"`` the probes of the arrays read by
    ``ReadAffy()`` are mapped to gene names through biomaRt, using
    ``PARAMS["affy_array"]`` as attribute/filter and
    ``PARAMS["affy_dataset"]`` as dataset.  Probes without a gene
    assignment and probes assigned to more than one gene (cross-hyb)
    are removed before writing ``outfile``.

    For any other platform the mapping is taken from the ``TargetID``
    column of the file read by limma's ``read.ilmn`` from ``infile``.
    '''
    if platform == "affy":
        array = PARAMS.get("affy_array")
        dataset = PARAMS.get("affy_dataset")

        R('''library("biomaRt")''')
        R('''library("affy")''')
        R('''dat <- ReadAffy()''')

        E.info("getting probes")
        R('''probes <- featureNames(dat)''')

        E.info("getting mart")
        R('''mart <- useMart("ensembl", dataset = "%s")''' % dataset)

        # matches to hgnc symbol - this might not be appropriate for
        # mouse data...
        E.info("mapping probes to gene")
        R('''probe2gene <- getBM(attributes = c("%s", "external_gene_name"), filters = "%s", values = probes, mart = mart)''' % (array, array))
        R('''colnames(probe2gene) <- c("probe", "gene")''')
        R('''probe2gene$gene <- toupper(probe2gene$gene)''')

        # remove probes that have no gene assignment (i.e returned ""
        # from biomaRt) and those with multiple gene assignments -
        # cross-hyb
        temp = P.getTempFile(".")
        E.info("writing temp file")
        R('''write.table(probe2gene, file = "%s", sep = "\t", row.names = F)''' % temp.name)
        temp.close()

        E.info("filtering probes")
        counts = collections.defaultdict(int)
        probe2gene = {}
        order = []  # probes in first-seen order, for deterministic output
        try:
            with open(temp.name) as inf:
                header = inf.readline()
                for line in inf:
                    data = line.rstrip("\n").split("\t")
                    if len(data) < 2:
                        continue
                    probe, gene = data[0], data[1]
                    # write.table quotes strings, so an empty gene is '""'
                    if gene.strip('"') == '':
                        continue
                    if probe not in probe2gene:
                        order.append(probe)
                    probe2gene[probe] = gene
                    counts[probe] += 1
        finally:
            # the temp file is removed even if parsing fails
            os.unlink(temp.name)

        with open(outfile, "w") as outf:
            outf.write(header)
            for probe in order:
                # keep only probes that map to exactly one gene
                if counts[probe] == 1:
                    outf.write("%s\t%s\n" % (probe, probe2gene[probe]))
    else:
        R('''
        library(limma)
        # read in data - maintain detection p-values for bg correction
        dat <- read.ilmn(files = "%s", other.columns = "Detection")
        probe2gene <- data.frame("probe" = rownames(dat), "gene" = dat$genes$TargetID)
        write.table(probe2gene, file = "%s", row.names = F, sep = "\t")
        ''' % (infile, outfile))
def plot_bar(stats, output_file=None, **kw):
    '''
    Boxplot goodness-of-fit p-values per model, faceted by dataset.

    ``stats`` maps a triad to a list of results, each result being a
    sequence of per-model records with the keys ``name``, ``with_rate``,
    ``gs_p`` and ``from_directory``.  Results are grouped by the
    ``from_directory`` of their first record; the dataset label
    (Nuclear / Mitochondrial / Microbial) is derived from the path
    components of that directory.  For every dataset and model the
    fraction of p-values above 0.05 is printed.

    The plot is saved to ``output_file`` when given; otherwise it is
    printed and the function waits for Enter.
    '''
    first_result = list(stats.values())[0][0]
    names = [r['name'] + ('+Gamma' if r['with_rate'] else '')
             for r in first_result]

    by_dir = defaultdict(list)
    for triad in stats:
        for r in stats[triad]:
            by_dir[r[0]['from_directory']].append(r)

    # transpose: one tuple of p-values per model instead of per result
    for d in by_dir:
        by_dir[d] = list(zip(*[[gs_p(_r['gs_p']) for _r in r]
                               for r in by_dir[d]]))

    runs = []
    g_stats = []
    data = []
    alpha = 0
    for d, per_model in by_dir.items():
        components = d.split('/')
        if 'exons' in components:
            dataset = 'Nuclear'
        elif 'mtDNA' in components:
            dataset = 'Mitochondrial'
        else:
            dataset = 'Microbial'
        print(dataset)

        for j, g in enumerate(per_model):
            g_stats += g
            data += [dataset] * len(g)
            runs += [j] * len(g)
            n_pass = sum(1 for _g in g if _g > 0.05)
            # float() forces true division; plain "/" on two ints
            # truncates to 0 or 1 under Python 2
            print('%s %s' % (names[j], float(n_pass) / len(g)))
            alpha = max(alpha, get_alpha(g))
        if per_model:
            # every model has the same number of samples in a dataset
            print('Samples %s' % len(per_model[-1]))

    labels = 'expression(' + ','.join(names) + ')'

    df = DataFrame({
        'run': IntVector(runs),
        'g_stat': FloatVector(g_stats),
        'data': StrVector(data)
    })
    globalenv['df'] = df
    R('library(scales)')
    cmd = 'gg <- ggplot(df, aes(factor(run), g_stat)) + ' + \
          'ylab("Goodness-of-Fit p-value") + xlab("Model") + ' + \
          'geom_boxplot(outlier.size=1, outlier.colour=alpha("black",' + str(alpha) + ')) + ' + \
          'scale_x_discrete(labels=' + labels + ') + ' + \
          'theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5)) + ' + \
          'facet_grid(. ~ data)'
    R(cmd)

    if output_file:
        R('ggsave("' + output_file + '", gg, width=5, height=5)')
    else:
        print(R['gg'])
        raw_input('Press Enter to continue...')
def covarFilter(infile, time_points, replicates, quantile):
    '''
    Filter the gene list on the distribution of the summed covariance
    of each gene.

    A gene is kept when its summed absolute covariance exceeds both the
    observed and the expected value at position ``quantile`` of R's
    ``quantile()`` output.  Reducing the gene list this way is highly
    recommended before dynamic time warping clustering, to cut
    computational time.

    ``time_points`` is sorted in place.  Returns the transposed pandas
    data frame of the retained genes plus the ``times`` and
    ``replicates`` columns.
    '''
    time_points.sort()
    combinations = list(itertools.product(time_points, replicates))
    time_cond = ro.StrVector([time for time, _ in combinations])
    rep_cond = ro.StrVector([rep for _, rep in combinations])

    df = pd.read_table(infile, sep="\t", header=0, index_col=0)
    df = df.drop(['replicates'], axis=1)
    df = df.drop(['times'], axis=1)
    df = df.fillna(0.0)

    # py2ri conversion requires the pandas converter to be activated
    pandas2ri.activate()
    R.assign('diff_data', pandas2ri.py2ri(df))

    E.info("loading data frame")

    # need to be careful about column headers and transposing data frames
    R('''trans_data <- data.frame(diff_data)''')
    R('''times <- c(%s)''' % time_cond.r_repr())
    R('''replicates <- c(%s)''' % rep_cond.r_repr())

    # covariance matrix over all genes, then the per-gene sum
    E.info("calculating sum of covariance of expression")
    R('''covar.mat <- abs(cov(trans_data))''')
    R('''sum.covar <- rowSums(covar.mat)''')
    R('''exp.covar <- abs(qnorm(ppoints(sum.covar),mean=mean(sum.covar), sd=sd(sum.covar)))''')
    R('''sum.covar.quant <- quantile(sum.covar)''')
    R('''exp.covar.quant <- quantile(exp.covar)''')

    E.info("filter on quantile")
    threshold = {"quantile": quantile}
    R('''filtered_genes <- names(sum.covar[sum.covar > sum.covar.quant[%(quantile)i] & sum.covar > exp.covar.quant[%(quantile)i]])''' % threshold)
    R('''filtered_frame <- data.frame(diff_data[, filtered_genes],times, replicates)''')

    # bring the result back as a pandas object
    return pandas2ri.ri2py(R["filtered_frame"]).T
def Rconnect():
    '''
    Connect to the SQLite database ``PARAMS["database_name"]`` through R.

    Loads the R packages RSQLite and sciplot, opens the connection as
    the R variable ``con`` and returns the result of evaluating ``con``.
    '''
    for package in ("RSQLite", "sciplot"):
        R('''library("%s")''' % package)
    R('''drv <- dbDriver("SQLite")''')

    database = PARAMS["database_name"]
    R('''con <- dbConnect(drv, dbname = "%s") ''' % database)
    connection = R('''con''')
    return connection
def plotFigure1cGCContent(infiles, outfiles):
    '''Figure 1c: density plots of GC content.

    Calls the R function ``speciesPlot`` (from ``R/proj007/proj007.R``
    below ``PARAMS["scriptsdir"]``) twice on the directory of the first
    input file: once for the CAPseq plot (plotcol 2) and once for the
    control plot (plotcol 3).  Both calls use the same file pattern.
    ``outfiles`` is the pair ``(capseq_out, control_out)``.
    '''
    capseq_out, control_out = outfiles
    indir = os.path.dirname(infiles[0])
    R('''source("%s/R/proj007/proj007.R") ''' % PARAMS["scriptsdir"])

    call = ('''speciesPlot(dir="%(indir)s", '''
            '''pattern="*testes-cap.replicated.gc.export", '''
            '''main="%(main)s", xlab="GC Content", '''
            '''filename="%(filename)s", plotcol=%(plotcol)i, '''
            '''xlimit=c(0,1), ylimit=c(0,15))''')
    panels = (
        ("Testes CAPseq", capseq_out, 2),
        ("Testes Control", control_out, 3),
    )
    for main, filename, plotcol in panels:
        R(call % {"indir": indir, "main": main,
                  "filename": filename, "plotcol": plotcol})
def load_stone_in_sling(path_sling, stone_name, exts=('', '.py', '.ipynb')):
    """Load the callable ``stone_name`` from the sling file ``path_sling``.

    If ``path_sling`` does not exist as given, it is looked up below
    ``CONFIG['PATH_SLINGS']`` with each extension in ``exts`` appended,
    and the first existing candidate is used.

    Supported sling types:

    * ``.py``    - the file is loaded as a module named ``sling`` and the
      attribute ``stone_name`` is returned.
    * ``.ipynb`` - the notebook is loaded with ``nbimporter`` and the
      attribute ``stone_name`` is returned.
    * ``.R``     - the file is evaluated in R (after loading RJSONIO) and
      a wrapper is returned that calls the resulting R function and
      converts its result with ``rconvert``.

    Returns None (after printing a message) when an argument is missing
    or the sling cannot be found, and None for any other file type.
    """
    if not path_sling or not stone_name:
        print('!! sling or stone not specified')
        return None

    if not os.path.exists(path_sling):
        for ext in exts:
            candidate = os.path.join(CONFIG['PATH_SLINGS'], path_sling + ext)
            if os.path.exists(candidate):
                path_sling = candidate
                break
        else:
            print("!!", path_sling, "does not exist")
            return None

    if path_sling.endswith('.py'):
        # Only the import is guarded: an ImportError raised while the
        # sling itself executes must surface, not trigger the fallback.
        try:
            import importlib.util
        except ImportError:
            import imp
            sling = imp.load_source('sling', path_sling)
        else:
            spec = importlib.util.spec_from_file_location("sling", path_sling)
            sling = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(sling)
        return getattr(sling, stone_name)

    if path_sling.endswith('.ipynb'):
        import nbimporter
        nbimporter.options['only_defs'] = CONFIG.get('NBIMPORTER_ONLY_DEFS', False)
        ppath, pfn = os.path.split(path_sling)
        pname, pext = os.path.splitext(pfn)
        loader = nbimporter.NotebookLoader(path=[ppath])
        sling = loader.load_module(pname)
        return getattr(sling, stone_name)

    if path_sling.endswith('.R'):
        from rpy2.robjects import r as R
        # load all source; the value of the script is used as the function
        with open(path_sling) as f:
            code = f.read()
        R('library(RJSONIO)')
        rfunc = R(code)

        def stone(_path):
            return rconvert(rfunc(_path))
        return stone

    return None
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''import the KEGG annotations from the R KEGG.db annotations
    package.

    Note that since KEGG is no longer publicly available, this is not
    up-to-date and may be removed from bioconductor in future releases.

    Entrez gene ids are mapped to ensembl gene ids through biomart
    (``mart``, ``host``, ``biomart_dataset``).  For every entrez gene in
    ``KEGGEXTID2PATHID`` that has an ensembl id, one line per pathway
    is written to ``outfile`` with the columns ontology, gene_id,
    kegg_ID, kegg_name and evidence.  Gene names that are not integers
    are skipped.
    '''
    R.library("KEGG.db")

    E.info("getting entrez to ensembl mapping ...")
    entrez2ensembl = PipelineBiomart.biomart_iterator(
        ("ensembl_gene_id", "entrezgene"),
        biomart=mart,
        dataset=biomart_dataset,
        host=host,
        path="/biomart/martservice")
    entrez2ensembl = dict((x['entrezgene'], x['ensembl_gene_id'])
                          for x in entrez2ensembl)
    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    outf = IOTools.openFile(outfile, "w")
    # try/finally so the output is flushed and closed on every path
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        # rx2 did not work in rpy2 2.4.2 - workaround uses
        # absolute indices
        for gene_column, gene in enumerate(entrez2path.names):
            try:
                gene = int(gene)
            except ValueError:
                continue

            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path[gene_column]:
                # the numeric part of the pathway id keys the name table
                pathid = re.match("[a-z]+([0-9]+)", pathway).groups()[0]
                pathname = pathid2name[pathid]
                outf.write("\t".join(
                    ["kegg", ensid, str(pathway), pathname, "NA"]) + "\n")
    finally:
        outf.close()
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    ''' import the KEGG annotations from the R KEGG.db
    annotations package.

    Note that since KEGG is no longer publicly available, this is not
    up-to-date and may be removed from bioconductor in future releases.

    Entrez gene ids are mapped to ensembl gene ids with biomaRt
    (``useMart`` / ``getBM``).  For every entrez gene in
    ``KEGGEXTID2PATHID`` that has an ensembl id, one line per pathway
    is written to ``outfile`` with the columns ontology, gene_id,
    kegg_ID, kegg_name and evidence.  Gene names that are not integers
    are skipped.
    '''
    R.library("KEGG.db")
    R.library("biomaRt")

    E.info("getting entrez to ensembl mapping ...")
    mart = R.useMart(biomart=mart,
                     host=host,
                     path="/biomart/martservice",
                     dataset=biomart_dataset)

    entrez2ensembl = R.getBM(
        attributes=ro.StrVector(["ensembl_gene_id", "entrezgene"]),
        mart=mart)
    entrez = entrez2ensembl.rx2("entrezgene")
    ensembl = entrez2ensembl.rx2("ensembl_gene_id")
    entrez2ensembl = dict(zip(entrez, ensembl))
    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    outf = IOTools.openFile(outfile, "w")
    # try/finally so the output is flushed and closed on every path
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        for gene in entrez2path.names:
            try:
                gene = int(gene)
            except ValueError:
                continue

            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path.rx2(str(gene)):
                # the numeric part of the pathway id keys the name table
                pathid = re.match("[a-z]+([0-9]+)", pathway).groups()[0]
                pathname = pathid2name[pathid]
                outf.write("\t".join(
                    ["kegg", ensid, str(pathway), pathname, "NA"]) + "\n")
    finally:
        outf.close()
def runPCA(infile, outfile, rownames=1):
    '''
    Run principal components analysis on a normalised matrix.

    ``infile`` is read as a tab-separated table whose column number
    ``rownames`` holds the row names.  ``prcomp`` is run on the
    transposed matrix.  The scores are written to ``outfile`` with a
    leading ``sample`` column, and the second row of the ``prcomp``
    importance summary is written to ``<outfile minus .tsv>.ve.tsv``.
    '''
    # read in and format data
    read_cmd = '''dat <- read.csv("%s", header=T, stringsAsFactors=F, sep="\t", row.names=%i)''' % (infile, rownames)
    R(read_cmd)

    # PCA on the transposed matrix
    R('''pc.dat <- prcomp(as.matrix(t(dat)))''')

    # scores, with the sample name moved to the first column
    for statement in (
            '''pc.dat.scores <- data.frame(pc.dat$x)''',
            '''pc.dat.scores$sample <- rownames(pc.dat.scores)''',
            '''pc.dat.scores <- pc.dat.scores[, c("sample", colnames(pc.dat.scores)[1:ncol(pc.dat.scores)-1])]'''):
        R(statement)
    R('''write.table(pc.dat.scores, file="%s", sep="\t", quote=F, row.names=F)''' % outfile)

    # variance explained
    variance_file = P.snip(outfile, ".tsv") + ".ve.tsv"
    R('''ve <- data.frame(summary(pc.dat)$importance)''')
    R('''ve <- ve[2,]''')
    R('''write.table(ve, file="%s", sep="\t", quote=F, row.names=F)''' % variance_file)
def buildLcaProportionsAcrossSamples(infile, outfile, dtype="pathway"):
    '''
    Build the proportion of reads mapped to each taxonomic level per
    sample.

    ``infile`` is read as a tab-separated table with row names in the
    first column.  With ``dtype == "pathway"`` rows are first summed per
    ``taxa`` value.  Every column is then divided by its column sum and
    the result, with a trailing ``taxa`` column, is written to
    ``outfile``.
    '''
    R('''library(dplyr)''')
    R('''dat <- read.csv( "%s", header = T, stringsAsFactors = F, sep = "\t", row.names=1 )''' % infile)

    if dtype != "pathway":
        R('''dat <- dat''')
    else:
        # collapse rows that share a taxa label, then use it as row name
        R('''dat <- data.frame(dat %>% group_by(taxa) %>% summarise_each(funs(sum)))''')
        R('''rownames(dat) <- dat$taxa''')
        R('''dat <- dat[,2:ncol(dat)]''')

    # column-wise proportions
    R('''dat.t <- data.frame(sweep(as.matrix(dat), 2, colSums(dat), "/"))''')
    R('''dat.t$taxa <- rownames(dat.t)''')

    write_cmd = '''write.table(dat.t, file = "%s", sep = "\t", quote=F, row.names = F)''' % outfile
    R(write_cmd)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The last command line argument is taken as the input bigWig file.
    It is imported into R with rtracklayer and passed to the R function
    ``wiggle2Hilbert`` (sourced from the ``R`` directory next to this
    script's parent directory), which writes its images to the
    directory given by ``--images-dir``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--images-dir", dest="images_dir", type="string",
                      help="directory to save hilbert curves image files to")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    infile = argv[-1]

    # NOTE(review): assumes infile looks like "<dir>/<pref>.<x>-<spec>...";
    # only the second "/"-separated component is inspected.
    filename = infile.split("/")[1]
    pref = filename.split(".")[0]
    spec = filename.split(".")[1].split("-")[1]
    header = "%s-%s" % (pref, spec)

    image_dir = options.images_dir
    if not os.path.exists(image_dir):
        os.mkdir(image_dir)

    # set path for R scripts to source
    lib_dir = os.path.dirname(__file__)
    root_dir = os.path.dirname(lib_dir)
    r_dir = os.path.join(root_dir, "R")

    # fail early if the R scripts directory is not present; asserting on
    # the (always non-empty) path string never checked anything
    if not os.path.isdir(r_dir):
        raise IOError("R scripts directory not found: %s" % r_dir)

    R('''suppressPackageStartupMessages(library(rtracklayer))''')
    R('''data.rle <- rtracklayer::import.bw(con="%(infile)s", '''
      '''as="Rle")''' % {"infile": infile})
    R('''source("%(r_dir)s/wiggle2hilbert.R")''' % {"r_dir": r_dir})
    R('''wiggle2Hilbert(wiggleRle=data.rle, '''
      '''image.dir="%(image_dir)s", datName="%(header)s")'''
      % {"image_dir": image_dir, "header": header})

    # write footer and output benchmark information.
    E.Stop()
def getRPackageList():
    '''Return a dictionary of installed R packages mapping to their
    version.

    Both lookups query ``installed.packages`` in R.  The first selects
    the Package and Version columns, the second only Version; the two
    results are paired positionally with ``zip``.
    '''
    # NOTE(review): the first query returns two columns; the pairing
    # relies on iteration yielding the Package entries first - confirm.
    packages = R('''installed.packages(
        fields=c("Package", "Version"))[,c("Package", "Version")]
    ''')
    versions = R('''installed.packages(
        fields=c("Package", "Version"))[,c("Version")]
    ''')

    mapping = {}
    for package, version in zip(packages, versions):
        mapping[package] = version
    return mapping
def plotCoverageHistogram(infile, outfile):
    '''
    Plot the coverage over kmers.

    The statistics table is expected at ``<infile minus
    .contigs.fa>.stats.txt``; the histogram of ``short1_cov`` weighted
    by ``lgth`` is written to the same path with ``.pdf`` in place of
    ``.txt``.  ``outfile`` is not used.
    '''
    stats_table = P.snip(infile, ".contigs.fa") + ".stats.txt"
    pdf_name = P.snip(stats_table, ".txt") + ".pdf"

    R('''library(plotrix)''')
    R('''data = read.table("%s", header=TRUE)''' % stats_table)
    R('''pdf("%s", height = 7, width = 7 )''' % pdf_name)
    R('''weighted.hist(data$short1_cov, data$lgth, breaks=seq(0, 200, by=1))''')

    close_device = R["dev.off"]
    close_device()
def get_data_table_by_id(id, cache):
    """Download a GEO record with GEOquery and return its table.

    ``id`` is passed to ``getGEO`` and ``cache`` is used as its
    ``destdir``.  The result of R's ``Table(data)`` is converted with
    ``pandas2ri.ri2py`` and returned.  The lower-cased first three
    characters of ``id`` are also assigned in R as ``id_type``.
    """
    r_variables = (
        ("id", id),
        ("id_type", id[0:3].lower()),
        ("cache", cache),
    )
    for name, value in r_variables:
        R.assign(name, value)

    R("""
    library(GEOquery)
    data = getGEO(id, destdir=cache)
    """)

    return pandas2ri.ri2py(R("Table(data)"))
def plotFilteredSamples(infiles, outfiles):
    '''Create a plot of the SNP profiles for each filtered sample.

    ``infiles`` is the pair ``(error_profile, otu_assignment)``.  The OTU
    of each sample is read from ``<otu_assignment minus .fasta>_up.txt``
    (first field up to ";" is the sample id, last field the OTU id).
    The plot is written to
    ``14_filter_sample_error_profiles.dir/<otu>_<sample>.pdf``;
    ``outfiles`` is not used.
    '''
    error_profile, otu_assignment = infiles
    otu_assignment = P.snip(otu_assignment, '.fasta') + '_up.txt'

    otu_dict = {}
    with open(otu_assignment) as handle:
        for row in handle:
            fields = row.split()
            if not fields:
                # tolerate blank lines instead of failing on them
                continue
            otu_dict[fields[0].split(';')[0]] = fields[-1]

    def _fetch_loci(infile):
        '''Return a Locus/Frequency data frame parsed from the first
        line of infile (tab-separated "locus,frequency" pairs).'''
        with open(infile) as handle:
            first_line = handle.readline()

        if not first_line:
            # Some samples have no snps... plot an all-zero profile
            L.warn('Sample %s has no SNPs' % infile)
            idx = list(range(1, 1501))
            snp = [0] * 1500
            df = pd.DataFrame([idx, snp]).transpose()
        else:
            df = pd.DataFrame(
                [x.split(',') for x in first_line.split('\t')])
        df.columns = ['Locus', 'Frequency']
        return df.applymap(float)

    sample_id = P.snip(error_profile, '_true_snps.tsv', strip_path=True)
    otu_id = otu_dict[sample_id]
    outfile = os.path.join('14_filter_sample_error_profiles.dir',
                           otu_id + '_' + sample_id + '.pdf')

    R('''rm(list=ls())''')
    R('''require('ggplot2')''')
    df = _fetch_loci(error_profile)
    R.assign('df', df)

    R('''require('ggplot2')
    pl <- ggplot(df, aes(x=Locus, xend=Locus, y=0, yend=Frequency)) + geom_segment()
    pl <- pl + theme_bw() + theme(panel.grid=element_blank())
    pl <- pl + xlim(0, 1500) + scale_y_continuous(expand=c(0,0), limits=c(0, 100))
    pl <- pl + xlab('Position Along 16S Gene') + ylab('Frequency (%%)')
    pl <- pl + ggtitle('%s\n%s')
    pdf('%s', height=3, width=5)
    plot(pl)
    dev.off()
    ''' % (otu_id, sample_id, outfile))
    R('''rm(list=ls())''')
def calculatePerSampleMeasurementError(infile, outfile):
    '''Calculate the measurement error across variable loci for all
    technical replicates of one sample.

    ``infile`` is a tab-separated table with loci as rows and replicates
    as columns; the sequencing depth of a replicate is the second
    "_"-separated field of its column name.  Nothing is written when
    the table has fewer than two replicate columns.  Otherwise
    ``outfile`` receives a header and one line with the sample id, the
    mean depth and the measurement error.
    '''
    R('''rm(list=ls())''')

    # square root of the second "Mean Sq" entry of the anova of
    # value ~ locus
    zeta_w = R('''
    zeta_w <- function(df, y="value", x="locus"){
        res = anova(lm(value ~ locus, data=df))
        return(sqrt(res[["Mean Sq"]][2]))
    }
    ''')

    # Hack... I forgot about samples with only a single locus:
    # square root of the mean per-row variance
    zeta_w2 = R('''
    zeta_w2 <- function(df){
        v = apply(df, 1, var)
        m = mean(v)
        return(sqrt(m))
    }
    ''')

    # Open the dataframe and check that there is more than one
    # measurement...
    df = pd.read_table(infile, sep='\t', index_col=0)
    if len(df.columns) < 2:
        return

    # Fetch the sequencing depths
    depths = [float(x.split('_')[1]) for x in df.columns]
    mean_depth = np.mean(depths)

    sample_id = P.snip(infile, '.tsv', strip_path=True)

    if len(df.index) == 1:
        L.warn('Sample %s has only one variable lcous' %
               os.path.basename(infile))
        m_error = zeta_w2(df)[0]
    else:
        # melt the dataframe
        df['locus'] = [str(x) for x in df.index]
        df = df.melt(id_vars='locus')
        # calculate the measurement error
        m_error = zeta_w(df)[0]

    # The file is opened only once the result exists, so a failure in R
    # does not leave a header-only output or an open handle behind.
    with open(outfile, 'w') as outf:
        outf.write('SampleID\tMeanDepth\tMeasurementError\n')
        outf.write('\t'.join(map(str, [sample_id, mean_depth, m_error])) + '\n')
def plotPathwayGenes(infile, outfile):
    '''
    Plot the genes that are differentially expressed and fall into
    pathways.

    One bar plot of mean l2fold per gene is written per pathway to
    ``pathways.dir/<track>.<pathway>.genes.pdf``, where ``track`` is the
    basename of ``infile`` without ".genes".  If ``infile`` holds only
    its header line, a placeholder pdf is written instead.  ``outfile``
    is touched in both cases.
    '''
    # R will not be able to plot anything if none of the
    # differentially expressed genes are associated
    # with a pathway. plot nothing if this is the case

    # colour of the pathways should associate with the
    # track that they come from

    # because the plots can get unwieldy with large
    # gene sets, if there are more than 10 genes
    # associated with a pathway then take the top 10
    # (by absolute l2fold)
    # This should be explained in the documentation
    col = random.sample(range(1, 600, 1), 1)[0]
    track = os.path.basename(infile).replace(".genes", "")

    with open(infile) as handle:
        n_lines = sum(1 for _ in handle)

    if n_lines == 1:
        # dev.off() closes the device so that the placeholder pdf is
        # actually completed on disk
        R('''pdf("%s")
        plot(c(0,1,2,3,4), c(0,1,2,3,4), cex = 0)
        text(2, y = 2, labels = "No genes were associated with pathways", cex = 1)
        dev.off()
        ''' % outfile.replace(".plots", ".pdf"))
        P.touch(outfile)
    else:
        # NB. size of plot should be proportional to the
        # number of genes in the pathways
        R('''
        library("ggplot2")
        dat <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t")
        pathways <- unique(dat$pathway)
        for (p in pathways){
            toPlot <- aggregate(l2fold~gene, dat[dat$pathway == p,], mean)
            if (regexpr("/", p)[1] != -1){
                # "/" in name not compatible with outfile names
                p <- sub("/", "|", p)}
            outf <- paste(paste("pathways.dir/", paste("%s", p, sep = "."), sep = ""), "genes.pdf", sep = ".")
            cols <- col2rgb(%i)
            col <- rgb(cols[1], cols[2], cols[3], maxColorValue = 255)
            toPlot$col <- col
            if (nrow(toPlot) > 10){
                toPlot <- toPlot[order(abs(toPlot$l2fold), decreasing = T),][1:10,]}
            plot1 <- ggplot(toPlot, aes(x = gene, y = l2fold, fill = col, stat = "identity")) + geom_bar(stat = "identity") + coord_flip() + scale_fill_manual(values = toPlot$col)
            plot1 + ggtitle(p) + theme(text = element_text(size = 40, color = "black"), axis.text = element_text(colour = "Black"))
            ggsave(file = outf, width = 11, height = nrow(toPlot), limitsize = F)
        }
        ''' % (infile, track, col))
        P.touch(outfile)
def MAPlot(infile, threshold_stat, p_threshold, fc_threshold, outfile):
    '''
    MA plot the results.

    Points are coloured by significance: a row is significant when its
    p-value column is below ``p_threshold`` and its absolute logFC is
    above ``fc_threshold``.  ``threshold_stat == "p"`` selects the
    ``P.Value`` column; any other value selects ``adj.P.Val``.  The plot
    is saved to ``outfile``.
    '''
    p = {"p": "P.Value"}.get(threshold_stat, "adj.P.Val")

    R('''library(ggplot2)''')
    R('''dat <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t")''' % infile)

    # repr() keeps the full precision of the thresholds; "%f" rounds to
    # six decimals and would turn e.g. 1e-8 into 0.000000
    R('''dat$sig <- ifelse(dat$%s < %s & abs(dat$logFC) > %s, 1, 0)''' % (
        p, repr(float(p_threshold)), repr(float(fc_threshold))))

    R('''a <- aes(x = AveExpr, y = logFC, colour = factor(sig))''')
    R('''plot1 <- ggplot(dat, a)''')
    R('''plot2 <- plot1 + geom_point(alpha = 0.5)''')
    R('''plot3 <- plot2 + scale_colour_manual(values = c("black", "blue"))''')
    R('''ggsave("%s")''' % outfile)
def __init__(self, y, spec_uGarch=None, nums_uGarch=None, DCC_order=None, out_of_sample=0, DCC_distribution='mvnorm'):
    '''Build the R multispec and dccspec objects and fit a DCC model to y.

    Parameters
    ----------
    y : pandas.DataFrame or array with a two-element ``shape`` (T, N)
        the data passed to R's ``dccfit``.
    spec_uGarch : list of univariate spec objects, optional
        each must expose ``R_name``; when None a single default
        ``uGarch_spec()`` is replicated for every column of ``y``.
    nums_uGarch : list of int, optional
        how many times each entry of ``spec_uGarch`` is replicated;
        must have the same length as ``spec_uGarch``.
    DCC_order : list, optional
        passed as ``dccOrder``; defaults to [1, 1].
    out_of_sample : int
        forwarded positionally to ``dccfit`` after the data.
    DCC_distribution : str
        passed as ``distribution`` to ``dccspec``.
    '''
    global id_uGarch_spec
    # every instance gets its own suffix for the R variable names
    id_uGarch_spec += 1
    [T, N] = y.shape

    # creates the default uGarch_spec for every column, if not provided
    if spec_uGarch is None:
        spec_uGarch = [uGarch_spec()]
        nums_uGarch = [y.shape[1]]

    assert len(spec_uGarch) == len(nums_uGarch)

    # creates the object multispec (resembles the R one, it contains the
    # multispecification for every Garch)
    str_vec_spec = ','.join(
        f'replicate({num},{spec.R_name})'
        for num, spec in zip(nums_uGarch, spec_uGarch))

    self.R_uGarch_multispec_name = 'uGarchmulti_spec' + str(id_uGarch_spec)
    R(f'{self.R_uGarch_multispec_name}=multispec(c({str_vec_spec}))')
    self.R_uGarch_multispec = R[self.R_uGarch_multispec_name]

    # creates the global DCC specification object (dccspec, in R)
    if DCC_order is None:
        DCC_order = [1, 1]
    str_DCC_order = mat2rSyntax(DCC_order)

    self.R_DCC_spec_name = 'DCC_spec' + str(id_uGarch_spec)
    self.R_DCC_spec = R(
        f'{self.R_DCC_spec_name} = dccspec(uspec = {self.R_uGarch_multispec_name}, dccOrder = {str_DCC_order}, distribution = \'{DCC_distribution}\')'
    )

    # fits the data with the DCC_spec
    self.R_DCCfit_name = 'DCC_fit' + str(id_uGarch_spec)
    R_DCCfit_func = R('dccfit')

    pandas2ri.activate()
    if isinstance(y, pd.DataFrame):
        R_y = pandas2ri.py2ri(y)
        self.fit = R_DCCfit_func(self.R_DCC_spec, R_y, out_of_sample)
    else:
        rpy2.robjects.numpy2ri.activate()
        self.fit = R_DCCfit_func(self.R_DCC_spec, y, out_of_sample)

    # creates the empty fields that will be extracted from the R fit object
    self.fit_rcor = None
    self._chol_vcv = None
    self.out_of_sample = out_of_sample

    # All but the last two coefficients are reshaped to one row of 6 per
    # series; the last two are kept as the global coefficients.
    # NOTE(review): an earlier comment spoke of 7 coefficients
    # (mu, ar, ma, omega, alpha, beta, gamma) while the reshape uses 6 -
    # confirm against the univariate specification in use.
    _coef_ = r_coef_method(self.fit)
    self.coef_ = np.array(_coef_[:-2]).reshape(N, 6)
    self.global_coef_ = np.array(_coef_[-2:])
    self.N = N
def compareAbundanceOfFalsePositiveSpecies(infiles, outfile):
    '''
    Boxplot the relative abundance of false positive species
    compared to true positives.

    ``infiles[0]`` names the table of estimated abundances; the table of
    true taxa is the one among ``infiles[1:]`` whose basename (without
    ".load") equals the track of the first file (its basename without
    "metaphlan_" and ".load").  Both tables are read from the sqlite
    database "csvdb".  Estimated species found in the true set are
    labelled "tp", all others "fp"; the plot is saved to ``outfile``.
    '''
    tablename_estimate = P.toTable(infiles[0])

    track = P.snip(
        os.path.basename(infiles[0]).replace("metaphlan_", ""), ".load")
    tablename_true = [
        P.toTable(x) for x in infiles[1:]
        if P.snip(os.path.basename(x), ".load") == track
    ][0]

    # table names cannot be bound as SQL parameters, hence the string
    # formatting; they are derived from the input file names
    dbh = sqlite3.connect("csvdb")
    try:
        cc = dbh.cursor()
        estimate = {}
        for taxon, rel_abundance in cc.execute(
                """SELECT taxon, rel_abundance FROM %s
                WHERE taxon_level == 'species'"""
                % tablename_estimate).fetchall():
            estimate[taxon] = rel_abundance

        true = set()
        for data in cc.execute(
                """SELECT taxa FROM %s WHERE level == 'species'"""
                % tablename_true).fetchall():
            true.add(data[0])
    finally:
        dbh.close()

    tmp = P.getTempFile(".")
    inf = tmp.name
    try:
        tmp.write("taxa\tabundance\tstatus\n")
        for taxa, abundance in estimate.items():
            status = "tp" if taxa in true else "fp"
            tmp.write("%s\t%f\t%s\n" % (taxa, abundance, status))
        tmp.close()

        # boxplot colour by sequencing depth named in the track; tracks
        # without a known depth fall back to a neutral colour instead of
        # failing on an unbound variable
        col = "grey20"
        for depth, colour in (("15M", "cadetblue"),
                              ("30M", "lightblue"),
                              ("50M", "slategray")):
            if track.find(depth) != -1:
                col = colour
                break

        R('''dat <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t")''' % inf)
        R('''library(ggplot2)''')
        # geom_hline takes "yintercept"
        R('''ggplot(dat, aes(x = status, y = log2(abundance))) + geom_boxplot(colour = "%s") + geom_hline(yintercept=0, linetype="dashed")''' % col)
        R('''ggsave("%s")''' % outfile)
    finally:
        # remove the temp file whether or not plotting succeeded
        os.unlink(inf)
def __new__(cls):
    '''Create an instance with an empty command history and remember it
    on the class as ``_instance``.'''
    instance = RBase.__new__(cls)
    instance._history = []
    cls._instance = instance
    return instance
def __call__(self, string):
    '''Record ``string`` in the history, evaluate it through
    ``RBase.__call__`` and return the result of that call.'''
    self._history.append(string)
    # the result is passed on to the caller rather than discarded
    return RBase.__call__(self, string)