def render(self, work, path):
    """Render a smoothed scatter plot of the first two entries of `work`.

    Arguments
    ---------
    work : dict
        Mapping of label to values. The first two entries (in the
        mapping's iteration order) supply the x and y coordinates.
    path : object
        Passed through unchanged to ``self.endPlot``.

    Returns
    -------
    Whatever ``self.endPlot(work, path)`` returns.

    Raises
    ------
    ValueError
        If `work` has fewer than two entries or if no data remains
        after ``Stats.filterNone``.
    """
    # Materialise the views: dict.keys()/values() can not be sliced
    # on python 3.
    labels = list(work.keys())
    if len(labels) < 2:
        raise ValueError(
            "requiring two coordinates, only got %s" % str(labels))

    xlabel, ylabel = labels[:2]
    xvals, yvals = Stats.filterNone(list(work.values())[:2])

    if len(xvals) == 0 or len(yvals) == 0:
        raise ValueError("no data")

    # apply log transformation on data not on plot
    if self.logscale:
        if "x" in self.logscale:
            xvals = R.log10(xvals)
        if "y" in self.logscale:
            yvals = R.log10(yvals)

    # Start the plot only once the input has been validated, so that a
    # ValueError above does not leave a started plot behind.
    self.startPlot()
    R.smoothScatter(xvals, yvals,
                    xlab=xlabel, ylab=ylabel,
                    nbin=self.nbins)

    return self.endPlot(work, path)
def render(self, dataframe, path):
    """Render one smoothed scatter plot per pair of columns.

    Arguments
    ---------
    dataframe : object
        Table-like object with a ``columns`` attribute and column access
        through ``dataframe[name]`` (presumably a pandas DataFrame -
        TODO confirm against caller).
    path : object
        Passed through unchanged to ``self.endPlot``.

    Returns
    -------
    ResultBlocks
        Collection extended with the result of ``self.endPlot`` for
        every plotted column pair.

    Raises
    ------
    ValueError
        If `dataframe` has fewer than two columns.
    """
    if len(dataframe.columns) < 2:
        raise ValueError(
            "requiring two coordinates, only got %s" %
            str(dataframe.columns))

    blocks = ResultBlocks()

    for xcolumn, ycolumn in itertools.combinations(dataframe.columns, 2):

        # remove missing data points
        xvalues, yvalues = Stats.filterMissing(
            (dataframe[xcolumn], dataframe[ycolumn]))

        # skip column pairs for which no data is left
        if len(xvalues) == 0 or len(yvalues) == 0:
            continue

        # apply log transformation on data not on plot
        if self.logscale:
            if "x" in self.logscale:
                xvalues = R.log10(xvalues)
            if "y" in self.logscale:
                yvalues = R.log10(yvalues)

        self.startPlot()
        # wrap, as pandas series can not be passed through rpy2.
        # The builtin ``float`` is used as dtype: the ``numpy.float``
        # alias has been removed from recent numpy releases.
        R.smoothScatter(numpy.array(xvalues, dtype=float),
                        numpy.array(yvalues, dtype=float),
                        xlab=xcolumn, ylab=ycolumn,
                        nbin=self.nbins)

        blocks.extend(self.endPlot(dataframe, path))

    return blocks
def buildExpressionStats(tables, method, outfile, outdir):
    '''build expression summary statistics.

    For every level in ``CUFFDIFF_LEVELS`` and every entry in `tables`,
    the database table ``<tablename>_<level>_diff`` is summarised into
    one row per (treatment_name, control_name) pair. Tables that are
    not present in the database are skipped.

    Also creates a diagnostic plot (pvalue versus length) per table in
    `outdir` if more than 10 significant results exist.

    Arguments
    ---------
    tables : list
        Table name prefixes. Names need to match
        ``<design>_vs_<geneset>_<counting_method>_<method>`` or
        ``<design>_<geneset>_<method>``.
    method : string
        Method name used as the suffix when parsing table names.
    outfile : string
        Output filename, tab-separated.
    outdir : string
        Output directory for diagnostic plots.

    Raises
    ------
    ValueError
        If a table name can not be parsed.
    '''
    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    def _split(tablename):
        # this would be much easier, if feature_counts/gene_counts/etc.
        # would not contain an underscore.
        match = re.match("([^_]+)_vs_([^_]+)_(.*)_%s" % method, tablename)
        if match is not None:
            design, geneset, counting_method = match.groups()
            return design, geneset, counting_method
        match = re.match("([^_]+)_([^_]+)_%s" % method, tablename)
        if match is not None:
            design, geneset = match.groups()
            return design, geneset, "na"
        raise ValueError("can't parse tablename %s" % tablename)

    def _to_dict(rows, nkeys=2):
        # key: first `nkeys` columns, value: the following column.
        # Missing keys default to a count of 0.
        return collections.defaultdict(
            int, [(tuple(row[:nkeys]), row[nkeys]) for row in rows])

    def _fetch(statement):
        return Database.executewait(dbhandle, statement).fetchall()

    def _plot_pvalue_vs_length(pngfile, rows):
        # zip() returns an iterator on python 3 and can not be indexed,
        # so unpack the two columns directly.
        lengths, pvalues = zip(*rows)
        R.png(pngfile)
        try:
            # NOTE(review): data are log10 transformed and log="x" is
            # passed on as well - confirm this is intended.
            R.smoothScatter(R.log10(ro.FloatVector(lengths)),
                            R.log10(ro.FloatVector(pvalues)),
                            xlab='log10( length )',
                            ylab='log10( pvalue )',
                            log="x", pch=20, cex=.1)
        finally:
            # always close the graphics device, even if plotting fails
            R['dev.off']()

    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        outf = IOTools.openFile(outfile, "w")
        try:
            # The column order needs to match the order in which the
            # fields are written below (counting_method precedes
            # treatment_name).
            outf.write("\t".join(
                ("design", "geneset", "level", "counting_method",
                 "treatment_name", "control_name", "tested",
                 "\t".join(["status_%s" % x for x in keys_status]),
                 "significant", "twofold")) + "\n")

            all_tables = set(Database.getTables(dbhandle))

            for level in CUFFDIFF_LEVELS:
                for tablename in tables:
                    tablename_diff = "%s_%s_diff" % (tablename, level)
                    design, geneset, counting_method = _split(
                        tablename_diff)
                    if tablename_diff not in all_tables:
                        continue

                    tested = _to_dict(_fetch(
                        "SELECT treatment_name, control_name, "
                        "COUNT(*) FROM %s "
                        "GROUP BY treatment_name,control_name"
                        % tablename_diff))
                    status = _to_dict(_fetch(
                        "SELECT treatment_name, control_name, status, "
                        "COUNT(*) FROM %s "
                        "GROUP BY treatment_name,control_name,status"
                        % tablename_diff), 3)
                    signif = _to_dict(_fetch(
                        "SELECT treatment_name, control_name, "
                        "COUNT(*) FROM %s "
                        "WHERE significant "
                        "GROUP BY treatment_name,control_name"
                        % tablename_diff))
                    fold2 = _to_dict(_fetch(
                        "SELECT treatment_name, control_name, "
                        "COUNT(*) FROM %s "
                        "WHERE (l2fold >= 1 or l2fold <= -1) "
                        "AND significant "
                        "GROUP BY treatment_name,control_name,significant"
                        % tablename_diff))

                    for pair in list(tested.keys()):
                        treatment_name, control_name = pair
                        fields = (
                            design, geneset, level, counting_method,
                            treatment_name, control_name,
                            tested[pair],
                            "\t".join([str(status[pair + (x,)])
                                       for x in keys_status]),
                            signif[pair],
                            fold2[pair])
                        outf.write("\t".join(map(str, fields)) + "\n")

                    # plot length versus P-Value
                    data = _fetch(
                        "SELECT i.sum, pvalue "
                        "FROM %s, "
                        "%s_geneinfo as i "
                        "WHERE i.gene_id = test_id AND "
                        "significant" % (tablename_diff, geneset))

                    # require more than 10 datapoints - otherwise
                    # smooth scatter fails
                    if len(data) > 10:
                        _plot_pvalue_vs_length(
                            "%s/%s_%s_%s_pvalue_vs_length.png" %
                            (outdir, design, geneset, level),
                            data)
        finally:
            outf.close()
    finally:
        dbhandle.close()
def buildDMRStats(tables, method, outfile, dbhandle):
    """build dmr summary statistics.

    This method counts the number of up/down, 2fold up/down, etc.
    entries per (treatment_name, control_name) pair in each table.

    This method also creates diagnostic plots (pvalue versus length) in
    the ``diff_methylation`` directory below ``PARAMS["exportdir"]`` for
    tables with more than 10 significant results.

    Tables should be labeled <tileset>_<design>_<method>.

    Arguments
    ---------
    tables : list
        List of tables with DMR output
    method : string
        Method name
    outfile : string
        Output filename. Tab separated file summarizing the counts.
    dbhandle : object
        Database handle. It is used for all queries and left open.
    """
    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"
    by_pair = "treatment_name,control_name"
    by_pair_significant = "treatment_name,control_name,significant"

    def _to_dict(rows, nkeys=2):
        # key: first `nkeys` columns, value: the following column.
        # Missing keys default to a count of 0.
        return collections.defaultdict(
            int, [(tuple(row[:nkeys]), row[nkeys]) for row in rows])

    def _fetch(statement):
        return Database.executewait(dbhandle, statement).fetchall()

    def _count_pairs(table, condition, group_by):
        # number of rows matching `condition` per treatment/control pair
        return _to_dict(_fetch(
            """SELECT treatment_name, control_name, COUNT(*)
            FROM %s
            WHERE %s
            GROUP BY %s""" % (table, condition, group_by)))

    def _plot_pvalue_vs_length(pngfile, rows):
        # zip() returns an iterator on python 3 and can not be indexed,
        # so unpack the two columns directly.
        lengths, pvalues = zip(*rows)
        R.png(pngfile)
        try:
            # NOTE(review): data are log10 transformed and log="x" is
            # passed on as well - confirm this is intended.
            R.smoothScatter(
                R.log10(ro.FloatVector(lengths)),
                R.log10(ro.FloatVector(pvalues)),
                xlab="log10(length)",
                ylab="log10(pvalue)",
                log="x",
                pch=20,
                cex=0.1,
            )
        finally:
            # always close the graphics device, even if plotting fails
            R["dev.off"]()

    outdir = os.path.join(PARAMS["exportdir"], "diff_methylation")

    outf = IOTools.openFile(outfile, "w")
    try:
        outf.write(
            "\t".join(
                (
                    "tileset",
                    "design",
                    "track1",
                    "track2",
                    "tested",
                    "\t".join(["status_%s" % x for x in keys_status]),
                    "significant",
                    "up",
                    "down",
                    "twofold",
                    "twofold_up",
                    "twofold_down",
                )
            )
            + "\n"
        )

        for tablename in tables:
            prefix = P.snip(tablename, "_%s" % method)
            tileset, design = prefix.split("_")

            E.info("collecting data from %s" % tablename)

            tested = _to_dict(_fetch(
                """SELECT treatment_name, control_name, COUNT(*)
                FROM %s
                GROUP BY treatment_name,control_name""" % tablename))
            status = _to_dict(_fetch(
                """SELECT treatment_name, control_name, status, COUNT(*)
                FROM %s
                GROUP BY treatment_name,control_name,status"""
                % tablename), 3)
            signif = _count_pairs(tablename, "significant", by_pair)
            # NOTE(review): "twofold" includes |l2fold| == 1 while
            # "twofold_up"/"twofold_down" use strict comparisons -
            # confirm which boundary is intended.
            fold2 = _count_pairs(
                tablename,
                "(l2fold >= 1 or l2fold <= -1) AND significant",
                by_pair_significant)
            up = _count_pairs(
                tablename, "l2fold > 0 AND significant",
                by_pair_significant)
            down = _count_pairs(
                tablename, "l2fold < 0 AND significant",
                by_pair_significant)
            fold2up = _count_pairs(
                tablename, "l2fold > 1 AND significant",
                by_pair_significant)
            fold2down = _count_pairs(
                tablename, "l2fold < -1 AND significant",
                by_pair_significant)

            for k in list(tested.keys()):
                treatment_name, control_name = k
                fields = (
                    tileset,
                    design,
                    treatment_name,
                    control_name,
                    tested[k],
                    "\t".join([str(status[k + (x,)]) for x in keys_status]),
                    signif[k],
                    up[k],
                    down[k],
                    fold2[k],
                    fold2up[k],
                    fold2down[k],
                )
                outf.write("\t".join(map(str, fields)) + "\n")

            # plot length versus P-Value
            data = _fetch(
                """SELECT end - start, pvalue
                FROM %s
                WHERE significant""" % tablename)

            # require more than 10 datapoints - otherwise smooth
            # scatter fails
            if len(data) > 10:
                _plot_pvalue_vs_length(
                    "%s/%s_%s_%s_pvalue_vs_length.png"
                    % (outdir, tileset, design, method),
                    data)
    finally:
        outf.close()
def buildDMRStats(tables, method, outfile):
    '''build dmr summary statistics.

    Counts tested, significant, up/down and twofold up/down entries per
    (treatment_name, control_name) pair in each table of the database
    given by ``PARAMS["database"]``.

    Creates some diagnostic plots (pvalue versus length) in the
    ``diff_methylation`` directory below ``PARAMS["exportdir"]`` for
    tables with more than 10 significant results.

    Tables should be labeled <tileset>_<design>_<method>.

    Arguments
    ---------
    tables : list
        List of tables with DMR output.
    method : string
        Method name, the suffix of each table name.
    outfile : string
        Output filename, tab-separated.
    '''
    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"
    significant_groups = "treatment_name,control_name,significant"

    def _to_dict(rows, nkeys=2):
        # key: first `nkeys` columns, value: the following column.
        # Missing keys default to a count of 0.
        return collections.defaultdict(
            int, [(tuple(row[:nkeys]), row[nkeys]) for row in rows])

    def _fetch(statement):
        return Database.executewait(dbhandle, statement).fetchall()

    def _count_where(table, condition, group_by):
        # number of rows matching `condition` per treatment/control pair
        return _to_dict(_fetch(
            """SELECT treatment_name, control_name, COUNT(*)
            FROM %s
            WHERE %s
            GROUP BY %s""" % (table, condition, group_by)))

    def _plot_pvalue_vs_length(pngfile, rows):
        # zip() returns an iterator on python 3 and can not be indexed,
        # so unpack the two columns directly.
        lengths, pvalues = zip(*rows)
        R.png(pngfile)
        try:
            # NOTE(review): data are log10 transformed and log="x" is
            # passed on as well - confirm this is intended.
            R.smoothScatter(R.log10(ro.FloatVector(lengths)),
                            R.log10(ro.FloatVector(pvalues)),
                            xlab='log10( length )',
                            ylab='log10( pvalue )',
                            log="x", pch=20, cex=.1)
        finally:
            # always close the graphics device, even if plotting fails
            R['dev.off']()

    outdir = os.path.join(PARAMS["exportdir"], "diff_methylation")

    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        outf = IOTools.openFile(outfile, "w")
        try:
            outf.write("\t".join(
                ("tileset", "design", "track1", "track2", "tested",
                 "\t".join(["status_%s" % x for x in keys_status]),
                 "significant", "up", "down",
                 "twofold", "twofold_up", "twofold_down")) + "\n")

            for tablename in tables:
                prefix = P.snip(tablename, "_%s" % method)
                tileset, design = prefix.split("_")

                E.info("collecting data from %s" % tablename)

                tested = _to_dict(_fetch(
                    """SELECT treatment_name, control_name, COUNT(*)
                    FROM %s
                    GROUP BY treatment_name,control_name""" % tablename))
                status = _to_dict(_fetch(
                    """SELECT treatment_name, control_name, status,
                    COUNT(*)
                    FROM %s
                    GROUP BY treatment_name,control_name,status"""
                    % tablename), 3)
                signif = _count_where(
                    tablename, "significant",
                    "treatment_name,control_name")
                # NOTE(review): "twofold" includes |l2fold| == 1 while
                # "twofold_up"/"twofold_down" use strict comparisons -
                # confirm which boundary is intended.
                fold2 = _count_where(
                    tablename,
                    "(l2fold >= 1 or l2fold <= -1) AND significant",
                    significant_groups)
                up = _count_where(
                    tablename, "l2fold > 0 AND significant",
                    significant_groups)
                down = _count_where(
                    tablename, "l2fold < 0 AND significant",
                    significant_groups)
                fold2up = _count_where(
                    tablename, "l2fold > 1 AND significant",
                    significant_groups)
                fold2down = _count_where(
                    tablename, "l2fold < -1 AND significant",
                    significant_groups)

                for k in list(tested.keys()):
                    treatment_name, control_name = k
                    fields = (
                        tileset, design,
                        treatment_name, control_name,
                        tested[k],
                        "\t".join([str(status[k + (x,)])
                                   for x in keys_status]),
                        signif[k],
                        up[k], down[k],
                        fold2[k],
                        fold2up[k], fold2down[k])
                    outf.write("\t".join(map(str, fields)) + "\n")

                # plot length versus P-Value
                data = _fetch(
                    '''SELECT end - start, pvalue
                    FROM %s
                    WHERE significant''' % tablename)

                # require more than 10 datapoints - otherwise smooth
                # scatter fails
                if len(data) > 10:
                    _plot_pvalue_vs_length(
                        "%s/%s_%s_%s_pvalue_vs_length.png"
                        % (outdir, tileset, design, method),
                        data)
        finally:
            outf.close()
    finally:
        # the connection is opened here, so it is released here as well
        dbhandle.close()
def buildExpressionStats(
        dbhandle,
        outfile,
        tablenames,
        outdir,
        regex_table="(?P<design>[^_]+)_"
        "(?P<geneset>[^_]+)_"
        "(?P<counting_method>[^_]+)_"
        "(?P<method>[^_]+)_"
        "(?P<level>[^_]+)_diff"):
    """compile expression summary statistics from database.

    This method outputs a table with the number of genes tested,
    failed, differentially expressed, etc. for a series of DE tests.

    A diagnostic plot (pvalue versus length) is written to `outdir`
    for each table with more than 10 significant results.

    Arguments
    ---------
    dbhandle : object
        Database handle. It is used for all queries and left open.
    outfile : string
        Output filename in :term:`tsv` format.
    tablenames : list
        List of tables to process.
    outdir : string
        Output directory for diagnostic plots.
    regex_table : string
        Regular expression to extract experimental information
        from table name. The groups ``design``, ``geneset``,
        ``counting_method`` and ``level`` are used.

    Raises
    ------
    ValueError
        If a table name does not match `regex_table`.
    """
    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    def _to_dict(rows, nkeys=2):
        # key: first `nkeys` columns, value: the following column.
        # Missing keys default to a count of 0.
        return collections.defaultdict(
            int, [(tuple(row[:nkeys]), row[nkeys]) for row in rows])

    def _fetch(statement):
        return Database.executewait(dbhandle, statement).fetchall()

    def _plot_pvalue_vs_length(pngfile, rows):
        # zip() returns an iterator on python 3 and can not be indexed,
        # so unpack the two columns directly.
        lengths, pvalues = zip(*rows)
        R.png(pngfile)
        try:
            # NOTE(review): data are log10 transformed and log="x" is
            # passed on as well - confirm this is intended.
            R.smoothScatter(R.log10(ro.FloatVector(lengths)),
                            R.log10(ro.FloatVector(pvalues)),
                            xlab='log10( length )',
                            ylab='log10( pvalue )',
                            log="x", pch=20, cex=.1)
        finally:
            # always close the graphics device, even if plotting fails
            R['dev.off']()

    outf = IOTools.openFile(outfile, "w")
    try:
        outf.write("\t".join(
            ("design", "geneset", "level",
             "counting_method",
             "treatment_name",
             "control_name", "tested",
             "\t".join(["status_%s" % x for x in keys_status]),
             "significant",
             "twofold")) + "\n")

        for tablename in tablenames:
            r = re.search(regex_table, tablename)
            if r is None:
                raise ValueError(
                    "can't match tablename '%s' to regex" % tablename)
            design = r.group("design")
            geneset = r.group("geneset")
            level = r.group("level")
            counting_method = r.group("counting_method")

            tested = _to_dict(_fetch(
                "SELECT treatment_name, control_name, "
                "COUNT(*) FROM %s "
                "GROUP BY treatment_name,control_name" % tablename))
            status = _to_dict(_fetch(
                "SELECT treatment_name, control_name, status, "
                "COUNT(*) FROM %s "
                "GROUP BY treatment_name,control_name,status"
                % tablename), 3)
            signif = _to_dict(_fetch(
                "SELECT treatment_name, control_name, "
                "COUNT(*) FROM %s "
                "WHERE significant "
                "GROUP BY treatment_name,control_name" % tablename))
            fold2 = _to_dict(_fetch(
                "SELECT treatment_name, control_name, "
                "COUNT(*) FROM %s "
                "WHERE (l2fold >= 1 or l2fold <= -1) AND significant "
                "GROUP BY treatment_name,control_name,significant"
                % tablename))

            for pair in list(tested.keys()):
                treatment_name, control_name = pair
                fields = (
                    design, geneset, level, counting_method,
                    treatment_name, control_name,
                    tested[pair],
                    "\t".join([str(status[pair + (x,)])
                               for x in keys_status]),
                    signif[pair],
                    fold2[pair])
                outf.write("\t".join(map(str, fields)) + "\n")

            # plot length versus P-Value
            data = _fetch(
                "SELECT i.sum, pvalue "
                "FROM %s, "
                "%s_geneinfo as i "
                "WHERE i.gene_id = test_id AND "
                "significant" % (tablename, geneset))

            # require more than 10 datapoints - otherwise smooth
            # scatter fails
            if len(data) > 10:
                # NOTE(review): the filename does not contain the
                # counting method or method, so tables differing only
                # in these write to the same file - confirm.
                _plot_pvalue_vs_length(
                    ("%s/%s_%s_%s"
                     "_pvalue_vs_length.png") %
                    (outdir, design, geneset, level),
                    data)
    finally:
        outf.close()
def buildExpressionStats(tables, method, outfile, outdir):
    '''build expression summary statistics.

    Each table ``<tablename>_<level>_diff``, for all levels in
    ``CUFFDIFF_LEVELS`` and all entries in `tables`, is summarised into
    one output row per (treatment_name, control_name) pair. Tables not
    found in the database given by ``PARAMS["database"]`` are skipped.

    Creates also diagnostic plots (pvalue versus length) in `outdir`
    for tables with more than 10 significant results.

    Arguments
    ---------
    tables : list
        Table name prefixes. Names need to match
        ``<design>_vs_<geneset>_<counting_method>_<method>`` or
        ``<design>_<geneset>_<method>``.
    method : string
        Method name used as the suffix when parsing table names.
    outfile : string
        Output filename, tab-separated.
    outdir : string
        Output directory for diagnostic plots.

    Raises
    ------
    ValueError
        If a table name can not be parsed.
    '''
    keys_status = "OK", "NOTEST", "FAIL", "NOCALL"

    # The column order has to agree with the order of the fields
    # written per row: counting_method comes before treatment_name.
    header = ("design", "geneset", "level", "counting_method",
              "treatment_name", "control_name", "tested",
              "\t".join(["status_%s" % x for x in keys_status]),
              "significant", "twofold")

    def _split(tablename):
        # this would be much easier, if feature_counts/gene_counts/etc.
        # would not contain an underscore.
        with_counting = re.match(
            "([^_]+)_vs_([^_]+)_(.*)_%s" % method, tablename)
        if with_counting is not None:
            design, geneset, counting_method = with_counting.groups()
            return design, geneset, counting_method
        without_counting = re.match(
            "([^_]+)_([^_]+)_%s" % method, tablename)
        if without_counting is None:
            raise ValueError("can't parse tablename %s" % tablename)
        design, geneset = without_counting.groups()
        return design, geneset, "na"

    def _to_dict(rows, nkeys=2):
        # key: first `nkeys` columns, value: the following column.
        # Missing keys default to a count of 0.
        return collections.defaultdict(
            int, [(tuple(row[:nkeys]), row[nkeys]) for row in rows])

    def _fetch(statement):
        return Database.executewait(dbhandle, statement).fetchall()

    def _write_rows(outf, tablename_diff, level, design, geneset,
                    counting_method):
        # one output row per treatment/control pair of the table
        tested = _to_dict(_fetch(
            "SELECT treatment_name, control_name, "
            "COUNT(*) FROM %s "
            "GROUP BY treatment_name,control_name" % tablename_diff))
        status = _to_dict(_fetch(
            "SELECT treatment_name, control_name, status, "
            "COUNT(*) FROM %s "
            "GROUP BY treatment_name,control_name,status"
            % tablename_diff), 3)
        signif = _to_dict(_fetch(
            "SELECT treatment_name, control_name, "
            "COUNT(*) FROM %s "
            "WHERE significant "
            "GROUP BY treatment_name,control_name" % tablename_diff))
        fold2 = _to_dict(_fetch(
            "SELECT treatment_name, control_name, "
            "COUNT(*) FROM %s "
            "WHERE (l2fold >= 1 or l2fold <= -1) AND significant "
            "GROUP BY treatment_name,control_name,significant"
            % tablename_diff))

        for pair in list(tested.keys()):
            treatment_name, control_name = pair
            fields = (
                design, geneset, level, counting_method,
                treatment_name, control_name,
                tested[pair],
                "\t".join([str(status[pair + (x,)])
                           for x in keys_status]),
                signif[pair],
                fold2[pair])
            outf.write("\t".join(map(str, fields)) + "\n")

    def _plot_pvalue_vs_length(pngfile, rows):
        # zip() returns an iterator on python 3 and can not be indexed,
        # so unpack the two columns directly.
        lengths, pvalues = zip(*rows)
        R.png(pngfile)
        try:
            # NOTE(review): data are log10 transformed and log="x" is
            # passed on as well - confirm this is intended.
            R.smoothScatter(R.log10(ro.FloatVector(lengths)),
                            R.log10(ro.FloatVector(pvalues)),
                            xlab='log10( length )',
                            ylab='log10( pvalue )',
                            log="x", pch=20, cex=.1)
        finally:
            # always close the graphics device, even if plotting fails
            R['dev.off']()

    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        outf = IOTools.openFile(outfile, "w")
        try:
            outf.write("\t".join(header) + "\n")

            all_tables = set(Database.getTables(dbhandle))

            for level in CUFFDIFF_LEVELS:
                for tablename in tables:
                    tablename_diff = "%s_%s_diff" % (tablename, level)
                    design, geneset, counting_method = _split(
                        tablename_diff)
                    if tablename_diff not in all_tables:
                        continue

                    _write_rows(outf, tablename_diff, level,
                                design, geneset, counting_method)

                    # plot length versus P-Value
                    data = _fetch(
                        "SELECT i.sum, pvalue "
                        "FROM %s, "
                        "%s_geneinfo as i "
                        "WHERE i.gene_id = test_id AND "
                        "significant" % (tablename_diff, geneset))

                    # require more than 10 datapoints - otherwise
                    # smooth scatter fails
                    if len(data) > 10:
                        _plot_pvalue_vs_length(
                            "%s/%s_%s_%s_pvalue_vs_length.png" %
                            (outdir, design, geneset, level),
                            data)
        finally:
            outf.close()
    finally:
        # the connection is opened here, so it is released here as well
        dbhandle.close()