def createViewMapping(infile, outfile): '''create view in database for alignment stats. This view aggregates all information on a per-track basis. The table is built from the following tracks: mapping_stats bam_stats ''' tablename = P.toTable(outfile) # can not create views across multiple database, so use table view_type = "TABLE" dbhandle = connect() Database.executewait( dbhandle, "DROP %(view_type)s IF EXISTS %(tablename)s" % locals()) statement = ''' CREATE %(view_type)s %(tablename)s AS SELECT * FROM bam_stats AS b ''' Database.executewait(dbhandle, statement % locals())
def createViewMapping(infile, outfile): """create view in database for alignment stats. This view aggregates all information on a per-track basis. The table is built from the following tracks: mapping_stats bam_stats """ tablename = P.toTable(outfile) # can not create views across multiple database, so use table view_type = "TABLE" dbhandle = connect() Database.executewait(dbhandle, "DROP %(view_type)s IF EXISTS %(tablename)s" % locals()) statement = """ CREATE %(view_type)s %(tablename)s AS SELECT * FROM bam_stats AS b """ Database.executewait(dbhandle, statement % locals())
def loadHypergeometricAnalysis(infile, outfile): '''load GO results.''' track = P.toTable(outfile) tablename = 'hypergeometric_%s_summary' % track P.load(infile, outfile, tablename=tablename) dbh = connect() ontologies = [x[0] for x in Database.executewait( dbh, '''SELECT DISTINCT ontology FROM %s''' % tablename).fetchall()] genelists = [x[0] for x in Database.executewait( dbh, '''SELECT DISTINCT genelist FROM %s''' % tablename).fetchall()] # output files from runGO.py sections = ('results', 'parameters', 'withgenes') for section in sections: tablename = 'hypergeometric_%s_%s' % (track, section) statement = ''' python %(scriptsdir)s/combine_tables.py --cat=track --regex-filename="hypergeometric.dir/%(track)s.tsv.dir/(\S+).%(section)s" hypergeometric.dir/%(track)s.tsv.dir/*.%(section)s | python %(scriptsdir)s/csv2db.py %(csv2db_options)s --table=%(tablename)s >> %(outfile)s''' P.run() for ontology in ontologies: fn = os.path.join(infile + ".dir", "all_alldesc.%s.l2fold" % ontology) if not os.path.exists(fn): E.warn("file %s does not exist" % fn) continue P.load(fn, outfile, tablename='hypergeometric_%s_%s_l2fold' % (track, ontology), options='--allow-empty') fn = os.path.join( infile + ".dir", "all_alldesc.%s.l10pvalue" % ontology) P.load(fn, outfile, tablename='hypergeometric_%s_%s_l10pvalue' % (track, ontology), options='--allow-empty') fn = os.path.join( infile + ".dir", "all_alldesc.%s.l10qvalue" % ontology) P.load(fn, outfile, tablename='hypergeometric_%s_%s_l10qvalue' % (track, ontology), options='--allow-empty')
def loadCodingPotential(infile, outfile): '''load annotations''' table = P.toTable(outfile) statement = ''' gunzip < %(infile)s | python %(scriptsdir)s/csv2db.py %(csv2db_options)s --allow-empty --index=gene_id --map=gene_id:str --table=%(table)s > %(outfile)s''' P.run() # set the is_coding flag dbhandle = sqlite3.connect(PARAMS["database"]) Database.executewait( dbhandle, '''ALTER TABLE %(table)s ADD COLUMN is_coding INTEGER''' % locals()) Database.executewait( dbhandle, '''UPDATE %(table)s SET is_coding = (result == 'coding')''' % locals()) dbhandle.commit()
def numberGenesDetectedFeatureCounts(infile, outfile): '''Count no genes detected by featureCount at counts > 0 in each sample''' table = P.toTable(infile) attach = '''attach "%(ANN_DATABASE)s" as anndb''' % globals() statement = '''select distinct h.*, gene_biotype from %(table)s h inner join anndb.gene_info i on h.gene_id=i.gene_id ''' % locals() melted_df = DB.fetch_DataFrame(statement, DATABASE, attach) grouped_df = melted_df.groupby(["gene_biotype", "track"]) agg_df = grouped_df.agg( {"counts": lambda x: np.sum([1 for y in x if y > 0])}) agg_df.reset_index(inplace=True) count_df = pd.pivot_table(agg_df, index="track", values="counts", columns="gene_biotype") count_df["total"] = count_df.apply(np.sum, 1) count_df["sample_id"] = count_df.index count_df.to_csv(outfile, index=False, sep="\t")
def numberGenesDetectedFeatureCounts(infile, outfile): '''Count no genes detected by featureCount at counts > 0 in each sample''' table = P.toTable(infile) attach = '''attach "%(ANN_DATABASE)s" as anndb''' % globals() statement = '''select distinct h.*, gene_biotype from %(table)s h inner join anndb.gene_info i on h.gene_id=i.gene_id ''' % locals() melted_df = DB.fetch_DataFrame(statement, DATABASE, attach) grouped_df = melted_df.groupby(["gene_biotype", "track"]) agg_df = grouped_df.agg({"counts": lambda x: np.sum([1 for y in x if y > 0])}) agg_df.reset_index(inplace=True) count_df = pd.pivot_table(agg_df, index="track", values="counts", columns="gene_biotype") count_df["total"] = count_df.apply(np.sum, 1) count_df["sample_id"] = count_df.index count_df.to_csv(outfile, index=False, sep="\t")
def ReadGene2GOFromDatabase(dbhandle, go_type, database, species): """read go assignments from ensembl database. returns a dictionary of lists. (one to many mapping of genes to GO categories) and a dictionary of go-term to go information Note: assumes that external_db_id for GO is 1000 """ statement = GetGOStatement(go_type, database, species) result = Database.executewait(dbhandle, statement, retries=0).fetchall() gene2go = {} go2info = collections.defaultdict(GOInfo) for gene_id, goid, description, evidence in result: gm = GOMatch(goid, go_type, description, evidence) gi = GOInfo(goid, go_type, description) if gene_id not in gene2go: gene2go[gene_id] = [] gene2go[gene_id].append(gm) go2info[goid] = gi return gene2go, go2info
def numberGenesDetectedCufflinks(infile, outfile): '''Count no genes detected at copynumer > 0 in each sample''' table = P.toTable(infile) attach = '''attach "%(ANN_DATABASE)s" as anndb''' % globals() statement = '''select distinct c.*, gene_biotype from %(table)s c inner join anndb.gene_info i on c.tracking_id=i.gene_id ''' % locals() df = DB.fetch_DataFrame(statement, DATABASE, attach) # snip off the cufflinks replicate field df.columns = [x[:-len("_0")] if x.endswith("_0") else x for x in df.columns] melted_df = pd.melt(df, id_vars=["tracking_id", "gene_biotype"]) grouped_df = melted_df.groupby(["gene_biotype", "variable"]) agg_df = grouped_df.agg({"value": lambda x: np.sum([1 for y in x if y > 0])}) agg_df.reset_index(inplace=True) count_df = pd.pivot_table(agg_df, index="variable", values="value", columns="gene_biotype") count_df["total"] = count_df.apply(np.sum, 1) count_df["sample_id"] = count_df.index count_df.to_csv(outfile, index=False, sep="\t")
def numberGenesDetectedCufflinks(infile, outfile): '''Count no genes detected at copynumer > 0 in each sample''' table = P.toTable(infile) attach = '''attach "%(ANN_DATABASE)s" as anndb''' % globals() statement = '''select distinct c.*, gene_biotype from %(table)s c inner join anndb.gene_info i on c.tracking_id=i.gene_id ''' % locals() df = DB.fetch_DataFrame(statement, DATABASE, attach) # snip off the cufflinks replicate field df.columns = [ x[:-len("_0")] if x.endswith("_0") else x for x in df.columns ] melted_df = pd.melt(df, id_vars=["tracking_id", "gene_biotype"]) grouped_df = melted_df.groupby(["gene_biotype", "variable"]) agg_df = grouped_df.agg( {"value": lambda x: np.sum([1 for y in x if y > 0])}) agg_df.reset_index(inplace=True) count_df = pd.pivot_table(agg_df, index="variable", values="value", columns="gene_biotype") count_df["total"] = count_df.apply(np.sum, 1) count_df["sample_id"] = count_df.index count_df.to_csv(outfile, index=False, sep="\t")
def importCodingPotential( infile, outfile ): '''import annotations''' table = outfile[:-len(".import")] statement = ''' python %(scriptsdir)s/csv2db.py %(csv2db_options)s --allow-empty --index=gene_id --map=gene_id:str --table=%(table)s < %(infile)s > %(outfile)s''' P.run() # set the is_coding flag dbhandle = sqlite3.connect( PARAMS["database"] ) Database.executewait( dbhandle, '''ALTER TABLE %(table)s ADD COLUMN is_coding INTEGER''' % locals()) Database.executewait( dbhandle, '''UPDATE %(table)s SET is_coding = (f_iscoding == 'coding') OR (r_iscoding == 'coding') ''' % locals()) dbhandle.commit()
def mergeEffectsPerGene( infile, outfile ): '''summarize effects on a per-gene level.''' tablename = outfile[:-len(".load")] dbhandle = connect() statement = ''' CREATE TABLE %(tablename)s AS SELECT DISTINCT track, gene_id, COUNT(*) AS ntranscripts, MIN(e.nalleles) AS min_nalleles, MAX(e.nalleles) AS max_nalleles, MIN(e.stop_min) AS min_stop_min, MAX(e.stop_min) AS max_stop_min, MIN(e.stop_max) AS min_stop_max, MAX(e.stop_max) AS max_stop_max, SUM( CASE WHEN stop_min > 0 AND cds_len - stop_min * 3 < last_exon_start THEN 1 ELSE 0 END) AS nmd_knockout, SUM( CASE WHEN stop_max > 0 AND cds_len - stop_max * 3 < last_exon_start THEN 1 ELSE 0 END) AS nmd_affected FROM annotations.transcript_info as i, effects AS e WHERE i.transcript_id = e.transcript_id GROUP BY i.gene_id, track ''' % locals() Database.executewait( dbhandle, "DROP TABLE IF EXISTS %(tablename)s" % locals() ) Database.executewait( dbhandle, statement ) Database.executewait( dbhandle, "CREATE INDEX %(tablename)s_gene_id ON %(tablename)s (gene_id)" % locals()) dbhandle.commit() P.touch(outfile)
def loadCodingPotential( infile, outfile ): '''load annotations''' table = P.toTable( outfile ) statement = ''' gunzip < %(infile)s | python %(scriptsdir)s/csv2db.py %(csv2db_options)s --allow-empty --index=gene_id --map=gene_id:str --table=%(table)s > %(outfile)s''' P.run() # set the is_coding flag dbhandle = sqlite3.connect( PARAMS["database"] ) Database.executewait( dbhandle, '''ALTER TABLE %(table)s ADD COLUMN is_coding INTEGER''' % locals()) Database.executewait( dbhandle, '''UPDATE %(table)s SET is_coding = (result == 'coding')''' % locals()) dbhandle.commit()
def DumpGOFromDatabase(outfile, dbhandle, options): """read go assignments from database. and dump them into a flatfile. (one to many mapping of genes to GO categories) and a dictionary of go-term to go information """ E.info("category\ttotal\tgenes\tcategories") all_genes = collections.defaultdict(int) all_categories = collections.defaultdict(int) all_ntotal = 0 outfile.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n") for go_type in options.ontology: genes = collections.defaultdict(int) categories = collections.defaultdict(int) ntotal = 0 statement = GetGOStatement(go_type, options.database_name, options.species) results = Database.executewait( dbhandle, statement, retries=0).fetchall() for result in results: outfile.write("\t".join(map(str, (go_type,) + result)) + "\n") gene_id, goid, description, evidence = result genes[gene_id] += 1 categories[goid] += 1 ntotal += 1 all_genes[gene_id] += 1 all_categories[goid] += 1 all_ntotal += 1 E.info("%s\t%i\t%i\t%i" % (go_type, ntotal, len(genes), len(categories))) E.info("%s\t%i\t%i\t%i" % ("all", all_ntotal, len(all_genes), len(all_categories))) return
def DumpGOFromDatabase(outfile, dbhandle, options): """read go assignments from database. and dump them into a flatfile. (one to many mapping of genes to GO categories) and a dictionary of go-term to go information """ E.info("category\ttotal\tgenes\tcategories") all_genes = collections.defaultdict(int) all_categories = collections.defaultdict(int) all_ntotal = 0 outfile.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n") for go_type in options.ontology: genes = collections.defaultdict(int) categories = collections.defaultdict(int) ntotal = 0 statement = GetGOStatement(go_type, options.database_name, options.species) results = Database.executewait(dbhandle, statement, retries=0).fetchall() for result in results: outfile.write("\t".join(map(str, (go_type, ) + result)) + "\n") gene_id, goid, description, evidence = result genes[gene_id] += 1 categories[goid] += 1 ntotal += 1 all_genes[gene_id] += 1 all_categories[goid] += 1 all_ntotal += 1 E.info("%s\t%i\t%i\t%i" % (go_type, ntotal, len(genes), len(categories))) E.info("%s\t%i\t%i\t%i" % ("all", all_ntotal, len(all_genes), len(all_categories))) return
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: analyze_go.py 309 2005-12-01 15:50:26Z andreas $") dbhandle = Database.Database() parser.add_option("-s", "--species", dest="species", type="string", help="species to use.") parser.add_option("-p", "--prefix", dest="prefix", type="string", help="prefix to use for temporary files.") parser.set_defaults(species="dmelanogaster") parser.set_defaults(database="ensembl_mart_31") parser.set_defaults(prefix="dm_go_") (options, args) = E.Start(parser, add_mysql_options=True) dbhandle.Connect(options) WriteBackground("biol_process", options, "bp") WriteBackground("cell_location", options, "lm") WriteBackground("mol_function", options, "fm")
def summarizeEffectsPerGene(infile, outfile): '''summarize effects on a per-gene level.''' tablename = outfile[:-len(".load")] track = infile[:-len("_effects.load")] dbhandle = connect() statement = ''' CREATE TABLE %(tablename)s AS SELECT DISTINCT gene_id, COUNT(*) AS ntranscripts, MIN(e.nalleles) AS min_nalleles, MAX(e.nalleles) AS max_nalleles, MIN(e.stop_min) AS min_stop_min, MAX(e.stop_min) AS max_stop_min, MIN(e.stop_max) AS min_stop_max, MAX(e.stop_max) AS max_stop_max, SUM( CASE WHEN stop_min > 0 AND cds_len - stop_min * 3 < last_exon_start THEN 1 ELSE 0 END) AS nmd_knockout, SUM( CASE WHEN stop_max > 0 AND cds_len - stop_max * 3 < last_exon_start THEN 1 ELSE 0 END) AS nmd_affected FROM annotations.transcript_info as i, %(track)s_effects AS e WHERE i.transcript_id = e.transcript_id GROUP BY i.gene_id ''' % locals() Database.executewait(dbhandle, "DROP TABLE IF EXISTS %(tablename)s" % locals()) Database.executewait(dbhandle, statement) Database.executewait( dbhandle, "CREATE INDEX %(tablename)s_gene_id ON %(tablename)s (gene_id)" % locals()) dbhandle.commit() P.touch(outfile)
def buildExpressionStats( dbhandle, outfile, tablenames, outdir, regex_table="(?P<design>[^_]+)_" "(?P<geneset>[^_]+)_" "(?P<counting_method>[^_]+)_" "(?P<method>[^_]+)_" "(?P<level>[^_]+)_diff"): """compile expression summary statistics from database. This method outputs a table with the number of genes tested, failed, differentially expressed, etc. for a series of DE tests. Arguments --------- dbhandle : object Database handle. tables : list List of tables to process. outfile : string Output filename in :term:`tsv` format. outdir : string Output directory for diagnostic plots. regex : string Regular expression to extract experimental information from table name. """ keys_status = "OK", "NOTEST", "FAIL", "NOCALL" outf = IOTools.openFile(outfile, "w") outf.write("\t".join( ("design", "geneset", "level", "counting_method", "treatment_name", "control_name", "tested", "\t".join(["status_%s" % x for x in keys_status]), "significant", "twofold")) + "\n") for tablename in tablenames: r = re.search(regex_table, tablename) if r is None: raise ValueError( "can't match tablename '%s' to regex" % tablename) geneset = r.group("geneset") design = r.group("design") level = r.group("level") counting_method = r.group("counting_method") geneset = r.group("geneset") def toDict(vals, l=2): return collections.defaultdict( int, [(tuple(x[:l]), x[l]) for x in vals]) tested = toDict(Database.executewait( dbhandle, "SELECT treatment_name, control_name, " "COUNT(*) FROM %(tablename)s " "GROUP BY treatment_name,control_name" % locals() ).fetchall()) status = toDict(Database.executewait( dbhandle, "SELECT treatment_name, control_name, status, " "COUNT(*) FROM %(tablename)s " "GROUP BY treatment_name,control_name,status" % locals()).fetchall(), 3) signif = toDict(Database.executewait( dbhandle, "SELECT treatment_name, control_name, " "COUNT(*) FROM %(tablename)s " "WHERE significant " "GROUP BY treatment_name,control_name" % locals() ).fetchall()) fold2 = toDict(Database.executewait( dbhandle, "SELECT treatment_name, control_name, " "COUNT(*) FROM %(tablename)s " "WHERE (l2fold >= 1 or l2fold <= -1) AND significant " "GROUP BY treatment_name,control_name,significant" % locals()).fetchall()) for treatment_name, control_name in tested.keys(): outf.write("\t".join(map(str, ( design, geneset, level, counting_method, treatment_name, control_name, tested[(treatment_name, control_name)], "\t".join( [str(status[(treatment_name, control_name, x)]) for x in keys_status]), signif[(treatment_name, control_name)], fold2[(treatment_name, control_name)]))) + "\n") # plot length versus P-Value data = Database.executewait( dbhandle, "SELECT i.sum, pvalue " "FROM %(tablename)s, " "%(geneset)s_geneinfo as i " "WHERE i.gene_id = test_id AND " "significant" % locals()).fetchall() # require at least 10 datapoints - otherwise smooth scatter fails if len(data) > 10: data = zip(*data) pngfile = ("%(outdir)s/%(design)s_%(geneset)s_%(level)s" "_pvalue_vs_length.png") % locals() R.png(pngfile) R.smoothScatter(R.log10(ro.FloatVector(data[0])), R.log10(ro.FloatVector(data[1])), xlab='log10( length )', ylab='log10( pvalue )', log="x", pch=20, cex=.1) R['dev.off']() outf.close()
def loadCuffdiff(dbhandle, infile, outfile, min_fpkm=1.0): '''load results from cuffdiff analysis to database This functions parses and loads the results of a cuffdiff differential expression analysis. Parsing is performed by the parseCuffdiff function. Multiple tables will be created as cuffdiff outputs information on gene, isoform, tss, etc. levels. The method converts from ln(fold change) to log2 fold change. Pairwise comparisons in which one gene is not expressed (fpkm < `min_fpkm`) are set to status 'NOCALL'. These transcripts might nevertheless be significant. Arguments --------- dbhandle : object Database handle. infile : string Input filename, output from cuffdiff outfile : string Output filename in :term:`tsv` format. min_fpkm : float Minimum fpkm. Genes with an fpkm lower than this will be set to status `NOCALL`. ''' prefix = P.toTable(outfile) indir = infile + ".dir" if not os.path.exists(indir): P.touch(outfile) return # E.info( "building cummeRbund database" ) # R('''library(cummeRbund)''') # cuff = R('''readCufflinks(dir = %(indir)s, dbfile=%(indir)s/csvdb)''' ) # to be continued... tmpname = P.getTempFilename(shared=True) # ignore promoters and splicing - no fold change column, but sqrt(JS) for fn, level in (("cds_exp.diff.gz", "cds"), ("gene_exp.diff.gz", "gene"), ("isoform_exp.diff.gz", "isoform"), # ("promoters.diff.gz", "promotor"), # ("splicing.diff.gz", "splice"), ("tss_group_exp.diff.gz", "tss")): tablename = prefix + "_" + level + "_diff" infile = os.path.join(indir, fn) results = parseCuffdiff(infile, min_fpkm=min_fpkm) Expression.writeExpressionResults(tmpname, results) P.load(tmpname, outfile, tablename=tablename, options="--allow-empty-file " "--add-index=treatment_name " "--add-index=control_name " "--add-index=test_id") for fn, level in (("cds.fpkm_tracking.gz", "cds"), ("genes.fpkm_tracking.gz", "gene"), ("isoforms.fpkm_tracking.gz", "isoform"), ("tss_groups.fpkm_tracking.gz", "tss")): tablename = prefix + "_" + level + "_levels" infile = os.path.join(indir, fn) P.load(infile, outfile, tablename=tablename, options="--allow-empty-file " "--add-index=tracking_id " "--add-index=control_name " "--add-index=test_id") # Jethro - load tables of sample specific cuffdiff fpkm values into csvdb # IMS: First read in lookup table for CuffDiff/Pipeline sample name # conversion inf = IOTools.openFile(os.path.join(indir, "read_groups.info.gz")) inf.readline() sample_lookup = {} for line in inf: line = line.split("\t") our_sample_name = IOTools.snip(line[0]) our_sample_name = re.sub("-", "_", our_sample_name) cuffdiff_sample_name = "%s_%s" % (line[1], line[2]) sample_lookup[cuffdiff_sample_name] = our_sample_name inf.close() for fn, level in (("cds.read_group_tracking.gz", "cds"), ("genes.read_group_tracking.gz", "gene"), ("isoforms.read_group_tracking.gz", "isoform"), ("tss_groups.read_group_tracking.gz", "tss")): tablename = prefix + "_" + level + "sample_fpkms" tmpf = P.getTempFilename(".") inf = IOTools.openFile(os.path.join(indir, fn)).readlines() outf = IOTools.openFile(tmpf, "w") samples = [] genes = {} is_first = True for line in inf: if is_first: is_first = False continue line = line.split() gene_id = line[0] condition = line[1] replicate = line[2] fpkm = line[6] status = line[8] sample_id = condition + "_" + replicate if sample_id not in samples: samples.append(sample_id) # IMS: The following block keeps getting its indenting messed # up. 
It is not part of the 'if sample_id not in samples' block # please make sure it does not get made part of it if gene_id not in genes: genes[gene_id] = {} genes[gene_id][sample_id] = fpkm else: if sample_id in genes[gene_id]: raise ValueError( 'sample_id %s appears twice in file for gene_id %s' % (sample_id, gene_id)) else: if status != "OK": genes[gene_id][sample_id] = status else: genes[gene_id][sample_id] = fpkm samples = sorted(samples) # IMS - CDS files might be empty if not cds has been # calculated for the genes in the long term need to add CDS # annotation to denovo predicted genesets in meantime just # skip if cds tracking file is empty if len(samples) == 0: continue headers = "gene_id\t" + "\t".join([sample_lookup[x] for x in samples]) outf.write(headers + "\n") for gene in genes.iterkeys(): outf.write(gene + "\t") s = 0 while x < len(samples) - 1: outf.write(genes[gene][samples[s]] + "\t") s += 1 # IMS: Please be careful with this line. It keeps getting moved # into the above while block where it does not belong outf.write(genes[gene][samples[len(samples) - 1]] + "\n") outf.close() P.load(tmpf, outfile, tablename=tablename, options="--allow-empty-file " " --add-index=gene_id") os.unlink(tmpf) # build convenience table with tracks tablename = prefix + "_isoform_levels" tracks = Database.getColumnNames(dbhandle, tablename) tracks = [x[:-len("_FPKM")] for x in tracks if x.endswith("_FPKM")] tmpfile = P.getTempFile(dir=".") tmpfile.write("track\n") tmpfile.write("\n".join(tracks) + "\n") tmpfile.close() P.load(tmpfile.name, outfile) os.unlink(tmpfile.name)
def createView(dbhandle, tables, tablename, outfile, view_type="TABLE", ignore_duplicates=True): '''create a database view for a list of tables. This method performs a join across multiple tables and stores the result either as a view or a table in the database. Arguments --------- dbhandle : A database handle. tables : list of tuples Tables to merge. Each tuple contains the name of a table and the field to join with the first table. For example:: tables = ( "reads_summary", "track", "bam_stats", "track", "context_stats", "track", "picard_stats_alignment_summary_metrics", "track") tablename : string Name of the view or table to be created. outfile : string Output filename for status information. view_type : string Type of view, either ``VIEW`` or ``TABLE``. If a view is to be created across multiple databases, use ``TABLE``. ignore_duplicates : bool If set to False, duplicate column names will be added with the tablename as prefix. The default is to ignore. ''' Database.executewait( dbhandle, "DROP %(view_type)s IF EXISTS %(tablename)s" % locals()) tracks, columns = [], [] tablenames = [x[0] for x in tables] for table, track in tables: d = Database.executewait( dbhandle, "SELECT COUNT(DISTINCT %s) FROM %s" % (track, table)) tracks.append(d.fetchone()[0]) columns.append( [x.lower() for x in Database.getColumnNames(dbhandle, table) if x != track]) E.info("creating %s from the following tables: %s" % (tablename, str(list(zip(tablenames, tracks))))) if min(tracks) != max(tracks): raise ValueError( "number of rows not identical - will not create view") from_statement = " , ".join( ["%s as t%i" % (y[0], x) for x, y in enumerate(tables)]) f = tables[0][1] where_statement = " AND ".join( ["t0.%s = t%i.%s" % (f, x + 1, y[1]) for x, y in enumerate(tables[1:])]) all_columns, taken = [], set() for x, c in enumerate(columns): i = set(taken).intersection(set(c)) if i: E.warn("duplicate column names: %s " % i) if not ignore_duplicates: table = tables[x][0] all_columns.extend( ["t%i.%s AS %s_%s" % (x, y, table, y) for y in i]) c = [y for y in c if y not in i] all_columns.extend(["t%i.%s" % (x, y) for y in c]) taken.update(set(c)) all_columns = ",".join(all_columns) statement = ''' CREATE %(view_type)s %(tablename)s AS SELECT t0.track, %(all_columns)s FROM %(from_statement)s WHERE %(where_statement)s ''' % locals() Database.executewait(dbhandle, statement) nrows = Database.executewait( dbhandle, "SELECT COUNT(*) FROM view_mapping").fetchone()[0] if nrows == 0: raise ValueError( "empty view mapping, check statement = %s" % (statement % locals())) if nrows != min(tracks): E.warn("view creates duplicate rows, got %i, expected %i" % (nrows, min(tracks))) E.info("created view_mapping with %i rows" % nrows) touchFile(outfile)
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-b", "--mappability-file", dest="mappability", type="string", help="Bigwig file with mappability") parser.add_option("-d", "--database", dest="db", type="string", help="Database containing intron chunck table") parser.add_option("-l", "--read-length", dest="rlen", type="int", default=50, help="Read length") parser.add_option("-o", "--overlap-length", dest="olen", type="int", default=10, help="Min overlap before read is counted") parser.add_option("-M", "--multimap", dest="mm", action="store_true", default=False, help="Allow multimapping") # add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv) introns = Database.fetch( """SELECT gene_id, exon_id FROM reference_chunks_introns WHERE intron>0""", Database.connect(options.db)) introns = [tuple(x) for x in introns] mappability = pyBigWig.open(options.mappability) options.stdout.write("\t".join(["gene_id", "exon_id", "efflen"]) + "\n") for exon in GTF.iterator(options.stdin): if (unicode(exon.gene_id), int(exon.exon_id)) not in introns: continue vals = mappability.values( exon.contig, int(exon.start) - (options.rlen - options.olen), int(exon.end) - options.olen) if options.mm: eff_len = sum(vals) else: eff_len = int(exon.end) - int(exon.start) \ + options.rlen \ - 2 * options.olen \ - len([x for x in vals if x < 1]) options.stdout.write( "\t".join([exon.gene_id, exon.exon_id, str(eff_len)]) + "\n") # write footer and output benchmark information. E.Stop()
def generatePeakSets(infile, outfiles): outf_con, outf_opt = outfiles # retrieve maximum number of peaks obtained from inter-replicate IDR # (table created by loadNPeaksForIndividualReplicates) statement = ("SELECT" " Experiment," " max(n_peaks) AS nPeaks" " FROM individual_replicates_nPeaks" " GROUP BY experiment") df = Database.fetch_DataFrame(statement) # reassign experiment as index df = df.set_index("Experiment") # retrieve number of peaks obtained from pooled_pseudoreplicate IDR # (table created by loadNPeaksForPooledPseudoreplicates) statement = ("SELECT" " Experiment," " n_peaks AS nPeaks" " FROM pooled_pseudoreplicates_nPeaks") df2 = Database.fetch_DataFrame(statement) # reassign experiment as index df2 = df2.set_index("Experiment") # split the infile name to obtain experiment sample_id = os.path.basename(infile).split("_VS_")[0] sample = sample_id.split("-") experiment = "_".join([sample[0], sample[1]]) # retrieve max_numPeaks for experiment nPeaks = int(df.loc[experiment]) # retrieve numPeaks_Rep0 for experiment nPeaks_rep0 = int(df2.loc[experiment]) # retrieve maximumn of the two nPeaks_max = max(nPeaks, nPeaks_rep0) # establish which column to sort by if PARAMS["idr_options_ranking_measure"] == "signal.value": sort_statement = "sort -k7nr,7nr" elif PARAMS["idr_options_ranking_measure"] == "p.value": sort_statement = "sort -k8nr,8nr" elif PARAMS["idr_options_ranking_measure"] == "q.value": sort_statement = "sort -k9nr,9nr" else: raise ValueError("Unrecognised ranking_measure" " %s don't know which column" " to sort on" % PARAMS["idr_options_ranking_measure"]) # sort infile by column and write top nPeaks to outfile (conservative) ignore_pipe_errors = True statement = ("zcat %(infile)s |" " %(sort_statement)s |" " head -%(nPeaks)s |" " gzip > %(outf_con)s") P.run() # sort infile by column and write top nPeaks_max to outfile (optimum) ignore_pipe_errors = True statement = ("zcat %(infile)s |" " %(sort_statement)s |" " head -%(nPeaks_max)s |" " gzip > %(outf_opt)s") P.run()
def createView(dbhandle, tables, tablename, outfile, view_type="TABLE", ignore_duplicates=True): '''create a database view for a list of tables. This method performs a join across multiple tables and stores the result either as a view or a table in the database. Arguments --------- dbhandle : A database handle. tables : list of tuples Tables to merge. Each tuple contains the name of a table and the field to join with the first table. For example:: tables = ( "reads_summary", "track", "bam_stats", "track", "context_stats", "track", "picard_stats_alignment_summary_metrics", "track") tablename : string Name of the view or table to be created. outfile : string Output filename for status information. view_type : string Type of view, either ``VIEW`` or ``TABLE``. If a view is to be created across multiple databases, use ``TABLE``. ignore_duplicates : bool If set to False, duplicate column names will be added with the tablename as prefix. The default is to ignore. ''' Database.executewait( dbhandle, "DROP %(view_type)s IF EXISTS %(tablename)s" % locals()) tracks, columns = [], [] tablenames = [x[0] for x in tables] for table, track in tables: d = Database.executewait( dbhandle, "SELECT COUNT(DISTINCT %s) FROM %s" % (track, table)) tracks.append(d.fetchone()[0]) columns.append([ x.lower() for x in Database.getColumnNames(dbhandle, table) if x != track ]) E.info("creating %s from the following tables: %s" % (tablename, str(list(zip(tablenames, tracks))))) if min(tracks) != max(tracks): raise ValueError("number of rows not identical - will not create view") from_statement = " , ".join( ["%s as t%i" % (y[0], x) for x, y in enumerate(tables)]) f = tables[0][1] where_statement = " AND ".join([ "t0.%s = t%i.%s" % (f, x + 1, y[1]) for x, y in enumerate(tables[1:]) ]) all_columns, taken = [], set() for x, c in enumerate(columns): i = set(taken).intersection(set(c)) if i: E.warn("duplicate column names: %s " % i) if not ignore_duplicates: table = tables[x][0] all_columns.extend( ["t%i.%s AS %s_%s" % (x, y, table, y) for y in i]) c = [y for y in c if y not in i] all_columns.extend(["t%i.%s" % (x, y) for y in c]) taken.update(set(c)) all_columns = ",".join(all_columns) statement = ''' CREATE %(view_type)s %(tablename)s AS SELECT t0.track, %(all_columns)s FROM %(from_statement)s WHERE %(where_statement)s ''' % locals() Database.executewait(dbhandle, statement) nrows = Database.executewait( dbhandle, "SELECT COUNT(*) FROM view_mapping").fetchone()[0] if nrows == 0: raise ValueError("empty view mapping, check statement = %s" % (statement % locals())) if nrows != min(tracks): E.warn("view creates duplicate rows, got %i, expected %i" % (nrows, min(tracks))) E.info("created view_mapping with %i rows" % nrows) touchFile(outfile)
def qcSummary(infiles, outfile): '''create a summary table of relevant QC metrics''' # Some QC metrics are specific to paired end data if PAIRED: exclude = [] paired_columns = '''READ_PAIRS_EXAMINED as no_pairs, PERCENT_DUPLICATION as pct_duplication, ESTIMATED_LIBRARY_SIZE as library_size, PCT_READS_ALIGNED_IN_PAIRS as pct_reads_aligned_in_pairs, MEDIAN_INSERT_SIZE as median_insert_size, ''' pcat = "PAIR" else: exclude = ["qc_library_complexity", "qc_insert_size_metrics"] paired_columns = '' pcat = "UNPAIRED" tables = [P.toTable(x) for x in infiles if P.toTable(x) not in exclude] t1 = tables[0] name_fields = PARAMS["name_field_titles"].strip() stat_start = '''select distinct %(name_fields)s, sample_information.sample_id, fraction_spliced, fraction_spike, qc_no_genes_cufflinks.protein_coding as cufflinks_no_genes_pc, qc_no_genes_cufflinks.total as cufflinks_no_genes, qc_no_genes_featurecounts.protein_coding as featurecounts_no_genes_pc, qc_no_genes_featurecounts.total as featurecounts_no_genes, three_prime_bias as three_prime_bias, nreads_uniq_map_genome, nreads_uniq_map_spike, %(paired_columns)s PCT_MRNA_BASES as pct_mrna, PCT_CODING_BASES as pct_coding, PCT_PF_READS_ALIGNED as pct_reads_aligned, TOTAL_READS as total_reads, PCT_ADAPTER as pct_adapter, PF_HQ_ALIGNED_READS*1.0/PF_READS as pct_pf_reads_aligned_hq from %(t1)s ''' % locals() join_stat = "" for table in tables[1:]: join_stat += "left join " + table + "\n" join_stat += "on " + t1 + ".sample_id=" + table + ".sample_id\n" where_stat = '''where qc_alignment_summary_metrics.CATEGORY="%(pcat)s" ''' % locals() statement = "\n".join([stat_start, join_stat, where_stat]) df = DB.fetch_DataFrame(statement, PARAMS["database_name"]) df.to_csv(outfile, sep="\t", index=False)
def loadHypergeometricAnalysis(infile, outfile): '''load GO results.''' track = P.toTable(outfile) tablename = 'hypergeometric_%s_summary' % track P.load(infile, outfile, tablename=tablename) dbh = connect() ontologies = [ x[0] for x in Database.executewait( dbh, '''SELECT DISTINCT ontology FROM %s''' % tablename).fetchall() ] genelists = [ x[0] for x in Database.executewait( dbh, '''SELECT DISTINCT genelist FROM %s''' % tablename).fetchall() ] # output files from runGO.py sections = ('results', 'parameters', 'withgenes') for section in sections: tablename = 'hypergeometric_%s_%s' % (track, section) statement = ''' python %(scriptsdir)s/combine_tables.py --cat=track --regex-filename="hypergeometric.dir/%(track)s.tsv.dir/(\S+).%(section)s" hypergeometric.dir/%(track)s.tsv.dir/*.%(section)s | python %(scriptsdir)s/csv2db.py %(csv2db_options)s --table=%(tablename)s >> %(outfile)s''' P.run() for ontology in ontologies: fn = os.path.join(infile + ".dir", "all_alldesc.%s.l2fold" % ontology) if not os.path.exists(fn): E.warn("file %s does not exist" % fn) continue P.load(fn, outfile, tablename='hypergeometric_%s_%s_l2fold' % (track, ontology), options='--allow-empty') fn = os.path.join(infile + ".dir", "all_alldesc.%s.l10pvalue" % ontology) P.load(fn, outfile, tablename='hypergeometric_%s_%s_l10pvalue' % (track, ontology), options='--allow-empty') fn = os.path.join(infile + ".dir", "all_alldesc.%s.l10qvalue" % ontology) P.load(fn, outfile, tablename='hypergeometric_%s_%s_l10qvalue' % (track, ontology), options='--allow-empty')
def loadSummary( infile, outfile ): '''load several rates into a single convenience table. ''' stmt_select = [] stmt_from = [] stmt_where = ["1"] track = infile[:-len(".gtf.gz")] tablename = "%s_evol" % track if os.path.exists( "%s_rates.load" % track ): stmt_select.append( "a.distance AS ks, a.aligned AS aligned" ) stmt_from.append('''LEFT JOIN %(track)s_rates AS a ON r.gene_id = a.gene_id AND a.aligned >= %(rates_min_aligned)i AND a.distance <= %(rates_max_rate)f''' ) if os.path.exists( "%s_coverage.load" % track ): stmt_select.append("cov.nmatches AS nreads, cov.mean AS meancoverage" ) stmt_from.append("LEFT JOIN %(track)s_coverage AS cov ON r.gene_id = cov.gene_id" ) if os.path.exists( "%s_repeats_gc.load" % track ): stmt_select.append("ar_gc.exons_mean AS repeats_gc" ) stmt_from.append("LEFT JOIN %(track)s_repeats_gc AS ar_gc ON r.gene_id = ar_gc.gene_id" ) if os.path.exists( "%s_repeats_rates.load" % track ): stmt_select.append("ar.exons_length AS ar_aligned, ar.exons_median AS ka, a.distance/ar.exons_median AS kska" ) stmt_from.append('''LEFT JOIN %(track)s_repeats_rates AS ar ON r.gene_id = ar.gene_id AND ar.exons_nval >= %(rates_min_repeats)i''' ) if os.path.exists( "%s_introns_rates.load" % track ): stmt_select.append("ir.aligned AS ir_aligned, ir.distance AS ki, a.distance/ir.distance AS kski" ) stmt_from.append('''LEFT JOIN %(track)s_introns_rates AS ir ON r.gene_id = ir.gene_id AND ir.aligned >= %(rates_min_aligned)i''' ) x = locals() x.update( PARAMS ) stmt_select = ", ".join( stmt_select ) % x stmt_from = " ".join( stmt_from ) % x stmt_where = " AND ".join( stmt_where ) % x dbhandle = sqlite3.connect( PARAMS["database"] ) Database.executewait( dbhandle, "DROP TABLE IF EXISTS %(tablename)s " % locals() ) statement = ''' CREATE TABLE %(tablename)s AS SELECT CAST(r.gene_id AS TEXT) AS gene_id, r.exons_sum as length, r.exons_pGC as pgc, %(stmt_select)s FROM %(track)s_annotation AS r %(stmt_from)s WHERE %(stmt_where)s ''' % locals() Database.executewait( dbhandle, statement) dbhandle.commit() P.touch(outfile)
def buildExpressionStats(tables, method, outfile, outdir): '''build expression summary statistics. Creates also diagnostic plots in <exportdir>/<method> directory. ''' dbhandle = sqlite3.connect(PARAMS["database"]) def _split(tablename): # this would be much easier, if feature_counts/gene_counts/etc. # would not contain an underscore. try: design, geneset, counting_method = re.match( "([^_]+)_vs_([^_]+)_(.*)_%s" % method, tablename).groups() except AttributeError: try: design, geneset = re.match( "([^_]+)_([^_]+)_%s" % method, tablename).groups() counting_method = "na" except AttributeError: raise ValueError("can't parse tablename %s" % tablename) return design, geneset, counting_method # return re.match("([^_]+)_", tablename ).groups()[0] keys_status = "OK", "NOTEST", "FAIL", "NOCALL" outf = IOTools.openFile(outfile, "w") outf.write("\t".join( ("design", "geneset", "level", "treatment_name", "counting_method", "control_name", "tested", "\t".join(["status_%s" % x for x in keys_status]), "significant", "twofold")) + "\n") all_tables = set(Database.getTables(dbhandle)) for level in CUFFDIFF_LEVELS: for tablename in tables: tablename_diff = "%s_%s_diff" % (tablename, level) tablename_levels = "%s_%s_diff" % (tablename, level) design, geneset, counting_method = _split(tablename_diff) if tablename_diff not in all_tables: continue def toDict(vals, l=2): return collections.defaultdict( int, [(tuple(x[:l]), x[l]) for x in vals]) tested = toDict( Database.executewait( dbhandle, "SELECT treatment_name, control_name, " "COUNT(*) FROM %(tablename_diff)s " "GROUP BY treatment_name,control_name" % locals() ).fetchall()) status = toDict(Database.executewait( dbhandle, "SELECT treatment_name, control_name, status, " "COUNT(*) FROM %(tablename_diff)s " "GROUP BY treatment_name,control_name,status" % locals()).fetchall(), 3) signif = toDict(Database.executewait( dbhandle, "SELECT treatment_name, control_name, " "COUNT(*) FROM %(tablename_diff)s " "WHERE significant " "GROUP BY treatment_name,control_name" % locals() ).fetchall()) fold2 = toDict(Database.executewait( dbhandle, "SELECT treatment_name, control_name, " "COUNT(*) FROM %(tablename_diff)s " "WHERE (l2fold >= 1 or l2fold <= -1) AND significant " "GROUP BY treatment_name,control_name,significant" % locals()).fetchall()) for treatment_name, control_name in tested.keys(): outf.write("\t".join(map(str, ( design, geneset, level, counting_method, treatment_name, control_name, tested[(treatment_name, control_name)], "\t".join( [str(status[(treatment_name, control_name, x)]) for x in keys_status]), signif[(treatment_name, control_name)], fold2[(treatment_name, control_name)]))) + "\n") ########################################### ########################################### ########################################### # plot length versus P-Value data = Database.executewait( dbhandle, "SELECT i.sum, pvalue " "FROM %(tablename_diff)s, " "%(geneset)s_geneinfo as i " "WHERE i.gene_id = test_id AND " "significant" % locals()).fetchall() # require at least 10 datapoints - otherwise smooth scatter fails if len(data) > 10: data = zip(*data) pngfile = "%(outdir)s/%(design)s_%(geneset)s_%(level)s_pvalue_vs_length.png" % locals() R.png(pngfile) R.smoothScatter(R.log10(ro.FloatVector(data[0])), R.log10(ro.FloatVector(data[1])), xlab='log10( length )', ylab='log10( pvalue )', log="x", pch=20, cex=.1) R['dev.off']() outf.close()
def loadSummary(infile, outfile): '''load several rates into a single convenience table. ''' stmt_select = [] stmt_from = [] stmt_where = ["1"] track = infile[:-len(".gtf.gz")] tablename = "%s_evol" % track if os.path.exists("%s_rates.load" % track): stmt_select.append("a.distance AS ks, a.aligned AS aligned") stmt_from.append('''LEFT JOIN %(track)s_rates AS a ON r.gene_id = a.gene_id AND a.aligned >= %(rates_min_aligned)i AND a.distance <= %(rates_max_rate)f''') if os.path.exists("%s_coverage.load" % track): stmt_select.append("cov.nmatches AS nreads, cov.mean AS meancoverage") stmt_from.append( "LEFT JOIN %(track)s_coverage AS cov ON r.gene_id = cov.gene_id") if os.path.exists("%s_repeats_gc.load" % track): stmt_select.append("ar_gc.exons_mean AS repeats_gc") stmt_from.append( "LEFT JOIN %(track)s_repeats_gc AS ar_gc ON r.gene_id = ar_gc.gene_id" ) if os.path.exists("%s_repeats_rates.load" % track): stmt_select.append( "ar.exons_length AS ar_aligned, ar.exons_median AS ka, a.distance/ar.exons_median AS kska" ) stmt_from.append('''LEFT JOIN %(track)s_repeats_rates AS ar ON r.gene_id = ar.gene_id AND ar.exons_nval >= %(rates_min_repeats)i''') if os.path.exists("%s_introns_rates.load" % track): stmt_select.append( "ir.aligned AS ir_aligned, ir.distance AS ki, a.distance/ir.distance AS kski" ) stmt_from.append('''LEFT JOIN %(track)s_introns_rates AS ir ON r.gene_id = ir.gene_id AND ir.aligned >= %(rates_min_aligned)i''') x = locals() x.update(PARAMS) stmt_select = ", ".join(stmt_select) % x stmt_from = " ".join(stmt_from) % x stmt_where = " AND ".join(stmt_where) % x dbhandle = sqlite3.connect(PARAMS["database"]) Database.executewait(dbhandle, "DROP TABLE IF EXISTS %(tablename)s " % locals()) statement = ''' CREATE TABLE %(tablename)s AS SELECT CAST(r.gene_id AS TEXT) AS gene_id, r.exons_sum as length, r.exons_pGC as pgc, %(stmt_select)s FROM %(track)s_annotation AS r %(stmt_from)s WHERE %(stmt_where)s ''' % locals() Database.executewait(dbhandle, statement) dbhandle.commit() P.touch(outfile)
def loadCuffdiff(infile, outfile, min_fpkm=1.0): '''load results from differential expression analysis and produce summary plots. Note: converts from ln(fold change) to log2 fold change. The cuffdiff output is parsed. Pairwise comparisons in which one gene is not expressed (fpkm < fpkm_silent) are set to status 'NOCALL'. These transcripts might nevertheless be significant. This requires the cummeRbund library to be present in R. ''' prefix = P.toTable(outfile) indir = infile + ".dir" if not os.path.exists(indir): P.touch(outfile) return # E.info( "building cummeRbund database" ) # R('''library(cummeRbund)''') # cuff = R('''readCufflinks(dir = %(indir)s, dbfile=%(indir)s/csvdb)''' ) # to be continued dbhandle = sqlite3.connect(PARAMS["database"]) tmpname = P.getTempFilename(".") # ignore promoters and splicing - no fold change column, but sqrt(JS) for fn, level in (("cds_exp.diff.gz", "cds"), ("gene_exp.diff.gz", "gene"), ("isoform_exp.diff.gz", "isoform"), # ("promoters.diff.gz", "promotor"), # ("splicing.diff.gz", "splice"), ("tss_group_exp.diff.gz", "tss")): tablename = prefix + "_" + level + "_diff" infile = os.path.join(indir, fn) results = parseCuffdiff(infile, min_fpkm=min_fpkm) Expression.writeExpressionResults(tmpname, results) statement = '''cat %(tmpname)s | python %(scriptsdir)s/csv2db.py %(csv2db_options)s --allow-empty-file --add-index=treatment_name --add-index=control_name --add-index=test_id --table=%(tablename)s >> %(outfile)s.log ''' P.run() for fn, level in (("cds.fpkm_tracking.gz", "cds"), ("genes.fpkm_tracking.gz", "gene"), ("isoforms.fpkm_tracking.gz", "isoform"), ("tss_groups.fpkm_tracking.gz", "tss")): tablename = prefix + "_" + level + "_levels" statement = '''zcat %(indir)s/%(fn)s | python %(scriptsdir)s/csv2db.py %(csv2db_options)s --allow-empty-file --add-index=tracking_id --table=%(tablename)s >> %(outfile)s.log ''' P.run() # Jethro - load tables of sample specific cuffdiff fpkm values into csvdb # IMS: First read in lookup table for CuffDiff/Pipeline sample name # conversion inf = IOTools.openFile(os.path.join(indir, "read_groups.info.gz")) inf.readline() sample_lookup = {} for line in inf: line = line.split("\t") our_sample_name = IOTools.snip(line[0]) our_sample_name = re.sub("-", "_", our_sample_name) cuffdiff_sample_name = "%s_%s" % (line[1], line[2]) sample_lookup[cuffdiff_sample_name] = our_sample_name inf.close() for fn, level in (("cds.read_group_tracking.gz", "cds"), ("genes.read_group_tracking.gz", "gene"), ("isoforms.read_group_tracking.gz", "isoform"), ("tss_groups.read_group_tracking.gz", "tss")): tablename = prefix + "_" + level + "sample_fpkms" tmpf = P.getTempFilename(".") inf = IOTools.openFile(os.path.join(indir, fn)).readlines() outf = IOTools.openFile(tmpf, "w") samples = [] genes = {} x = 0 for line in inf: if x == 0: x += 1 continue line = line.split() gene_id = line[0] condition = line[1] replicate = line[2] fpkm = line[6] status = line[8] sample_id = condition + "_" + replicate if sample_id not in samples: samples.append(sample_id) # IMS: The following block keeps getting its indenting messed # up. 
It is not part of the 'if sample_id not in samples' block # plesae make sure it does not get made part of it if gene_id not in genes: genes[gene_id] = {} genes[gene_id][sample_id] = fpkm else: if sample_id in genes[gene_id]: raise ValueError( 'sample_id %s appears twice in file for gene_id %s' % (sample_id, gene_id)) else: if status != "OK": genes[gene_id][sample_id] = status else: genes[gene_id][sample_id] = fpkm samples = sorted(samples) # IMS - CDS files might be empty if not cds has been # calculated for the genes in the long term need to add CDS # annotation to denovo predicted genesets in meantime just # skip if cds tracking file is empty if len(samples) == 0: continue headers = "gene_id\t" + "\t".join([sample_lookup[x] for x in samples]) outf.write(headers + "\n") for gene in genes.iterkeys(): outf.write(gene + "\t") x = 0 while x < len(samples) - 1: outf.write(genes[gene][samples[x]] + "\t") x += 1 # IMS: Please be careful with this line. It keeps getting moved # into the above while block where it does not belong outf.write(genes[gene][samples[len(samples) - 1]] + "\n") outf.close() statement = ("cat %(tmpf)s |" " python %(scriptsdir)s/csv2db.py " " %(csv2db_options)s" " --allow-empty-file" " --add-index=gene_id" " --table=%(tablename)s" " >> %(outfile)s.log") P.run() os.unlink(tmpf) # build convenience table with tracks tablename = prefix + "_isoform_levels" tracks = Database.getColumnNames(dbhandle, tablename) tracks = [x[:-len("_FPKM")] for x in tracks if x.endswith("_FPKM")] tmpfile = P.getTempFile(dir=".") tmpfile.write("track\n") tmpfile.write("\n".join(tracks) + "\n") tmpfile.close() P.load(tmpfile.name, outfile) os.unlink(tmpfile.name)
def buildDMRStats( tables, method, outfile ): '''build dmr summary statistics. Creates some diagnostic plots in <exportdir>/<method> directory. Tables should be labeled <tileset>_<design>_<method>. ''' dbhandle = sqlite3.connect( PARAMS["database"] ) def togeneset( tablename ): return re.match("([^_]+)_", tablename ).groups()[0] keys_status = "OK", "NOTEST", "FAIL", "NOCALL" outf = IOTools.openFile( outfile, "w" ) outf.write( "\t".join( ("tileset", "design", "track1", "track2", "tested", "\t".join( [ "status_%s" % x for x in keys_status ] ), "significant", "up", "down", "twofold", "twofold_up", "twofold_down", ) ) + "\n" ) all_tables = set(Database.getTables( dbhandle )) outdir = os.path.join( PARAMS["exportdir"], "diff_methylation" ) for tablename in tables: prefix = P.snip( tablename, "_%s" % method ) tileset, design = prefix.split("_") def toDict( vals, l = 2 ): return collections.defaultdict( int, [ (tuple( x[:l]), x[l]) for x in vals ] ) E.info( "collecting data from %s" % tablename ) tested = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s GROUP BY treatment_name,control_name""" % locals() ).fetchall() ) status = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, status, COUNT(*) FROM %(tablename)s GROUP BY treatment_name,control_name,status""" % locals() ).fetchall(), 3 ) signif = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE significant GROUP BY treatment_name,control_name""" % locals() ).fetchall() ) fold2 = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE (l2fold >= 1 or l2fold <= -1) AND significant GROUP BY treatment_name,control_name,significant""" % locals() ).fetchall() ) up = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE l2fold > 0 AND significant GROUP BY treatment_name,control_name,significant""" % locals() ).fetchall() ) down = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE l2fold < 0 AND significant GROUP BY treatment_name,control_name,significant""" % locals() ).fetchall() ) fold2up = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE l2fold > 1 AND significant GROUP BY treatment_name,control_name,significant""" % locals() ).fetchall() ) fold2down = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE l2fold < -1 AND significant GROUP BY treatment_name,control_name,significant""" % locals() ).fetchall() ) groups = tested.keys() for treatment_name, control_name in groups: k = (treatment_name,control_name) outf.write( "\t".join(map(str, ( tileset, design, treatment_name, control_name, tested[k], "\t".join( [ str(status[(treatment_name,control_name,x)]) for x in keys_status]), signif[(k)], up[k], down[k], fold2[k], fold2up[k], fold2down[k] ) ) ) + "\n" ) ########################################### ########################################### ########################################### # plot length versus P-Value data = Database.executewait( dbhandle, '''SELECT end - start, pvalue FROM %(tablename)s WHERE significant'''% locals() ).fetchall() # require at least 10 datapoints - otherwise smooth scatter fails if len(data) > 10: data = zip(*data) pngfile = 
"%(outdir)s/%(tileset)s_%(design)s_%(method)s_pvalue_vs_length.png" % locals() R.png( pngfile ) R.smoothScatter( R.log10( ro.FloatVector(data[0]) ), R.log10( ro.FloatVector(data[1]) ), xlab = 'log10( length )', ylab = 'log10( pvalue )', log="x", pch=20, cex=.1 ) R['dev.off']() outf.close()
def main(): parser = E.OptionParser( version="%prog version: $Id: GO.py 2883 2010-04-07 08:46:22Z andreas $", usage=globals()["__doc__"]) dbhandle = Database.Database() parser.add_option("-s", "--species", dest="species", type="string", help="species to use [default=%default].") parser.add_option( "-i", "--slims", dest="filename_slims", type="string", help="filename with GO SLIM categories [default=%default].") parser.add_option( "-g", "--genes", dest="filename_genes", type="string", help="filename with genes to analyse [default=%default].") parser.add_option( "-b", "--background", dest="filename_background", type="string", help="filename with background genes to analyse [default=%default].") parser.add_option( "-m", "--minimum-counts", dest="minimum_counts", type="int", help= "minimum count - ignore all categories that have fewer than # number of genes" " [default=%default].") parser.add_option("-o", "--sort-order", dest="sort_order", type="choice", choices=("fdr", "pvalue", "ratio"), help="output sort order [default=%default].") parser.add_option( "--ontology", dest="ontology", type="string", action="append", help="go ontologies to analyze. Ontologies are tested separately." " [default=%default].") parser.add_option( "-t", "--threshold", dest="threshold", type="float", help= "significance threshold [>1.0 = all ]. If --fdr is set, this refers to the fdr, otherwise it is a cutoff for p-values." ) parser.add_option( "--filename-dump", dest="filename_dump", type="string", help="dump GO category assignments into a flatfile [default=%default]." ) parser.add_option( "--filename-gene2name", dest="filename_gene2name", type="string", help= "optional filename mapping gene identifiers to gene names [default=%default]." ) parser.add_option( "--filename-ontology", dest="filename_ontology", type="string", help="filename with ontology in OBO format [default=%default].") parser.add_option( "--filename-input", dest="filename_input", type="string", help="read GO category assignments from a flatfile [default=%default]." ) parser.add_option("--sample-size", dest="sample", type="int", help="do sampling (with # samples) [default=%default].") parser.add_option( "--filename-output-pattern", "--output-filename-pattern", dest="output_filename_pattern", type="string", help= "pattern with output filename pattern (should contain: %(go)s and %(section)s ) [default=%default]" ) parser.add_option( "--fdr", dest="fdr", action="store_true", help= "calculate and filter by FDR [ReadGene2GOFromFiledefault=%default].") parser.add_option( "--go2goslim", dest="go2goslim", action="store_true", help= "convert go assignments in STDIN to goslim assignments and write to STDOUT [default=%default]." ) parser.add_option( "--gene-pattern", dest="gene_pattern", type="string", help= "pattern to transform identifiers to GO gene names [default=%default]." ) parser.add_option( "--filename-map-slims", dest="filename_map_slims", type="string", help= "write mapping between GO categories and GOSlims [default=%default].") parser.add_option( "--get-genes", dest="get_genes", type="string", help="list all genes in the with a certain GOID [default=%default].") parser.add_option( "--strict", dest="strict", action="store_true", help="require all genes in foreground to be part of background. " "If not set, genes in foreground will be added to the background [default=%default]." 
) parser.add_option( "-q", "--qvalue-method", dest="qvalue_method", type="choice", choices=("empirical", "storey", "BH"), help= "method to perform multiple testing correction by controlling the fdr [default=%default]." ) parser.add_option( "--pairwise", dest="compute_pairwise", action="store_true", help="compute pairwise enrichment for multiple gene lists. " "[default=%default].") # parser.add_option( "--qvalue-lambda", dest="qvalue_lambda", type="float", # help="fdr computation: lambda [default=%default]." ) # parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice", # choices = ("smoother", "bootstrap" ), # help="fdr computation: method for estimating pi0 [default=%default]." ) parser.set_defaults(species=None, filename_genes="-", filename_background=None, filename_slims=None, minimum_counts=0, ontology=[], filename_dump=None, sample=0, fdr=False, output_filename_pattern=None, threshold=0.05, filename_map_slims=None, gene_pattern=None, sort_order="ratio", get_genes=None, strict=False, qvalue_method="empirical", pairs_min_observed_counts=3, compute_pairwise=False, filename_gene2name=None) (options, args) = E.Start(parser, add_mysql_options=True) if options.go2goslim: convertGo2Goslim(options) E.Stop() sys.exit(0) if options.fdr and options.sample == 0: E.warn("fdr will be computed without sampling") ############################################################# ## dump GO if options.filename_dump: # set default orthologies to GO if not options.ontology: options.ontology = [ "biol_process", "mol_function", "cell_location" ] E.info("dumping GO categories to %s" % (options.filename_dump)) dbhandle.Connect(options) outfile = IOTools.openFile(options.filename_dump, "w", create_dir=True) DumpGOFromDatabase(outfile, dbhandle, options) outfile.close() E.Stop() sys.exit(0) ############################################################# ## read GO categories from file if options.filename_input: E.info("reading association of categories and genes from %s" % (options.filename_input)) infile = IOTools.openFile(options.filename_input) gene2gos, go2infos = ReadGene2GOFromFile(infile) infile.close() if options.filename_gene2name: E.info("reading gene identifier to gene name mapping from %s" % options.filename_gene2name) infile = IOTools.openFile(options.filename_gene2name) gene2name = IOTools.readMap(infile, has_header=True) infile.close() E.info("read %i gene names for %i gene identifiers" % (len(set(gene2name.values())), len(gene2name))) else: gene2name = None ############################################################# ## read GO ontology from file if options.filename_ontology: E.info("reading ontology from %s" % (options.filename_ontology)) infile = IOTools.openFile(options.filename_ontology) ontology = readOntology(infile) infile.close() def _g(): return collections.defaultdict(GOInfo) go2infos = collections.defaultdict(_g) ## substitute go2infos for go in ontology.values(): go2infos[go.mNameSpace][go.mId] = GOInfo(go.mId, go_type=go.mNameSpace, description=go.mName) ############################################################# ## get foreground gene list input_foreground, genelists = ReadGeneLists( options.filename_genes, gene_pattern=options.gene_pattern) E.info("read %i genes for forground in %i gene lists" % (len(input_foreground), len(genelists))) ############################################################# ## get background if options.filename_background: # nick - bug fix: background is the first tuple element from ReadGeneLists input_background = 
ReadGeneLists(options.filename_background, gene_pattern=options.gene_pattern)[0] E.info("read %i genes for background" % len(input_background)) else: input_background = None ############################################################# ## sort out which ontologies to test if not options.ontology: if options.filename_input: options.ontology = gene2gos.keys() E.info("found %i ontologies: %s" % (len(options.ontology), options.ontology)) summary = [] summary.append("\t".join( ("genelist", "ontology", "significant", "threshold", "ngenes", "ncategories", "nmaps", "nforegound", "nforeground_mapped", "nbackground", "nbackground_mapped", "nsample_counts", "nbackground_counts", "psample_assignments", "pbackground_assignments")) + "\n") ############################################################# ## get go categories for genes for test_ontology in options.ontology: # store results for aggregate output of multiple gene lists all_results = [] all_significant_results = [] all_genelists_with_results = [] E.info("working on ontology %s" % test_ontology) ############################################################# ## get/read association of GO categories to genes if options.filename_input: gene2go, go2info = gene2gos[test_ontology], go2infos[test_ontology] else: E.info("reading data from database ...") dbhandle.Connect(options) gene2go, go2info = ReadGene2GOFromDatabase(dbhandle, test_ontology, options.database, options.species) E.info("finished") if len(go2info) == 0: E.warn( "could not find information for terms - could be mismatch between ontologies" ) ngenes, ncategories, nmaps, counts_per_category = CountGO(gene2go) E.info( "assignments found: %i genes mapped to %i categories (%i maps)" % (ngenes, ncategories, nmaps)) if options.minimum_counts > 0: to_remove = set([ x for x, y in counts_per_category.iteritems() if y < options.minimum_counts ]) E.info("removing %i categories with less than %i genes" % (len(to_remove), options.minimum_counts)) removeCategories(gene2go, to_remove) ngenes, ncategories, nmaps, counts_per_category = CountGO(gene2go) E.info( "assignments after filtering: %i genes mapped to %i categories (%i maps)" % (ngenes, ncategories, nmaps)) for genelist_name, foreground in genelists.iteritems(): msgs = [] E.info("processing %s with %i genes" % (genelist_name, len(foreground))) ################################################################## ################################################################## ################################################################## ## build background - reconcile with foreground ################################################################## if input_background == None: background = list(gene2go.keys()) else: background = list(input_background) # nick - bug-fix backgorund included the foreground in a tuple. 
# background is the first tuple element missing = foreground.difference(set(background)) if options.strict: assert len(missing) == 0, \ "%i genes in foreground but not in background: %s" % (len(missing), str(missing)) else: if len(missing) != 0: E.warn( "%i genes in foreground that are not in background - added to background of %i" %\ (len(missing), len(background)) ) background.extend(missing) E.info("(unfiltered) foreground=%i, background=%i" % (len(foreground), len(background))) ############################################################# ## sanity checks: ## are all of the foreground genes in the dataset ## missing = set(genes).difference( set(gene2go.keys()) ) ## assert len(missing) == 0, "%i genes in foreground set without GO annotation: %s" % (len(missing), str(missing)) ############################################################# ## read GO slims and map GO categories to GO slim categories if options.filename_slims: go_slims = GetGOSlims( IOTools.openFile(options.filename_slims, "r")) if options.loglevel >= 1: v = set() for x in go_slims.values(): for xx in x: v.add(xx) options.stdlog.write( "# read go slims from %s: go=%i, slim=%i\n" %\ ( options.filename_slims, len(go_slims), len( v ) )) if options.filename_map_slims: if options.filename_map_slims == "-": outfile = options.stdout else: outfile = IOTools.openFile(options.filename_map_slims, "w") outfile.write("GO\tGOSlim\n") for go, go_slim in go_slims.items(): outfile.write("%s\t%s\n" % (go, go_slim)) if outfile != options.stdout: outfile.close() gene2go = MapGO2Slims(gene2go, go_slims, ontology=ontology) if options.loglevel >= 1: ngenes, ncategories, nmaps, counts_per_category = CountGO( gene2go) options.stdlog.write( "# after go slim filtering: %i genes mapped to %i categories (%i maps)\n" % (ngenes, ncategories, nmaps)) ############################################################# ## Just dump out the gene list if options.get_genes: fg, bg, ng = [], [], [] for gene, vv in gene2go.items(): for v in vv: if v.mGOId == options.get_genes: if gene in foreground: fg.append(gene) elif gene in background: bg.append(gene) else: ng.append(gene) ## skip to next GO class if not (bg or ng): continue options.stdout.write("# genes in GO category %s\n" % options.get_genes) options.stdout.write("gene\tset\n") for x in fg: options.stdout.write("%s\t%s\n" % (x, "fg")) for x in bg: options.stdout.write("%s\t%s\n" % (x, "bg")) for x in ng: options.stdout.write("%s\t%s\n" % (x, "ng")) E.info("nfg=%i, nbg=%i, nng=%i" % (len(fg), len(bg), len(ng))) E.Stop() sys.exit(0) ############################################################# outfile = getFileName(options, go=test_ontology, section='foreground', set=genelist_name) outfile.write("gene_id\n%s\n" % ("\n".join(sorted(foreground)))) if options.output_filename_pattern: outfile.close() outfile = getFileName(options, go=test_ontology, section='background', set=genelist_name) outfile.write("gene_id\n%s\n" % ("\n".join(sorted(background)))) if options.output_filename_pattern: outfile.close() ############################################################# ## do the analysis go_results = AnalyseGO(gene2go, foreground, background) if len(go_results.mSampleGenes) == 0: E.warn("%s: no genes with GO categories - analysis aborted" % genelist_name) continue pairs = go_results.mResults.items() ############################################################# ## calculate fdr for each hypothesis if options.fdr: fdrs, samples, method = computeFDRs(go_results, foreground, background, options, test_ontology, gene2go, go2info)
for x, v in enumerate(pairs): v[1].mQValue = fdrs[v[0]][0] else: fdrs, samples, method = {}, {}, None msgs.append("fdr=%s" % method) if options.sort_order == "fdr": pairs.sort(lambda x, y: cmp(x[1].mQValue, y[1].mQValue)) elif options.sort_order == "ratio": pairs.sort(lambda x, y: cmp(x[1].mRatio, y[1].mRatio)) elif options.sort_order == "pvalue": pairs.sort(lambda x, y: cmp(x[1].mPValue, y[1].mPValue)) ############################################################# ############################################################# ############################################################# ## output the full result outfile = getFileName(options, go=test_ontology, section='overall', set=genelist_name) outputResults(outfile, pairs, go2info, options, fdrs=fdrs, samples=samples) if options.output_filename_pattern: outfile.close() ############################################################# ############################################################# ############################################################# # filter significant results and output filtered_pairs = selectSignificantResults(pairs, fdrs, options) nselected = len(filtered_pairs) nselected_up = len([x for x in filtered_pairs if x[1].mRatio > 1]) nselected_down = len( [x for x in filtered_pairs if x[1].mRatio < 1]) assert nselected_up + nselected_down == nselected outfile = getFileName(options, go=test_ontology, section='results', set=genelist_name) outputResults(outfile, filtered_pairs, go2info, options, fdrs=fdrs, samples=samples) if options.output_filename_pattern: outfile.close() ############################################################# ############################################################# ############################################################# # save results for multi-gene-list analysis all_results.append(pairs) all_significant_results.append(filtered_pairs) all_genelists_with_results.append(genelist_name) ############################################################# ############################################################# ############################################################# ## output parameters ngenes, ncategories, nmaps, counts_per_category = CountGO(gene2go) outfile = getFileName(options, go=test_ontology, section='parameters', set=genelist_name) nbackground = len(background) if nbackground == 0: nbackground = len(go_results.mBackgroundGenes) outfile.write( "# input go mappings for gene list '%s' and category '%s'\n" % (genelist_name, test_ontology)) outfile.write("parameter\tvalue\tdescription\n") outfile.write("mapped_genes\t%i\tmapped genes\n" % ngenes) outfile.write("mapped_categories\t%i\tmapped categories\n" % ncategories) outfile.write("mappings\t%i\tmappings\n" % nmaps) outfile.write("genes_in_fg\t%i\tgenes in foreground\n" % len(foreground)) outfile.write( "genes_in_fg_with_assignment\t%i\tgenes in foreground with GO assignments\n" % (len(go_results.mSampleGenes))) outfile.write("genes_in_bg\t%i\tinput background\n" % nbackground) outfile.write( "genes_in_bg_with_assignment\t%i\tgenes in background with GO assignments\n" % (len(go_results.mBackgroundGenes))) outfile.write("associations_in_fg\t%i\tassociations in sample\n" % go_results.mSampleCountsTotal) outfile.write( "associations_in_bg\t%i\tassociations in background\n" % go_results.mBackgroundCountsTotal) outfile.write( "percent_genes_in_fg_with_association\t%s\tpercent genes in sample with GO assignments\n" % (IOTools.prettyPercent(len(go_results.mSampleGenes), len(foreground), "%5.2f"))) outfile.write(
"percent_genes_in_bg_with_associations\t%s\tpercent genes in background with GO assignments\n" % (IOTools.prettyPercent(len(go_results.mBackgroundGenes), nbackground, "%5.2f"))) outfile.write("significant\t%i\tsignificant results reported\n" % nselected) outfile.write( "significant_up\t%i\tsignificant up-regulated results reported\n" % nselected_up) outfile.write( "significant_down\t%i\tsignificant down-regulated results reported\n" % nselected_down) outfile.write("threshold\t%6.4f\tsignificance threshold\n" % options.threshold) if options.output_filename_pattern: outfile.close() summary.append( "\t".join( map(str, ( genelist_name, test_ontology, nselected, options.threshold, ngenes, ncategories, nmaps, len(foreground), len(go_results.mSampleGenes), nbackground, len(go_results.mBackgroundGenes), go_results.mSampleCountsTotal, go_results.mBackgroundCountsTotal, IOTools.prettyPercent( len(go_results.mSampleGenes) , len(foreground), "%5.2f" ), IOTools.prettyPercent( len(go_results.mBackgroundGenes), nbackground, "%5.2f" ), ",".join( msgs) ) ) ) + "\n" ) ############################################################# ############################################################# ############################################################# ## output the fg patterns outfile = getFileName(options, go=test_ontology, section='withgenes', set=genelist_name) outputResults(outfile, pairs, go2info, options, fdrs=fdrs, samples=samples, gene2go=gene2go, foreground=foreground, gene2name=gene2name) if options.output_filename_pattern: outfile.close() if len(genelists) > 1: ###################################################################### ###################################################################### ###################################################################### ## output various summary files ## significant results outputMultipleGeneListResults(all_significant_results, all_genelists_with_results, test_ontology, go2info, options, section='significant') ## all results outputMultipleGeneListResults(all_results, all_genelists_with_results, test_ontology, go2info, options, section='all') if options.compute_pairwise: pairwiseGOEnrichment(all_results, all_genelists_with_results, test_ontology, go2info, options) outfile_summary = options.stdout outfile_summary.write("".join(summary)) E.Stop()
def generatePeakSets(infile, outfiles): outf_con, outf_opt = outfiles # retrieve maximum number of peaks obtained from inter-replicate IDR # (table created by loadNPeaksForIndividualReplicates) statement = ("SELECT" " Experiment," " max(n_peaks) AS nPeaks" " FROM individual_replicates_nPeaks" " GROUP BY experiment") df = Database.fetch_DataFrame(statement, dbhandle=PARAMS['database_name']) # reassign experiment as index df = df.set_index("Experiment") # retrieve number of peaks obtained from pooled_pseudoreplicate IDR # (table created by loadNPeaksForPooledPseudoreplicates) statement = ("SELECT" " Experiment," " n_peaks AS nPeaks" " FROM pooled_pseudoreplicates_nPeaks") df2 = Database.fetch_DataFrame(statement, dbhandle=PARAMS['database_name']) # reassign experiment as index df2 = df2.set_index("Experiment") # split the infile name to obtain experiment sample_id = os.path.basename(infile).split("_VS_")[0] sample = sample_id.split("-") experiment = "_".join([sample[0], sample[1]]) # retrieve max_numPeaks for experiment nPeaks = int(df.loc[experiment]) # retrieve numPeaks_Rep0 for experiment nPeaks_rep0 = int(df2.loc[experiment]) # retrieve maximumn of the two nPeaks_max = max(nPeaks, nPeaks_rep0) # establish which column to sort by if PARAMS["idr_options_ranking_measure"] == "signal.value": sort_statement = "sort -k7nr,7nr" elif PARAMS["idr_options_ranking_measure"] == "p.value": sort_statement = "sort -k8nr,8nr" elif PARAMS["idr_options_ranking_measure"] == "q.value": sort_statement = "sort -k9nr,9nr" else: raise ValueError("Unrecognised ranking_measure" " %s don't know which column" " to sort on" % PARAMS["idr_options_ranking_measure"]) # sort infile by column and write top nPeaks to outfile (conservative) ignore_pipe_errors = True statement = ("zcat %(infile)s |" " %(sort_statement)s |" " head -%(nPeaks)s |" " gzip > %(outf_con)s") P.run() # sort infile by column and write top nPeaks_max to outfile (optimum) ignore_pipe_errors = True statement = ("zcat %(infile)s |" " %(sort_statement)s |" " head -%(nPeaks_max)s |" " gzip > %(outf_opt)s") P.run()
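# Illustrative note (not part of the pipeline): the sort columns chosen in
# generatePeakSets above correspond to the ENCODE narrowPeak convention, where
# column 7 is signalValue, column 8 is -log10(pValue) and column 9 is
# -log10(qValue). A minimal sketch of the ranking-measure to sort-key mapping
# used when truncating the peak list; the helper name and dictionary are for
# illustration only.
def _sort_statement_sketch(ranking_measure):
    sort_keys = {
        "signal.value": "-k7nr,7nr",  # narrowPeak signalValue
        "p.value": "-k8nr,8nr",       # narrowPeak -log10(pValue)
        "q.value": "-k9nr,9nr",       # narrowPeak -log10(qValue)
    }
    try:
        return "sort %s" % sort_keys[ranking_measure]
    except KeyError:
        raise ValueError(
            "unrecognised ranking_measure %s" % ranking_measure)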
def importLincRNA( infile, outfile ): '''build a linc RNA set. * no coding potential * unknown and intergenic transcripts * no overlap with ``linc_exclude`` (usually: human refseq) * at least ``linc_min_length`` bp in length * at least ``linc_min_reads`` reads in transcript ''' table = outfile[:-len(".import")] track = table[:-len("Linc")] dbhandle = sqlite3.connect( PARAMS["database"] ) Database.executewait( dbhandle, '''DROP TABLE IF EXISTS %(table)s''' % locals()) Database.executewait( dbhandle, '''CREATE TABLE %(table)s (gene_id TEXT)''' % locals()) Database.executewait( dbhandle, '''CREATE INDEX %(table)s_index1 ON %(table)s (gene_id)''' % locals()) joins, wheres = [], ["1"] if PARAMS["linc_min_reads"] > 0: joins.append( ", %(track)s_coverage as cov" % locals() ) wheres.append( "cov.gene_id = m.gene_id2 AND cov.nmatches >= %i" % PARAMS["linc_min_reads"] ) if PARAMS["linc_exclude"] > 0: joins.append( "LEFT JOIN %s_vs_%s_ovl as ovl on ovl.gene_id2 = a.gene_id" %\ (PARAMS["linc_exclude"], track ) ) wheres.append( "ovl.gene_id1 IS NULL" ) wheres = " AND ".join( wheres ) joins = " ".join( joins ) statement = '''INSERT INTO %(table)s SELECT DISTINCT(a.gene_id) FROM %(track)s_annotation as a %(joins)s LEFT JOIN %(track)s_coding AS c on c.gene_id = a.gene_id WHERE is_unknown AND is_intergenic AND exons_sum >= %(linc_min_length)i AND (c.is_coding IS NULL or not c.is_coding) AND %(wheres)s ''' % dict( PARAMS.items() + locals().items() ) E.debug( "statement to build lincRNA: %s" % statement) Database.executewait( dbhandle, statement % locals()) dbhandle.commit() cc = dbhandle.cursor() result = cc.execute("SELECT COUNT(*) FROM %(table)s" % locals() ).fetchall()[0][0] E.info( "build lincRNA set for %s: %i entries" % ( track, result )) outgtf = "%s.gtf.gz" % table E.info( "creating gtf file `%s`" % outgtf ) # output gtf file statement = '''%(cmd-sql)s %(database)s "SELECT g.* FROM %(track)s_gtf as g, %(table)s AS t WHERE t.gene_id = g.gene_id" | python %(scriptsdir)s/gtf2tsv.py --invert --log=%(outfile)s | gzip > %(outgtf)s''' P.run()
def buildCuffdiffPlots(infile, outfile): '''create summaries of cufflinks results (including some diagnostic plots) Plots are created in the <exportdir>/cuffdiff directory. Plots are: <geneset>_<method>_<level>_<track1>_vs_<track2>_significance.png fold change against expression level ''' ########################################### ########################################### # create diagnostic plots ########################################### outdir = os.path.join(PARAMS["exportdir"], "cuffdiff") dbhandle = sqlite3.connect(PARAMS["database"]) prefix = P.snip(infile, ".load") geneset, method = prefix.split("_") for level in CUFFDIFF_LEVELS: tablename_diff = prefix + "_%s_diff" % level tablename_levels = prefix + "_%s_levels" % level # note that the ordering of EXPERIMENTS and the _diff table # needs to be the same as only one triangle is stored of the # pairwise results. do not plot "undefined" lfold values # (where treatment_mean or control_mean = 0) do not plot lfold # values where the confidence bounds contain 0. for track1, track2 in itertools.combinations(EXPERIMENTS, 2): statement = """ SELECT CASE WHEN d.treatment_mean < d.control_mean THEN d.treatment_mean ELSE d.control_mean END, d.l2fold, d.significant FROM %(tablename_diff)s AS d WHERE treatment_name = '%(track1)s' AND control_name = '%(track2)s' AND status = 'OK' AND treatment_mean > 0 AND control_mean > 0 """ % locals() data = zip(*Database.executewait(dbhandle, statement)) pngfile = "%(outdir)s/%(geneset)s_%(method)s_%(level)s_%(track1)s_vs_%(track2)s_significance.png" % locals( ) # ian: Bug fix: moved R.png to after data check so that no # plot is started if there is no data this was leading # to R falling over from too many open devices if len(data) == 0: E.warn("no plot for %s - %s -%s vs %s" % (pngfile, level, track1, track2)) continue R.png(pngfile) R.plot(ro.FloatVector(data[0]), ro.FloatVector(data[1]), xlab='min(FPKM)', ylab='log2fold', log="x", pch=20, cex=.1, col=R.ifelse(ro.IntVector(data[2]), "red", "black")) R['dev.off']() P.touch(outfile)
def buildDMRStats(tables, method, outfile, dbhandle): """build dmr summary statistics. This method counts the number of up/down, 2fold up/down, etc. genes in output from (:mod:`scripts/runExpression`). This method also creates diagnostic plots in the <exportdir>/<method> directory. Tables should be labeled <tileset>_<design>_<method>. Arguments --------- tables ; list List of tables with DMR output method : string Method name outfile : string Output filename. Tab separated file summarizing """ def togeneset(tablename): return re.match("([^_]+)_", tablename).groups()[0] keys_status = "OK", "NOTEST", "FAIL", "NOCALL" outf = IOTools.openFile(outfile, "w") outf.write( "\t".join( ( "tileset", "design", "track1", "track2", "tested", "\t".join(["status_%s" % x for x in keys_status]), "significant", "up", "down", "twofold", "twofold_up", "twofold_down", ) ) + "\n" ) all_tables = set(Database.getTables(dbhandle)) outdir = os.path.join(PARAMS["exportdir"], "diff_methylation") for tablename in tables: prefix = P.snip(tablename, "_%s" % method) tileset, design = prefix.split("_") def toDict(vals, l=2): return collections.defaultdict(int, [(tuple(x[:l]), x[l]) for x in vals]) E.info("collecting data from %s" % tablename) tested = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s GROUP BY treatment_name,control_name""" % locals(), ).fetchall() ) status = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, status, COUNT(*) FROM %(tablename)s GROUP BY treatment_name,control_name,status""" % locals(), ).fetchall(), 3, ) signif = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE significant GROUP BY treatment_name,control_name""" % locals(), ).fetchall() ) fold2 = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE (l2fold >= 1 or l2fold <= -1) AND significant GROUP BY treatment_name,control_name,significant""" % locals(), ).fetchall() ) up = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE l2fold > 0 AND significant GROUP BY treatment_name,control_name,significant""" % locals(), ).fetchall() ) down = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE l2fold < 0 AND significant GROUP BY treatment_name,control_name,significant""" % locals(), ).fetchall() ) fold2up = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE l2fold > 1 AND significant GROUP BY treatment_name,control_name,significant""" % locals(), ).fetchall() ) fold2down = toDict( Database.executewait( dbhandle, """SELECT treatment_name, control_name, COUNT(*) FROM %(tablename)s WHERE l2fold < -1 AND significant GROUP BY treatment_name,control_name,significant""" % locals(), ).fetchall() ) groups = tested.keys() for treatment_name, control_name in groups: k = (treatment_name, control_name) outf.write( "\t".join( map( str, ( tileset, design, treatment_name, control_name, tested[k], "\t".join([str(status[(treatment_name, control_name, x)]) for x in keys_status]), signif[(k)], up[k], down[k], fold2[k], fold2up[k], fold2down[k], ), ) ) + "\n" ) ########################################### ########################################### ########################################### # plot length versus P-Value data = Database.executewait( dbhandle, """SELECT end - 
start, pvalue FROM %(tablename)s WHERE significant""" % locals(), ).fetchall() # require at least 10 datapoints - otherwise smooth scatter fails if len(data) > 10: data = zip(*data) pngfile = "%(outdir)s/%(tileset)s_%(design)s_%(method)s_pvalue_vs_length.png" % locals() R.png(pngfile) R.smoothScatter( R.log10(ro.FloatVector(data[0])), R.log10(ro.FloatVector(data[1])), xlab="log10(length)", ylab="log10(pvalue)", log="x", pch=20, cex=0.1, ) R["dev.off"]() outf.close()
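# Illustrative example (not part of the pipeline): the toDict helper defined in
# buildDMRStats above converts rows of (treatment_name, control_name[, status],
# count) returned by the SQL queries into a defaultdict(int) keyed by the
# leading tuple, so that combinations absent from the query default to zero.
# The rows below are made up for demonstration only.
import collections

def _to_dict_example():
    rows = [("treated", "control", 120), ("treated2", "control", 45)]
    counts = collections.defaultdict(
        int, [(tuple(x[:2]), x[2]) for x in rows])
    assert counts[("treated", "control")] == 120
    assert counts[("treated", "untreated")] == 0  # missing pair counts as zero
    return counts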
def buildExpressionStats(tables, method, outfile, outdir): '''build expression summary statistics. Also creates diagnostic plots in the <exportdir>/<method> directory. ''' dbhandle = sqlite3.connect(PARAMS["database"]) def _split(tablename): # this would be much easier, if feature_counts/gene_counts/etc. # would not contain an underscore. try: design, geneset, counting_method = re.match( "([^_]+)_vs_([^_]+)_(.*)_%s" % method, tablename).groups() except AttributeError: try: design, geneset = re.match("([^_]+)_([^_]+)_%s" % method, tablename).groups() counting_method = "na" except AttributeError: raise ValueError("can't parse tablename %s" % tablename) return design, geneset, counting_method # return re.match("([^_]+)_", tablename ).groups()[0] keys_status = "OK", "NOTEST", "FAIL", "NOCALL" outf = IOTools.openFile(outfile, "w") outf.write("\t".join(("design", "geneset", "level", "counting_method", "treatment_name", "control_name", "tested", "\t".join(["status_%s" % x for x in keys_status]), "significant", "twofold")) + "\n") all_tables = set(Database.getTables(dbhandle)) for level in CUFFDIFF_LEVELS: for tablename in tables: tablename_diff = "%s_%s_diff" % (tablename, level) tablename_levels = "%s_%s_levels" % (tablename, level) design, geneset, counting_method = _split(tablename_diff) if tablename_diff not in all_tables: continue def toDict(vals, l=2): return collections.defaultdict(int, [(tuple(x[:l]), x[l]) for x in vals]) tested = toDict( Database.executewait( dbhandle, "SELECT treatment_name, control_name, " "COUNT(*) FROM %(tablename_diff)s " "GROUP BY treatment_name,control_name" % locals()).fetchall()) status = toDict( Database.executewait( dbhandle, "SELECT treatment_name, control_name, status, " "COUNT(*) FROM %(tablename_diff)s " "GROUP BY treatment_name,control_name,status" % locals()).fetchall(), 3) signif = toDict( Database.executewait( dbhandle, "SELECT treatment_name, control_name, " "COUNT(*) FROM %(tablename_diff)s " "WHERE significant " "GROUP BY treatment_name,control_name" % locals()).fetchall()) fold2 = toDict( Database.executewait( dbhandle, "SELECT treatment_name, control_name, " "COUNT(*) FROM %(tablename_diff)s " "WHERE (l2fold >= 1 or l2fold <= -1) AND significant " "GROUP BY treatment_name,control_name,significant" % locals()).fetchall()) for treatment_name, control_name in tested.keys(): outf.write("\t".join( map(str, (design, geneset, level, counting_method, treatment_name, control_name, tested[ (treatment_name, control_name)], "\t".join([ str(status[(treatment_name, control_name, x)]) for x in keys_status ]), signif[(treatment_name, control_name)], fold2[(treatment_name, control_name)]))) + "\n") ########################################### ########################################### ########################################### # plot length versus P-Value data = Database.executewait( dbhandle, "SELECT i.sum, pvalue " "FROM %(tablename_diff)s, " "%(geneset)s_geneinfo as i " "WHERE i.gene_id = test_id AND " "significant" % locals()).fetchall() # require at least 10 datapoints - otherwise smooth scatter fails if len(data) > 10: data = zip(*data) pngfile = "%(outdir)s/%(design)s_%(geneset)s_%(level)s_pvalue_vs_length.png" % locals() R.png(pngfile) R.smoothScatter(R.log10(ro.FloatVector(data[0])), R.log10(ro.FloatVector(data[1])), xlab='log10( length )', ylab='log10( pvalue )', log="x", pch=20, cex=.1) R['dev.off']() outf.close()
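# Illustrative example (not part of the pipeline): how the _split helper in
# buildExpressionStats above parses table names. The table name and method
# below are hypothetical; the expected layout is
# <design>_vs_<geneset>_<counting_method>_<method>[_<level>_diff], with a
# fallback to <design>_<geneset>_<method> when no counting method is encoded.
import re

def _split_example():
    method = "cuffdiff"  # assumed method suffix
    tablename_diff = "design1_vs_refcoding_featurecounts_cuffdiff_gene_diff"
    design, geneset, counting_method = re.match(
        "([^_]+)_vs_([^_]+)_(.*)_%s" % method, tablename_diff).groups()
    # returns ('design1', 'refcoding', 'featurecounts')
    return design, geneset, counting_method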
def defineTads(infile, outfile): ''' Motif is "forward" if present on + strand, and "reverse" if present on - strand insulator pairs (from intersection) can therefore have motifs in the following orientations: 1) convergent (F, R) 2) divergent (R, F) 3) same direction + strand (F, F) 4) same direction - strand (R, R) Intervals generated from peak intersections with motifs in convergent orientation will represent TADs (or subTADS...) ''' db = PARAMS["database"] npeaks = PARAMS["tads_npeaks"] pwidth = PARAMS["tads_pwidth"] tmp_dir = "$SCRATCH_DIR" # fetch insulator peaks with fimo motifs table = "insulators_" + '_'.join([str(npeaks), str(pwidth) ]) + "_fimo_table" statement = '''select * from %(table)s''' % locals() motifs = DB.fetch_DataFrame(statement, db) # get most significant motif for each peak motifs = motifs.sort_values(["sequence_name", "q_value"], 0).drop_duplicates(subset="sequence_name", keep="first") motifs.to_csv("insulators_fimoMotifs.txt", sep="\t", header=True) # save peaks w/ annotated motifs as df upload2csvdb(motifs, "insulators_fimoMotifs", db) # upload to csvdb # get peaks (bed format) corresponding to fimo motifs statement = '''select b.contig, b.start, b.end, a.sequence_name, b.peak_score, a.strand, a.q_value from insulators_fimoMotifs a inner join insulators b on a.sequence_name = b.peak_id''' motif_bed = DB.fetch_DataFrame(statement, db) motif_bed = motif_bed.sort_values(["sequence_name", "q_value"], 0).drop_duplicates( subset="sequence_name", keep="first") motif_bed.to_csv("motif_bed.txt", sep="\t", header=True, index=False) # merge peaks # iterate over a range of distances (1mb - 1kb) within which insulators peaks are merged ### descending order of distances -> favours bigger TADs, and joins up remaining intervals up to min. size of 1kb # merged insulators selected for with awk "," in $6 (strand col) ***Limited to merges of two peaks with F,R orientation with ($6=="\+,-") # merged insulators written to tmp file (tmp + str(counter)) # for each successive merge after n=1 peaks from previous merge are subtracted from results with bedtools (w/ -A flag to remove entire intervals) ### a few of the later TADs are ver large and are merged over previous, how to correct for this? merge final file (only overlapping tads?) 
# n = 0 # distances = range(0, 10100000, 10000) # 10mb to 1kb, 10kb decreases # distances = distances[::-1] # invert list -> descending order # for dist in distances: # n = n +1 # tmp = "tmp" + str(n) # if n == 1: # statement = '''tmp=`mktemp -p %(tmp_dir)s`; checkpoint; # tail -n+2 motif_bed.txt | # sort -k1,1 -k2,2n - > $tmp; checkpoint; # mergeBed -c 4,5,6,7 -o collapse,mean,collapse,mean -d %(dist)s -i $tmp | # awk 'BEGIN {OFS="\\t"} {if ($6 == "\+,-") print $0}' - > %(tmp)s''' % locals() # elif n > 1 and n < len(distances): # merge = tmp.replace(str(n), str(n-1)) # statement = '''tmp=`mktemp -p %(tmp_dir)s`; checkpoint; # tail -n+2 motif_bed.txt | # sort -k1,1 -k2,2n - | # mergeBed -c 4,5,6,7 -o collapse,mean,collapse,mean -d %(dist)s -i - | # awk 'BEGIN {OFS="\\t"} {if ($6 == "\+,-") print $0}' - > $tmp; checkpoint; # subtractBed -A -a $tmp -b %(merge)s > %(tmp)s''' % locals() # elif n == len(distances): # merge = tmp.replace(str(n), str(n-1)) # statement = '''tmp=`mktemp -p %(tmp_dir)s`; checkpoint; # tail -n+2 motif_bed.txt | # sort -k1,1 -k2,2n - | # mergeBed -c 4,5,6,7 -o collapse,mean,collapse,mean -d %(dist)s -i - | # awk 'BEGIN {OFS="\\t"} {if ($6 == "\+,-") print $0}' - > $tmp; checkpoint; # subtractBed -A -a $tmp -b %(merge)s > %(tmp)s; checkpoint; # awk 'BEGIN {OFS="\\t"} {if ($3-$2 > 1000) print $0}' <(cat tmp*) | # sort -k1,1 -k2,2n - | # mergeBed -c 4,5,6,7 -o collapse,mean,collapse,mean -i - > %(outfile)s; checkpoint; # rm tmp*''' % locals() ### Instead of merging peaks with F/R motif orientation I could seperate insulator peaks into F & R files, ### then use bedtools closest to intersect peaks up to a max distance of n & remove peaks with divergent motifs. # Ensure "closest" matches are on the same chromosome with awk, also remove features > 1mb wide statement = '''tmp=`mktemp -p %(tmp_dir)s`; checkpoint; Fstrand=`mktemp -p %(tmp_dir)s`; checkpoint; Rstrand=`mktemp -p %(tmp_dir)s`; checkpoint; awk 'BEGIN {OFS="\\t"} {if ($6 == "+") print $0}' <(tail -n+2 motif_bed.txt | sort -k1,1 -k2,2n ) > $Fstrand; checkpoint; awk 'BEGIN {OFS="\\t"} {if ($6 == "-") print $0}' <(tail -n+2 motif_bed.txt | sort -k1,1 -k2,2n ) > $Rstrand; checkpoint; ~/devel/GIT/bedtools2/bin/closestBed -iu -D ref -a $Fstrand -b $Rstrand > $tmp; checkpoint; awk 'BEGIN {OFS="\\t"} {if ($1 == $8 && $9-$2 < 1000000) print $1,$2,$9,$4"/"$11,($5+$12)/2,$6","$13,($7+$14)/2}' $tmp > %(outfile)s ''' ### This works better! # Need to incoporate CTCF & cohesin coverage over candidate insulators. Then filter out insulator pairs (candidate TADS) with large discrepancies in ChIP signal # Czimmerer et al use a cut off of > 2fold difference betweeen start & end peaks of TADS in ChIP signal # Add ChIP coverage code for insulator peaks & save to db, then incoporate CTCF & cohesin signal into awk filter at the end of this statement print statement P.run()
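# Illustrative sketch (not part of the pipeline): classifying a pair of
# neighbouring insulator peaks by the strand of their best CTCF motif,
# following the four orientations listed in the defineTads docstring above.
# Strand values are assumed to be "+" (forward motif) or "-" (reverse motif),
# as in the fimo-derived motif_bed table used above; the function name is
# hypothetical.
def _classify_motif_orientation(strand_left, strand_right):
    orientations = {
        ("+", "-"): "convergent",      # F, R - candidate TAD-forming pair
        ("-", "+"): "divergent",       # R, F
        ("+", "+"): "tandem_forward",  # F, F
        ("-", "-"): "tandem_reverse",  # R, R
    }
    return orientations[(strand_left, strand_right)]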