def estimateEnrichmentOfTFBS(infiles, outfile):
    '''
    Test for enrichment of TFBS within a gene set by permutation.

    TFBS matches are read from the ``match_result`` table of the SQLite
    database named by ``PARAMS['database']``.  The foreground and
    background GC content tables are picked out of `infiles` by looking
    for "foreground" and "background" anywhere in each path.

    Arguments
    ---------
    infiles : sequence of str
        Paths to tab-separated tables with a header row whose first
        column is used as the row label.  One path must contain
        "background" and one must contain "foreground"; when several
        match, the first is used.
    outfile : str
        Path of the tab-separated results table, written with the
        index column labelled ``matrix_id``.

    Raises
    ------
    IndexError
        If no path in `infiles` contains "background" or "foreground".
    '''

    E.info("Running permutation testing for TFBS enrichment between %s"
           % " & ".join([os.path.basename(x) for x in infiles]))

    # table from sql db
    match_table = "match_result"
    tfbs_state = '''SELECT seq_id, matrix_id FROM %s;''' % match_table

    # sqlite3 connections are not closed by garbage collection in a
    # timely manner, so release the handle as soon as the query is done
    dbh = sqlite3.connect(PARAMS['database'])
    try:
        tfbs_table = pdsql.read_sql(tfbs_state, dbh, index_col='matrix_id')
    finally:
        dbh.close()

    def _load_gc_table(path):
        # row labels are cut at the first space; the remaining text is
        # used both as the index and as an explicit gene_id column
        table = pandas.read_table(path, sep="\t", index_col=0, header=0)
        gene_ids = [label.split(" ")[0] for label in table.index.tolist()]
        table['gene_id'] = gene_ids
        table.index = gene_ids
        return table

    # get foreground and background gene files
    # (matched against the whole path, not just the file name)
    background = [inf for inf in infiles if "background" in inf][0]
    foreground = [inf for inf in infiles if "foreground" in inf][0]

    # setup gc content dataframes
    back_gc = _load_gc_table(background)
    fore_gc = _load_gc_table(foreground)

    # run permutation significance testing
    perms = int(PARAMS['sig_testing_nperms'])
    out_dict = PipelineTFM.permuteTFBSEnrich(tfbs_table=tfbs_table,
                                             fg_gc=fore_gc,
                                             bg_gc=back_gc,
                                             nPerms=perms)

    # NOTE(review): presumably out_dict is keyed by matrix id, hence the
    # transpose before writing one row per key - confirm against
    # PipelineTFM.permuteTFBSEnrich
    out_frame = pandas.DataFrame(out_dict).T
    out_frame.to_csv(outfile, sep="\t", index_label='matrix_id')
def estimateEnrichmentOfTFBS(infiles, outfile):
    '''
    Test for enrichment of TFBS within a gene set by permutation.

    TFBS matches are read from the ``match_result`` table of the SQLite
    database named by ``PARAMS['database']``.  The foreground and
    background GC content tables are picked out of `infiles` by looking
    for "foreground" and "background" anywhere in each path.

    Arguments
    ---------
    infiles : sequence of str
        Paths to tab-separated tables with a header row whose first
        column is used as the row label.  One path must contain
        "background" and one must contain "foreground"; when several
        match, the first is used.
    outfile : str
        Path of the tab-separated results table, written with the
        index column labelled ``matrix_id``.

    Raises
    ------
    IndexError
        If no path in `infiles` contains "background" or "foreground".
    '''

    E.info("Running permutation testing for TFBS enrichment between %s"
           % " & ".join([os.path.basename(x) for x in infiles]))

    # table from sql db
    match_table = "match_result"
    tfbs_state = '''SELECT seq_id, matrix_id FROM %s;''' % match_table

    # sqlite3 connections are not closed by garbage collection in a
    # timely manner, so release the handle as soon as the query is done
    dbh = sqlite3.connect(PARAMS['database'])
    try:
        tfbs_table = pdsql.read_sql(tfbs_state, dbh, index_col='matrix_id')
    finally:
        dbh.close()

    def _load_gc_table(path):
        # row labels are cut at the first space; the remaining text is
        # used both as the index and as an explicit gene_id column
        table = pandas.read_table(path, sep="\t", index_col=0, header=0)
        gene_ids = [label.split(" ")[0] for label in table.index.tolist()]
        table['gene_id'] = gene_ids
        table.index = gene_ids
        return table

    # get foreground and background gene files
    # (matched against the whole path, not just the file name)
    background = [inf for inf in infiles if "background" in inf][0]
    foreground = [inf for inf in infiles if "foreground" in inf][0]

    # setup gc content dataframes
    back_gc = _load_gc_table(background)
    fore_gc = _load_gc_table(foreground)

    # run permutation significance testing
    perms = int(PARAMS['sig_testing_nperms'])
    out_dict = PipelineTFM.permuteTFBSEnrich(tfbs_table=tfbs_table,
                                             fg_gc=fore_gc,
                                             bg_gc=back_gc,
                                             nPerms=perms)

    # NOTE(review): presumably out_dict is keyed by matrix id, hence the
    # transpose before writing one row per key - confirm against
    # PipelineTFM.permuteTFBSEnrich
    out_frame = pandas.DataFrame(out_dict).T
    out_frame.to_csv(outfile, sep="\t", index_label='matrix_id')
def estimateEnrichmentOfTFBS(infiles, outfile):
    '''
    Test for enrichment of TFBS within a gene set by permutation.

    TFBS matches are read from the ``match_result`` table of the SQLite
    database named by ``PARAMS['database']``.  The foreground and
    background GC content tables are picked out of `infiles` by looking
    for "foreground" and "background" anywhere in each path.  The
    number of permutations is ``PARAMS['sig_testing_nperms']``, capped
    at the value returned by ``PipelineTFM.nCr`` for the background and
    foreground table sizes.  P-values are adjusted with R's
    ``p.adjust`` and stored in a ``qvalue`` column.

    Arguments
    ---------
    infiles : sequence of str
        Paths to tab-separated tables with a header row whose first
        column is used as the row label.  One path must contain
        "background" and one must contain "foreground"; when several
        match, the first is used.
    outfile : str
        Path of the tab-separated results table, written with the
        index column labelled ``matrix_id``.

    Raises
    ------
    IndexError
        If no path in `infiles` contains "background" or "foreground".
    ValueError
        If fewer than 1000 permutations are possible with the given
        background and foreground sizes.
    '''

    E.info("Running permutation testing for TFBS enrichment between %s"
           % " & ".join([os.path.basename(x) for x in infiles]))

    # table from sql db
    match_table = "match_result"
    tfbs_state = '''SELECT matrix_id, seq_id FROM %s;''' % match_table

    # sqlite3 connections are not closed by garbage collection in a
    # timely manner, so release the handle as soon as the query is done
    dbh = sqlite3.connect(PARAMS['database'])
    try:
        tfbs_table = pdsql.read_sql(sql=tfbs_state,
                                    con=dbh,
                                    index_col='matrix_id')
    finally:
        dbh.close()

    def _load_gc_table(path):
        # row labels are cut at the first space; the remaining text is
        # used both as the index and as an explicit gene_id column
        table = pandas.read_table(path, sep="\t", index_col=0, header=0)
        gene_ids = [label.split(" ")[0] for label in table.index.tolist()]
        table['gene_id'] = gene_ids
        table.index = gene_ids
        return table

    # get foreground and background gene files
    # (matched against the whole path, not just the file name)
    background = [inf for inf in infiles if "background" in inf][0]
    foreground = [inf for inf in infiles if "foreground" in inf][0]

    # setup gc content dataframes
    back_gc = _load_gc_table(background)
    fore_gc = _load_gc_table(foreground)

    # run permutation significance testing
    # check if there are sufficient genes in the background
    # to do a permutation test.  If not, refuse to run and point the
    # user at Fisher's exact test instead.
    # NOTE(review): presumably nCr returns the number of distinct
    # foreground-sized draws from the background - confirm against
    # PipelineTFM.nCr
    min_perms = 1000
    perms = int(PARAMS['sig_testing_nperms'])
    poss_perms = PipelineTFM.nCr(n=len(back_gc.index),
                                 r=len(fore_gc.index))

    if poss_perms < min_perms:
        E.warn("Insufficient background genes to perform "
               "permutations. Please use Fisher's Exact test")
        raise ValueError("Insufficient background size. "
                         "Cannot use permutation test")

    # never ask for more permutations than the background allows; this
    # also covers poss_perms == min_perms, which previously slipped
    # through uncapped
    n_perms = min(perms, poss_perms)
    if n_perms < perms:
        E.info("Maximum possible permutations with this background"
               " set is %i. Running %i permutations only"
               % (poss_perms, poss_perms))

    # both the capped and uncapped runs use the same configured
    # background matching statistic
    bg_stat = PARAMS["background_match_stat"]
    out_dict = PipelineTFM.permuteTFBSEnrich(tfbs_table=tfbs_table,
                                             fg_gc=fore_gc,
                                             bg_gc=back_gc,
                                             nPerms=n_perms,
                                             bg_stat=bg_stat)

    # NOTE(review): presumably out_dict is keyed by matrix id, hence the
    # transpose to get one row per key - confirm against
    # PipelineTFM.permuteTFBSEnrich
    out_frame = pandas.DataFrame(out_dict).T

    # NOTE(review): p.adjust is called without a method argument, so R's
    # default correction applies (Holm, per the R documentation) -
    # confirm that is what the "qvalue" column is meant to hold
    pyadjust = R['p.adjust']
    pvs = robjects.FloatVector(list(out_frame['pvalue']))
    out_frame['qvalue'] = pyadjust(pvs)

    out_frame.to_csv(outfile, sep="\t", index_label='matrix_id')
def estimateEnrichmentOfTFBS(infiles, outfile):
    '''
    Test for enrichment of TFBS within a gene set by permutation.

    TFBS matches are read from the ``match_result`` table of the SQLite
    database named by ``PARAMS['database']``.  The foreground and
    background GC content tables are picked out of `infiles` by looking
    for "foreground" and "background" anywhere in each path.  The
    number of permutations is ``PARAMS['sig_testing_nperms']``, capped
    at the value returned by ``PipelineTFM.nCr`` for the background and
    foreground table sizes.  P-values are adjusted with R's
    ``p.adjust`` and stored in a ``qvalue`` column.

    Arguments
    ---------
    infiles : sequence of str
        Paths to tab-separated tables with a header row whose first
        column is used as the row label.  One path must contain
        "background" and one must contain "foreground"; when several
        match, the first is used.
    outfile : str
        Path of the tab-separated results table, written with the
        index column labelled ``matrix_id``.

    Raises
    ------
    IndexError
        If no path in `infiles` contains "background" or "foreground".
    ValueError
        If fewer than 1000 permutations are possible with the given
        background and foreground sizes.
    '''

    E.info("Running permutation testing for TFBS enrichment between %s"
           % " & ".join([os.path.basename(x) for x in infiles]))

    # table from sql db
    match_table = "match_result"
    tfbs_state = '''SELECT matrix_id, seq_id FROM %s;''' % match_table

    # sqlite3 connections are not closed by garbage collection in a
    # timely manner, so release the handle as soon as the query is done
    dbh = sqlite3.connect(PARAMS['database'])
    try:
        tfbs_table = pdsql.read_sql(sql=tfbs_state,
                                    con=dbh,
                                    index_col='matrix_id')
    finally:
        dbh.close()

    def _load_gc_table(path):
        # row labels are cut at the first space; the remaining text is
        # used both as the index and as an explicit gene_id column
        table = pandas.read_table(path, sep="\t", index_col=0, header=0)
        gene_ids = [label.split(" ")[0] for label in table.index.tolist()]
        table['gene_id'] = gene_ids
        table.index = gene_ids
        return table

    # get foreground and background gene files
    # (matched against the whole path, not just the file name)
    background = [inf for inf in infiles if "background" in inf][0]
    foreground = [inf for inf in infiles if "foreground" in inf][0]

    # setup gc content dataframes
    back_gc = _load_gc_table(background)
    fore_gc = _load_gc_table(foreground)

    # run permutation significance testing
    # check if there are sufficient genes in the background
    # to do a permutation test.  If not, refuse to run and point the
    # user at Fisher's exact test instead.
    # NOTE(review): presumably nCr returns the number of distinct
    # foreground-sized draws from the background - confirm against
    # PipelineTFM.nCr
    min_perms = 1000
    perms = int(PARAMS['sig_testing_nperms'])
    poss_perms = PipelineTFM.nCr(n=len(back_gc.index),
                                 r=len(fore_gc.index))

    if poss_perms < min_perms:
        E.warn("Insufficient background genes to perform "
               "permutations. Please use Fisher's Exact test")
        raise ValueError("Insufficient background size. "
                         "Cannot use permutation test")

    # never ask for more permutations than the background allows; this
    # also covers poss_perms == min_perms, which previously slipped
    # through uncapped
    n_perms = min(perms, poss_perms)
    if n_perms < perms:
        E.info("Maximum possible permutations with this background"
               " set is %i. Running %i permutations only"
               % (poss_perms, poss_perms))

    # both the capped and uncapped runs use the same configured
    # background matching statistic
    bg_stat = PARAMS["background_match_stat"]
    out_dict = PipelineTFM.permuteTFBSEnrich(tfbs_table=tfbs_table,
                                             fg_gc=fore_gc,
                                             bg_gc=back_gc,
                                             nPerms=n_perms,
                                             bg_stat=bg_stat)

    # NOTE(review): presumably out_dict is keyed by matrix id, hence the
    # transpose to get one row per key - confirm against
    # PipelineTFM.permuteTFBSEnrich
    out_frame = pandas.DataFrame(out_dict).T

    # NOTE(review): p.adjust is called without a method argument, so R's
    # default correction applies (Holm, per the R documentation) -
    # confirm that is what the "qvalue" column is meant to hold
    pyadjust = R['p.adjust']
    pvs = robjects.FloatVector(list(out_frame['pvalue']))
    out_frame['qvalue'] = pyadjust(pvs)

    out_frame.to_csv(outfile, sep="\t", index_label='matrix_id')