Exemplo n.º 1
0
    def estimateEnrichmentOfTFBS(infiles, outfile):
        '''
        Test for enrichment of TFBS within a gene set by permutation.

        The ``seq_id``/``matrix_id`` pairs are read from the
        ``match_result`` table of the SQLite database named by
        ``PARAMS['database']``.  Together with the foreground and
        background tables they are handed to
        ``PipelineTFM.permuteTFBSEnrich``; the dictionary it returns is
        transposed into a data frame and written to `outfile`.

        infiles: sequence of file paths.  The first path matching
            "background" and the first matching "foreground" are used.
        outfile: path of the tab-separated output; its index column is
            labelled ``matrix_id``.

        Raises IndexError if no path in `infiles` matches "background"
        or "foreground".
        '''

        E.info("Running permutation testing for TFBS enrichment between %s" %
               " & ".join([os.path.basename(x) for x in infiles]))

        # table from sql db.  The connection is closed as soon as the
        # table is in memory, even if the query itself fails.
        match_table = "match_result"
        tfbs_state = '''SELECT seq_id, matrix_id FROM %s;''' % match_table
        dbh = sqlite3.connect(PARAMS['database'])
        try:
            tfbs_table = pdsql.read_sql(tfbs_state, dbh, index_col='matrix_id')
        finally:
            dbh.close()

        def _load_gc_table(path):
            # Read one tab-separated table and re-key it on the text that
            # precedes the first space of each original index label.
            table = pandas.read_table(path,
                                      sep="\t",
                                      index_col=0,
                                      header=0)
            gene_ids = [x.split(" ")[0] for x in table.index.tolist()]
            table['gene_id'] = gene_ids
            table.index = gene_ids
            return table

        # get foreground and background gene files
        # setup gc content dataframes

        background = [inf for inf in infiles
                      if re.search("background", inf)][0]
        foreground = [inf for inf in infiles
                      if re.search("foreground", inf)][0]

        back_gc = _load_gc_table(background)
        fore_gc = _load_gc_table(foreground)

        # run permutation significance testing

        perms = int(PARAMS['sig_testing_nperms'])
        out_dict = PipelineTFM.permuteTFBSEnrich(tfbs_table=tfbs_table,
                                                 fg_gc=fore_gc,
                                                 bg_gc=back_gc,
                                                 nPerms=perms)

        # one row per key of out_dict after transposing
        out_frame = pandas.DataFrame(out_dict).T

        out_frame.to_csv(outfile, sep="\t", index_label='matrix_id')
Exemplo n.º 2
0
    def estimateEnrichmentOfTFBS(infiles, outfile):
        '''
        Test for enrichment of TFBS within a gene set by permutation.

        The ``seq_id``/``matrix_id`` pairs are read from the
        ``match_result`` table of the SQLite database named by
        ``PARAMS['database']``.  Together with the foreground and
        background tables they are handed to
        ``PipelineTFM.permuteTFBSEnrich``; the dictionary it returns is
        transposed into a data frame and written to `outfile`.

        infiles: sequence of file paths.  The first path matching
            "background" and the first matching "foreground" are used.
        outfile: path of the tab-separated output; its index column is
            labelled ``matrix_id``.

        Raises IndexError if no path in `infiles` matches "background"
        or "foreground".
        '''

        E.info("Running permutation testing for TFBS enrichment between %s" %
               " & ".join([os.path.basename(x) for x in infiles]))

        # table from sql db.  The connection is closed as soon as the
        # table is in memory, even if the query itself fails.
        match_table = "match_result"
        tfbs_state = '''SELECT seq_id, matrix_id FROM %s;''' % match_table
        dbh = sqlite3.connect(PARAMS['database'])
        try:
            tfbs_table = pdsql.read_sql(tfbs_state, dbh, index_col='matrix_id')
        finally:
            dbh.close()

        def _load_gc_table(path):
            # Read one tab-separated table and re-key it on the text that
            # precedes the first space of each original index label.
            table = pandas.read_table(path,
                                      sep="\t",
                                      index_col=0,
                                      header=0)
            gene_ids = [x.split(" ")[0] for x in table.index.tolist()]
            table['gene_id'] = gene_ids
            table.index = gene_ids
            return table

        # get foreground and background gene files
        # setup gc content dataframes

        background = [inf for inf in infiles if re.search("background",
                                                          inf)][0]
        foreground = [inf for inf in infiles if re.search("foreground",
                                                          inf)][0]

        back_gc = _load_gc_table(background)
        fore_gc = _load_gc_table(foreground)

        # run permutation significance testing

        perms = int(PARAMS['sig_testing_nperms'])
        out_dict = PipelineTFM.permuteTFBSEnrich(tfbs_table=tfbs_table,
                                                 fg_gc=fore_gc,
                                                 bg_gc=back_gc,
                                                 nPerms=perms)

        # one row per key of out_dict after transposing
        out_frame = pandas.DataFrame(out_dict).T

        out_frame.to_csv(outfile, sep="\t", index_label='matrix_id')
Exemplo n.º 3
0
    def estimateEnrichmentOfTFBS(infiles, outfile):
        '''
        Test for enrichment of TFBS within a gene set by permutation.

        The ``matrix_id``/``seq_id`` pairs are read from the
        ``match_result`` table of the SQLite database named by
        ``PARAMS['database']``.  Together with the foreground and
        background tables they are handed to
        ``PipelineTFM.permuteTFBSEnrich``.  The dictionary it returns is
        transposed into a data frame, a ``qvalue`` column is added by
        passing the ``pvalue`` column through R's ``p.adjust`` (no method
        argument is given, so R's default applies), and the frame is
        written to `outfile`.

        The number of permutations is ``PARAMS['sig_testing_nperms']``,
        capped at the value ``PipelineTFM.nCr`` returns for
        n = background rows and r = foreground rows.

        infiles: sequence of file paths.  The first path matching
            "background" and the first matching "foreground" are used.
        outfile: path of the tab-separated output; its index column is
            labelled ``matrix_id``.

        Raises ValueError if fewer than 1000 permutations are possible,
        and IndexError if no path in `infiles` matches "background" or
        "foreground".
        '''

        E.info("Running permutation testing for TFBS enrichment between %s" %
               " & ".join([os.path.basename(x) for x in infiles]))

        # table from sql db.  The connection is closed as soon as the
        # table is in memory, even if the query itself fails.
        match_table = "match_result"
        tfbs_state = '''SELECT matrix_id, seq_id FROM %s;''' % match_table
        dbh = sqlite3.connect(PARAMS['database'])
        try:
            tfbs_table = pdsql.read_sql(sql=tfbs_state,
                                        con=dbh,
                                        index_col='matrix_id')
        finally:
            dbh.close()

        def _load_gc_table(path):
            # Read one tab-separated table and re-key it on the text that
            # precedes the first space of each original index label.
            table = pandas.read_table(path,
                                      sep="\t",
                                      index_col=0,
                                      header=0)
            gene_ids = [x.split(" ")[0] for x in table.index.tolist()]
            table['gene_id'] = gene_ids
            table.index = gene_ids
            return table

        # get foreground and background gene files
        # setup gc content dataframes

        background = [inf for inf in infiles
                      if re.search("background", inf)][0]
        foreground = [inf for inf in infiles
                      if re.search("foreground", inf)][0]

        back_gc = _load_gc_table(background)
        fore_gc = _load_gc_table(foreground)

        # run permutation significance testing
        # check if there are sufficient genes in the background
        # to do a permutation test.  If not, stop and point the user
        # at Fisher's exact test instead.
        # if yes, but fewer than the requested number of permutations
        # are possible, limit to the possible number.

        perms = int(PARAMS['sig_testing_nperms'])
        poss_perms = PipelineTFM.nCr(n=len(back_gc.index),
                                     r=len(fore_gc.index))
        if poss_perms < 1000:
            E.warn("Insufficient background genes to perform "
                   "permutations.  Please use Fisher's Exact test")
            raise ValueError("Insufficient background size. "
                             "Cannot use permutation test")

        # poss_perms >= 1000 from here on; never request more
        # permutations than the background can provide (this includes
        # the boundary case of exactly 1000 possible permutations).
        if poss_perms < perms:
            E.info("Maximum possible permutations with this background"
                   " set is %i.  Running %i permutations only" %
                   (poss_perms, poss_perms))
            perms = poss_perms

        # the same background matching statistic is used whether or not
        # the permutation count was capped
        bg_stat = PARAMS["background_match_stat"]
        out_dict = PipelineTFM.permuteTFBSEnrich(tfbs_table=tfbs_table,
                                                 fg_gc=fore_gc,
                                                 bg_gc=back_gc,
                                                 nPerms=perms,
                                                 bg_stat=bg_stat)

        # one row per key of out_dict after transposing
        out_frame = pandas.DataFrame(out_dict).T

        # multiple testing correction via R
        pyadjust = R['p.adjust']
        pvs = robjects.FloatVector([p for p in out_frame['pvalue']])
        out_frame['qvalue'] = pyadjust(pvs)

        out_frame.to_csv(outfile, sep="\t", index_label='matrix_id')
    def estimateEnrichmentOfTFBS(infiles, outfile):
        '''
        Test for enrichment of TFBS within a gene set by permutation.

        The ``matrix_id``/``seq_id`` pairs are read from the
        ``match_result`` table of the SQLite database named by
        ``PARAMS['database']``.  Together with the foreground and
        background tables they are handed to
        ``PipelineTFM.permuteTFBSEnrich``.  The dictionary it returns is
        transposed into a data frame, a ``qvalue`` column is added by
        passing the ``pvalue`` column through R's ``p.adjust`` (no method
        argument is given, so R's default applies), and the frame is
        written to `outfile`.

        The number of permutations is ``PARAMS['sig_testing_nperms']``,
        capped at the value ``PipelineTFM.nCr`` returns for
        n = background rows and r = foreground rows.

        infiles: sequence of file paths.  The first path matching
            "background" and the first matching "foreground" are used.
        outfile: path of the tab-separated output; its index column is
            labelled ``matrix_id``.

        Raises ValueError if fewer than 1000 permutations are possible,
        and IndexError if no path in `infiles` matches "background" or
        "foreground".
        '''

        E.info("Running permutation testing for TFBS enrichment between %s" %
               " & ".join([os.path.basename(x) for x in infiles]))

        # table from sql db.  The connection is closed as soon as the
        # table is in memory, even if the query itself fails.
        match_table = "match_result"
        tfbs_state = '''SELECT matrix_id, seq_id FROM %s;''' % match_table
        dbh = sqlite3.connect(PARAMS['database'])
        try:
            tfbs_table = pdsql.read_sql(sql=tfbs_state,
                                        con=dbh,
                                        index_col='matrix_id')
        finally:
            dbh.close()

        def _load_gc_table(path):
            # Read one tab-separated table and re-key it on the text that
            # precedes the first space of each original index label.
            table = pandas.read_table(path,
                                      sep="\t",
                                      index_col=0,
                                      header=0)
            gene_ids = [x.split(" ")[0] for x in table.index.tolist()]
            table['gene_id'] = gene_ids
            table.index = gene_ids
            return table

        # get foreground and background gene files
        # setup gc content dataframes

        background = [inf for inf in infiles if re.search("background",
                                                          inf)][0]
        foreground = [inf for inf in infiles if re.search("foreground",
                                                          inf)][0]

        back_gc = _load_gc_table(background)
        fore_gc = _load_gc_table(foreground)

        # run permutation significance testing
        # check if there are sufficient genes in the background
        # to do a permutation test.  If not, stop and point the user
        # at Fisher's exact test instead.
        # if yes, but fewer than the requested number of permutations
        # are possible, limit to the possible number.

        perms = int(PARAMS['sig_testing_nperms'])
        poss_perms = PipelineTFM.nCr(n=len(back_gc.index),
                                     r=len(fore_gc.index))
        if poss_perms < 1000:
            E.warn("Insufficient background genes to perform "
                   "permutations.  Please use Fisher's Exact test")
            raise ValueError("Insufficient background size. "
                             "Cannot use permutation test")

        # poss_perms >= 1000 from here on; never request more
        # permutations than the background can provide (this includes
        # the boundary case of exactly 1000 possible permutations).
        if poss_perms < perms:
            E.info("Maximum possible permutations with this background"
                   " set is %i.  Running %i permutations only" % (poss_perms,
                                                                  poss_perms))
            perms = poss_perms

        # the same background matching statistic is used whether or not
        # the permutation count was capped
        bg_stat = PARAMS["background_match_stat"]
        out_dict = PipelineTFM.permuteTFBSEnrich(tfbs_table=tfbs_table,
                                                 fg_gc=fore_gc,
                                                 bg_gc=back_gc,
                                                 nPerms=perms,
                                                 bg_stat=bg_stat)

        # one row per key of out_dict after transposing
        out_frame = pandas.DataFrame(out_dict).T

        # multiple testing correction via R
        pyadjust = R['p.adjust']
        pvs = robjects.FloatVector([p for p in out_frame['pvalue']])
        out_frame['qvalue'] = pyadjust(pvs)

        out_frame.to_csv(outfile, sep="\t", index_label='matrix_id')