Пример #1
0
        def estimateEnrichmentOfTFBS(infiles, outfile):
            '''
            Estimate the significance of transcription factors that are
            associated with a foreground set of intervals vs a background set.

            The background and foreground inputs are recognised by the
            substrings "background" and "foreground" in their names, so the
            order of ``infiles`` does not matter.  The test itself is
            delegated to ``PipelineTFM.testSignificanceOfMatrices``.

            Raises IndexError if no entry of ``infiles`` contains the
            substring looked for.
            '''
            labels = " & ".join(os.path.basename(name) for name in infiles)
            E.info("Running Fisher's exact test for TF enrichment between %s" %
                   labels)

            def pick(tag):
                # first entry of infiles whose name contains the tag
                return [name for name in infiles if re.search(tag, name)][0]

            bg_file = pick("background")
            fg_file = pick("foreground")

            # run significance testing against the "match_result" table
            PipelineTFM.testSignificanceOfMatrices(
                bg_file, fg_file, PARAMS["database"], "match_result", outfile)

            E.info("Completed Fisher's exact test for "
                   "TF enrichment between %s" % labels)
Пример #2
0
    def matchBackgroundForSequenceComposition(infiles, outfile):
        '''
        Take the background set and subset it for intervals whose sequence
        composition distribution is the same as that of the foreground set
        (for the composition statistic specified in the config file).
        This requires that the background set is sufficiently large.

        The track name is parsed from ``infiles[0]``, which has to look like
        ``GC_content.dir/<track>.(background|foreground).gc.load``; if it
        does not, ``re.match`` returns None and an AttributeError is raised.
        The work itself is done by ``PipelineTFM.matchBgSequenceComposition``.
        '''
        # get gene set name; raw strings keep "\." a regex escape rather
        # than an invalid Python string escape
        track = re.match(r"GC_content.dir/"
                         r"(.+)\.(?:background|foreground)\.gc\.load",
                         infiles[0]).groups()[0]

        # name of the background gene list for this track
        input_background = "%s.background.tsv" % track

        # name of the foreground gene list for this track
        input_foreground = "%s.foreground.tsv" % track

        # get name of fasta file containing intervals: the last
        # len(".gtf.gz") characters of the input file name are dropped
        fasta_file = os.path.basename(INPUT_FILE)[:-len(".gtf.gz")]
        fasta_file = os.path.join("fasta.dir", fasta_file)

        PipelineTFM.matchBgSequenceComposition(infiles,
                                               input_background,
                                               input_foreground,
                                               fasta_file,
                                               outfile,
                                               PARAMS["database"],
                                               PARAMS["genesets_header"],
                                               PARAMS["background_match_stat"])
 def calculateCpGcomposition(infiles, outfile):
     '''
     Calculate the GC content for the CpG matched data.

     The first two entries of ``infiles`` and ``outfile`` are handed to
     ``PipelineTransfacMatch.calculateCpGComposition``.  The result should
     be the same as the CpG content of the foreground set.
     '''
     compute = PipelineTransfacMatch.calculateCpGComposition
     compute(infiles[0], infiles[1], outfile)
Пример #4
0
def calculateGCContent(infiles, outfile):
    '''
    Calculate the GC content across foreground and background sets.

    Delegates to ``PipelineTransfacMatch.calculateCpGComposition`` with the
    first two entries of ``infiles`` and ``outfile``.
    '''
    compute = PipelineTransfacMatch.calculateCpGComposition
    compute(infiles[0], infiles[1], outfile)
Пример #5
0
        def estimateEnrichmentOfTFBS(infiles, outfile):
            '''
            Estimate the significance of transcription factors that are
            associated with a foreground set of intervals vs a background set.

            ``infiles`` may list the two inputs in any order; each one is
            identified by "background" or "foreground" occurring in its
            name.  ``PipelineTFM.testSignificanceOfMatrices`` performs the
            test and receives ``outfile``.

            Raises IndexError if either input cannot be found in ``infiles``.
            '''
            joined_names = " & ".join(map(os.path.basename, infiles))
            E.info("Running Fisher's exact test for TF enrichment between %s" %
                   joined_names)

            # required files
            match_table = "match_result"

            # the order of the inputs is unknown, so filter by name
            background_hits = list(
                filter(lambda name: re.search("background", name), infiles))
            background = background_hits[0]
            foreground_hits = list(
                filter(lambda name: re.search("foreground", name), infiles))
            foreground = foreground_hits[0]

            # run significance testing
            test_args = (background, foreground, PARAMS["database"],
                         match_table, outfile)
            PipelineTFM.testSignificanceOfMatrices(*test_args)

            E.info("Completed Fisher's exact test for "
                   "TF enrichment between %s" % joined_names)
Пример #6
0
def calculateGCContent(infiles, outfile):
    '''
    Calculate the GC content across foreground and background sets.

    The first two entries of ``infiles`` plus ``outfile`` are forwarded to
    ``PipelineTransfacMatch.calculateCpGComposition``.
    '''
    call_args = (infiles[0], infiles[1], outfile)
    PipelineTransfacMatch.calculateCpGComposition(*call_args)
Пример #7
0
    def estimateEnrichmentOfTFBS(infiles, outfile):
        '''
        Estimate the significance of transcription factors that are associated
        with a foreground set of intervals vs a background set matched for
        sequence composition.

        The background input is the entry of ``infiles`` whose name contains
        "background".  The foreground input is derived from the entry named
        ``<dir>/<track>.foreground.gc.tsv`` and passed on as
        ``<track>.foreground.tsv``.

        Raises IndexError if either entry is missing and AttributeError if a
        name containing "foreground" does not follow the expected pattern.
        '''
        labels = " & ".join([os.path.basename(x) for x in infiles])
        E.info("Running Fisher's exact test for TF enrichment between %s" %
               labels)

        # required files
        match_table = "match_result"

        # we don't know which order the foreground and background will come in
        background = [infile for infile in infiles if
                      re.search("background", infile)][0]
        # raw string: "\." must reach the regex engine as an escaped dot
        foreground = ["%s.foreground.tsv" %
                      re.match(r".+/(.+)\.foreground\.gc\.tsv",
                               infile).groups()[0]
                      for infile in infiles if re.search("foreground",
                                                         infile)][0]
        # run significance testing
        PipelineTFM.testSignificanceOfMatrices(background,
                                               foreground,
                                               PARAMS["database"],
                                               match_table,
                                               outfile,
                                               PARAMS["genesets_header"])

        E.info("Completed Fisher's exact test for TF enrichment between %s" %
               labels)
Пример #8
0
    def matchBackgroundForSequenceComposition(infiles, outfile):
        '''
        Take the background set and subset it for intervals whose sequence
        composition distribution is the same as that of the foreground set
        (for the composition statistic specified in the config file).
        This requires that the background set is sufficiently large.

        ``infiles[0]`` must be named
        ``GC_content.dir/<track>.(background|foreground).gc.load``; the track
        part is used to build the names of the background and foreground
        tables.  An AttributeError is raised if the name does not match.
        '''
        # get gene set name (raw strings so that "\." is a regex escape and
        # not an invalid string escape)
        track = re.match(
            r"GC_content.dir/"
            r"(.+)\.(?:background|foreground)\.gc\.load",
            infiles[0]).groups()[0]

        # name of the background gene list for this track
        input_background = "%s.background.tsv" % track

        # name of the foreground gene list for this track
        input_foreground = "%s.foreground.tsv" % track

        # get name of fasta file containing intervals; the trailing
        # len(".gtf.gz") characters of the input file name are cut off
        fasta_file = os.path.basename(INPUT_FILE)[:-len(".gtf.gz")]
        fasta_file = os.path.join("fasta.dir", fasta_file)

        PipelineTFM.matchBgSequenceComposition(infiles, input_background,
                                               input_foreground, fasta_file,
                                               outfile, PARAMS["database"],
                                               PARAMS["genesets_header"],
                                               PARAMS["background_match_stat"])
Пример #9
0
 def calculateCpGcomposition(infiles, outfile):
     '''
     Calculate the GC content for the CpG matched data.

     Calls ``PipelineTransfacMatch.calculateCpGComposition`` on the first
     two entries of ``infiles`` and ``outfile``.  The result should be the
     same as the CpG content of the foreground set.
     '''
     call_args = (infiles[0], infiles[1], outfile)
     PipelineTransfacMatch.calculateCpGComposition(*call_args)
Пример #10
0
def calculateGCContent(infiles, outfile):
    '''
    Calculate the GC content across foreground and background sets.

    Forwards the first two entries of ``infiles``, ``outfile`` and the
    configured ``genesets_header`` to
    ``PipelineTFM.calculateSequenceComposition``.
    '''
    compose = PipelineTFM.calculateSequenceComposition
    first_input = infiles[0]
    second_input = infiles[1]
    compose(first_input, second_input, outfile, PARAMS["genesets_header"])
Пример #11
0
 def calculateMatchedGCComposition(infiles, outfile):
     '''
     Calculate the GC content for the CpG matched data.

     Hands the first two entries of ``infiles`` and ``outfile`` to
     ``PipelineTFM.calculateSequenceComposition``.  The result should be
     the same as the CpG content of the foreground set.
     '''
     compose = PipelineTFM.calculateSequenceComposition
     compose(infiles[0], infiles[1], outfile)
Пример #12
0
 def calculateMatchedGCComposition(infiles, outfile):
     '''
     Calculate the GC content for the CpG matched data.

     ``PipelineTFM.calculateSequenceComposition`` is called with the first
     two entries of ``infiles`` followed by ``outfile``.  The result should
     be the same as the CpG content of the foreground set.
     '''
     call_args = (infiles[0], infiles[1], outfile)
     PipelineTFM.calculateSequenceComposition(*call_args)
def buildMatchMetrics(infile, outfile):
    '''
    Build frequency metrics for the transcription factors that match
    reports in the supplied sequences.  The metrics of interest are:

       * No. unique transcription factors found per sequence

       * Maximal number of TF motifs found per sequence

    The table name is built from the base name of
    ``P.snip(infile, ".load")`` (presumably ``infile`` without its
    ``.load`` suffix) followed by "_result".
    '''
    stem = os.path.basename(P.snip(infile, ".load"))
    tablename = filenameToTablename(stem) + "_result"
    database = PARAMS["database"]
    PipelineTransfacMatch.frequencyMetrics(database, tablename, outfile)
Пример #14
0
def buildMatchMetrics(infile, outfile):
    '''
    Build frequency metrics for the transcription factors that match
    reports in the supplied sequences.  The metrics of interest are:

       * No. unique transcription factors found per sequence

       * Maximal number of TF motifs found per sequence

    The queried table is named after ``infile``: directory dropped,
    ``P.snip(infile, ".load")`` applied (presumably removing the ``.load``
    suffix), converted by ``filenameToTablename`` and suffixed "_result".
    '''
    snipped = P.snip(infile, ".load")
    base_table = filenameToTablename(os.path.basename(snipped))
    tablename = base_table + "_result"
    PipelineTFM.frequencyMetrics(PARAMS["database"], tablename, outfile)
Пример #15
0
    def matchBackgroundForCpGComposition(infiles, outfile):
        '''
        Take the background set and subset it for intervals with
        a CpG distribution that is the same as the foreground set
        - this requires that the background set is sufficiently
        large.

        The track name is taken from ``infiles[0]``, which must be named
        ``GC_content.dir/<track>.(background|foreground).gc.load``; an
        AttributeError is raised if it is not.
        '''
        # raw string: "\." has to reach the regex engine as an escaped dot
        track = re.match(
            r"GC_content.dir/(.+)\.(?:background|foreground)\.gc\.load",
            infiles[0]).groups()[0]
        input_background = "%s.background.tsv" % track
        input_foreground = "%s.foreground.tsv" % track

        PipelineTransfacMatch.matchBackgroundForCpGComposition(
            infiles, input_background, input_foreground, PARAMS["database"],
            outfile)
Пример #16
0
def loadMatchResults(infile, outfile):
    '''
    Load the results of the match analysis into sqlite database.

    Each record yielded by ``PipelineTFM.match_iterator(infile)`` becomes one
    tab-separated row of a temporary file created in ``./match.dir``.  That
    file is fed to ``csv2db.py`` via ``P.run()`` and deleted afterwards.
    '''
    field_names = ("seq_id", "matrix_id", "position", "strand",
                   "core_score", "matrix_score", "sequence")

    temp = P.getTempFile("./match.dir")
    temp.write("\t".join(field_names) + "\n")
    for hit in PipelineTFM.match_iterator(infile):
        values = [str(getattr(hit, name)) for name in field_names]
        temp.write("\t".join(values) + "\n")
    temp.close()

    # NOTE(review): P.run() is called without arguments, so it presumably
    # reads `statement` and the other locals below from this frame - keep
    # their names unchanged.
    to_cluster = True
    job_options = "-l mem_free=64G"
    inf = temp.name
    tablename = filenameToTablename(os.path.basename(infile))
    statement = "  ".join(["python %(scriptsdir)s/csv2db.py",
                           "-t %(tablename)s",
                           "--log=%(outfile)s.log",
                           "--index=seq_id",
                           "%(csv2db_options)s"]) + " < %(inf)s > %(outfile)s"
    P.run()
    os.unlink(temp.name)
Пример #17
0
def loadMatchResults(infile, outfile):
    '''
    Load the results of the match analysis into
    sqlite database.

    Every record yielded by ``PipelineTransfacMatch.match_iterator(infile)``
    is written as a tab-separated row to a temporary file, which is then
    redirected into ``csv2db.py`` and removed afterwards.
    '''
    temp = P.getTempFile()
    temp.write(
        "seq_id\tmatrix_id\tposition\tstrand\tcore_score\tmatrix_score\tsequence\n"
    )
    for details in PipelineTransfacMatch.match_iterator(infile):
        temp.write("\t".join(
            map(str, [
                details.seq_id, details.matrix_id, details.position,
                details.strand, details.core_score, details.matrix_score,
                details.sequence
            ])) + "\n")
    # flush buffered rows to disk before the shell command reads the file
    # by name; without this the tail of the data may be missing
    temp.close()

    # NOTE(review): P.run() is called without arguments, so it presumably
    # reads `statement`, `inf` and `tablename` from this frame - keep the
    # names unchanged.
    inf = temp.name
    tablename = filenameToTablename(os.path.basename(infile))
    statement = '''python %(scriptsdir)s/csv2db.py -t %(tablename)s
                   --log=%(outfile)s.log
                   --index=seq_id
                   %(csv2db_options)s
                   < %(inf)s > %(outfile)s'''
    P.run()
    os.remove(inf)
Пример #18
0
    def matchBackgroundForCpGComposition(infiles, outfile):
        '''
        Take the background set and subset it for intervals with
        a CpG distribution that is the same as the foreground set
        - this requires that the background set is sufficiently
        large.

        ``infiles[0]`` must be named
        ``GC_content.dir/<track>.(background|foreground).gc.load``; the
        track part names the background and foreground tables.  An
        AttributeError is raised if the name does not match.
        '''
        # raw string so that "\." is a regex escape, not an invalid
        # string escape
        pattern = r"GC_content.dir/(.+)\.(?:background|foreground)\.gc\.load"
        track = re.match(pattern, infiles[0]).groups()[0]
        input_background = "%s.background.tsv" % track
        input_foreground = "%s.foreground.tsv" % track

        PipelineTransfacMatch.matchBackgroundForCpGComposition(
            infiles, input_background, input_foreground, PARAMS["database"],
            outfile)
Пример #19
0
    def estimateEnrichmentOfTFBS(infiles, outfile):
        '''
        Estimate the significance of transcription factors that are
        associated with a foreground set of intervals vs a background set
        matched for CpG content.

        The background input is the entry of ``infiles`` whose name contains
        "background".  The foreground input is derived from the entry named
        ``<dir>/<track>.foreground.gc.tsv`` and passed on as
        ``<track>.foreground.tsv``.

        Raises IndexError if either entry is missing and AttributeError if a
        name containing "foreground" does not follow the expected pattern.
        '''
        # required files
        match_table = "match_result"

        # we don't know which order the foreground and background will come in
        background = [
            infile for infile in infiles if re.search("background", infile)][0]
        # raw string: "\." must reach the regex engine as an escaped dot
        foreground = [
            "%s.foreground.tsv" %
            re.match(r".+/(.+)\.foreground\.gc\.tsv", infile).groups()[0]
            for infile in infiles if re.search("foreground", infile)][0]
        # run significance testing
        PipelineTransfacMatch.testSignificanceOfMatrices(
            background, foreground, PARAMS["database"], match_table, outfile)
Пример #20
0
def loadMatchResults(infile, outfile):
    '''
    Load the results of the match analysis into sqlite database.

    The records from ``PipelineTFM.match_iterator(infile)`` are dumped as
    tab-separated rows into a temporary file under ``./match.dir``; the file
    is piped into ``csv2db.py`` through ``P.run()`` and then unlinked.
    '''
    row_template = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n"

    temp = P.getTempFile("./match.dir")
    temp.write(row_template % ("seq_id", "matrix_id", "position", "strand",
                               "core_score", "matrix_score", "sequence"))
    for hit in PipelineTFM.match_iterator(infile):
        temp.write(row_template % (hit.seq_id,
                                   hit.matrix_id,
                                   hit.position,
                                   hit.strand,
                                   hit.core_score,
                                   hit.matrix_score,
                                   hit.sequence))
    temp.close()

    # NOTE(review): P.run() takes no arguments here, so it presumably reads
    # `statement`, `inf`, `tablename`, `to_cluster` and `job_options` from
    # this frame - their names must stay as they are.
    to_cluster = True
    job_options = "-l mem_free=64G"

    inf = temp.name
    tablename = filenameToTablename(os.path.basename(infile))
    statement = ("python %(scriptsdir)s/csv2db.py  -t %(tablename)s"
                 "  --log=%(outfile)s.log  --index=seq_id"
                 "  %(csv2db_options)s < %(inf)s > %(outfile)s")
    P.run()
    os.unlink(temp.name)
Пример #21
0
    def estimateEnrichmentOfTFBS(infiles, outfile):
        '''
        Estimate the significance of transcription factors that are
        associated with a foreground set of intervals vs a background set.

        The two inputs are found in ``infiles`` by the substrings
        "background" and "foreground" in their names and passed to
        ``PipelineTransfacMatch.testSignificanceOfMatrices``.

        Raises IndexError if one of them is not present.
        '''
        def _select(keyword):
            # collect every name containing the keyword, keep the first
            hits = []
            for name in infiles:
                if re.search(keyword, name):
                    hits.append(name)
            return hits[0]

        # the inputs may arrive in either order
        background = _select("background")
        foreground = _select("foreground")

        # run significance testing against the "match_result" table
        PipelineTransfacMatch.testSignificanceOfMatrices(
            background, foreground, PARAMS["database"], "match_result",
            outfile)
Пример #22
0
        def estimateEnrichmentOfTFBS(infiles, outfile):
            '''
            Estimate the significance of transcription factors that are
            associated with a foreground set of intervals vs a background
            set matched for sequence composition.

            The background input is the entry of ``infiles`` whose name
            contains "background".  The foreground input is derived from the
            entry named ``<dir>/<track>.foreground.gc.tsv`` and passed on as
            ``<track>.foreground.tsv``.  The direction of the test is read
            from ``PARAMS['fisher_direction']``.

            Raises IndexError if either entry is missing and AttributeError
            if a name containing "foreground" does not follow the expected
            pattern.
            '''
            labels = " & ".join([os.path.basename(x) for x in infiles])
            E.info("Running Fisher's exact test for TF enrichment between %s" %
                   labels)

            # required files
            match_table = "match_result"

            # we don't know which order the foreground and background
            # will come in
            background = [
                infile for infile in infiles
                if re.search("background", infile)
            ][0]
            # raw string: "\." must reach the regex engine as an escaped dot
            foreground = [
                "%s.foreground.tsv" %
                re.match(r".+/(.+)\.foreground\.gc\.tsv", infile).groups()[0]
                for infile in infiles if re.search("foreground", infile)
            ][0]

            # run significance testing
            # MM: added in directionality into FET - might only be looking for
            # enrichment OR depletion so don't want to hammer those p-value
            # too hard

            pval_direct = PARAMS['fisher_direction']

            PipelineTFM.testSignificanceOfMatrices(background, foreground,
                                                   PARAMS["database"],
                                                   match_table, outfile,
                                                   PARAMS["genesets_header"],
                                                   pval_direct)

            E.info("Completed Fisher's exact test for "
                   "TF enrichment between %s" % labels)
Пример #23
0
        def estimateEnrichmentOfTFBS(infiles, outfile):
            '''
            Estimate the significance of transcription factors that are
            associated with a foreground set of intervals vs a background
            set matched for sequence composition.

            ``infiles`` may list the inputs in any order.  The entry whose
            name contains "background" is used as is; the entry named
            ``<dir>/<track>.foreground.gc.tsv`` is translated to
            ``<track>.foreground.tsv``.  The direction of the test comes from
            ``PARAMS['fisher_direction']``.

            Raises IndexError if either entry is missing and AttributeError
            if a name containing "foreground" does not follow the expected
            pattern.
            '''
            labels = " & ".join([os.path.basename(x) for x in infiles])
            E.info("Running Fisher's exact test for TF enrichment between %s" %
                   labels)

            # required files
            match_table = "match_result"

            # we don't know which order the foreground and background
            # will come in
            background = [infile for infile in infiles if
                          re.search("background", infile)][0]
            # raw string so that "\." is a regex escape, not an invalid
            # string escape
            foreground = ["%s.foreground.tsv" %
                          re.match(r".+/(.+)\.foreground\.gc\.tsv",
                                   infile).groups()[0]
                          for infile in infiles if re.search("foreground",
                                                             infile)][0]

            # run significance testing
            # MM: added in directionality into FET - might only be looking for
            # enrichment OR depletion so don't want to hammer those p-value
            # too hard

            pval_direct = PARAMS['fisher_direction']

            PipelineTFM.testSignificanceOfMatrices(background,
                                                   foreground,
                                                   PARAMS["database"],
                                                   match_table,
                                                   outfile,
                                                   PARAMS["genesets_header"],
                                                   pval_direct)

            E.info("Completed Fisher's exact test for "
                   "TF enrichment between %s" % labels)
Пример #24
0
    def estimateEnrichmentOfTFBS(infiles, outfile):
        '''
        Test for enrichment of TFBS within a gene set by permutation.

        Reads the (seq_id, matrix_id) pairs of the ``match_result`` table
        from the configured database and the background / foreground tables
        listed in ``infiles`` (recognised by "background" / "foreground" in
        the file name), runs ``PipelineTFM.permuteTFBSEnrich`` with
        ``PARAMS['sig_testing_nperms']`` permutations and writes the
        transposed result to ``outfile`` as a tab-separated table indexed by
        ``matrix_id``.

        Raises IndexError if a background or foreground file is missing from
        ``infiles``.
        '''

        E.info("Running permutation testing for TFBS enrichment between %s" %
               " & ".join([os.path.basename(x) for x in infiles]))

        # table from sql db; the connection is only needed for this one
        # read, so it is closed as soon as the table is in memory
        match_table = "match_result"
        tfbs_state = '''SELECT seq_id, matrix_id FROM %s;''' % match_table
        dbh = sqlite3.connect(PARAMS['database'])
        try:
            tfbs_table = pdsql.read_sql(tfbs_state, dbh, index_col='matrix_id')
        finally:
            dbh.close()

        def _read_gc_table(path):
            # load one table and re-index it by the first space-separated
            # token of its original index, also stored as 'gene_id'
            table = pandas.read_table(path,
                                      sep="\t",
                                      index_col=0,
                                      header=0)
            gene_ids = [x.split(" ")[0] for x in table.index.tolist()]
            table['gene_id'] = gene_ids
            table.index = gene_ids
            return table

        # get foreground and background gene files
        # setup gc content dataframes

        background = [inf for inf in infiles
                      if re.search("background", inf)][0]
        foreground = [inf for inf in infiles
                      if re.search("foreground", inf)][0]

        back_gc = _read_gc_table(background)
        fore_gc = _read_gc_table(foreground)

        # run permutation significance testing

        perms = int(PARAMS['sig_testing_nperms'])
        out_dict = PipelineTFM.permuteTFBSEnrich(tfbs_table=tfbs_table,
                                                 fg_gc=fore_gc,
                                                 bg_gc=back_gc,
                                                 nPerms=perms)

        out_frame = pandas.DataFrame(out_dict).T

        out_frame.to_csv(outfile, sep="\t", index_label='matrix_id')
Пример #25
0
    def estimateEnrichmentOfTFBS(infiles, outfile):
        '''
        Test for enrichment of TFBS within a gene set by permutation.

        The (seq_id, matrix_id) pairs are read from the ``match_result``
        table of the configured database.  The background and foreground
        tables are the entries of ``infiles`` whose names contain
        "background" and "foreground".  ``PipelineTFM.permuteTFBSEnrich`` is
        run with ``PARAMS['sig_testing_nperms']`` permutations and its
        transposed result is written to ``outfile`` as a tab-separated table
        indexed by ``matrix_id``.

        Raises IndexError if a background or foreground file is missing from
        ``infiles``.
        '''

        E.info("Running permutation testing for TFBS enrichment between %s" %
               " & ".join([os.path.basename(x) for x in infiles]))

        # table from sql db; close the connection once the table has been
        # read so the handle is not leaked
        match_table = "match_result"
        tfbs_state = '''SELECT seq_id, matrix_id FROM %s;''' % match_table
        dbh = sqlite3.connect(PARAMS['database'])
        try:
            tfbs_table = pdsql.read_sql(tfbs_state, dbh, index_col='matrix_id')
        finally:
            dbh.close()

        def _read_gc_table(path):
            # load one table; its index becomes the first space-separated
            # token of the original index, which is also kept as 'gene_id'
            table = pandas.read_table(path,
                                      sep="\t",
                                      index_col=0,
                                      header=0)
            gene_ids = [x.split(" ")[0] for x in table.index.tolist()]
            table['gene_id'] = gene_ids
            table.index = gene_ids
            return table

        # get foreground and background gene files
        # setup gc content dataframes

        background = [inf for inf in infiles if re.search("background",
                                                          inf)][0]
        foreground = [inf for inf in infiles if re.search("foreground",
                                                          inf)][0]

        back_gc = _read_gc_table(background)
        fore_gc = _read_gc_table(foreground)

        # run permutation significance testing

        perms = int(PARAMS['sig_testing_nperms'])
        out_dict = PipelineTFM.permuteTFBSEnrich(tfbs_table=tfbs_table,
                                                 fg_gc=fore_gc,
                                                 bg_gc=back_gc,
                                                 nPerms=perms)

        out_frame = pandas.DataFrame(out_dict).T

        out_frame.to_csv(outfile, sep="\t", index_label='matrix_id')
Пример #26
0
def loadMatchResults(infile, outfile):
    '''
    Load the results of the match analysis into sqlite database.

    The records yielded by ``PipelineTFM.match_iterator(infile)`` are written
    as tab-separated rows to a temporary file in ``./match.dir``, which is
    loaded with ``P.load`` (indexing on ``seq_id``) and then deleted.
    '''
    field_names = ("seq_id", "matrix_id", "position", "strand",
                   "core_score", "matrix_score", "sequence")

    temp = P.getTempFile("./match.dir")
    temp.write("\t".join(field_names) + "\n")
    for hit in PipelineTFM.match_iterator(infile):
        values = [str(getattr(hit, name)) for name in field_names]
        temp.write("\t".join(values) + "\n")
    temp.close()

    temp_path = temp.name
    P.load(temp_path, outfile, options="--add-index=seq_id")
    os.unlink(temp_path)
Пример #27
0
def loadMatchResults(infile, outfile):
    '''
    Load the results of the match analysis into
    sqlite database.

    The records from ``PipelineTransfacMatch.match_iterator(infile)`` are
    written as tab-separated rows to a temporary file; the file is then
    redirected into ``csv2db.py`` and removed.
    '''
    temp = P.getTempFile()
    temp.write(
        "seq_id\tmatrix_id\tposition\tstrand\tcore_score\tmatrix_score\tsequence\n")
    for details in PipelineTransfacMatch.match_iterator(infile):
        temp.write("\t".join(map(str, [details.seq_id, details.matrix_id, details.position,
                   details.strand, details.core_score, details.matrix_score, details.sequence])) + "\n")
    # the shell command below re-opens the file by name, so buffered rows
    # must be flushed to disk first or the load may see truncated data
    temp.close()

    # NOTE(review): P.run() is called without arguments, so it presumably
    # reads `statement`, `inf` and `tablename` from this frame - keep the
    # names unchanged.
    inf = temp.name
    tablename = filenameToTablename(os.path.basename(infile))
    statement = '''python %(scriptsdir)s/csv2db.py -t %(tablename)s
                   --log=%(outfile)s.log
                   --index=seq_id
                   %(csv2db_options)s
                   < %(inf)s > %(outfile)s'''
    P.run()
    os.remove(inf)
Пример #28
0
def loadMatchResults(infile, outfile):
    '''
    Load the results of the match analysis into sqlite database.

    One tab-separated row per record of
    ``PipelineTFM.match_iterator(infile)`` is written to a temporary file in
    ``./match.dir``; ``P.load`` loads that file (indexing on ``seq_id``)
    before it is unlinked.
    '''
    row_template = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n"

    temp = P.getTempFile("./match.dir")
    temp.write(row_template % ("seq_id", "matrix_id", "position", "strand",
                               "core_score", "matrix_score", "sequence"))
    for hit in PipelineTFM.match_iterator(infile):
        temp.write(row_template % (hit.seq_id, hit.matrix_id, hit.position,
                                   hit.strand, hit.core_score,
                                   hit.matrix_score, hit.sequence))
    temp.close()

    load_options = "--add-index=seq_id"
    P.load(temp.name, outfile, options=load_options)
    os.unlink(temp.name)
Пример #29
0
    def estimateEnrichmentOfTFBS(infiles, outfile):
        '''
        Test for enrichment of TFBS within a gene set by permutation.

        Reads the (matrix_id, seq_id) pairs of the ``match_result`` table
        from the configured database and the background / foreground tables
        listed in ``infiles`` (recognised by "background" / "foreground" in
        the file name), runs ``PipelineTFM.permuteTFBSEnrich`` and writes the
        transposed result, with an added ``qvalue`` column, to ``outfile`` as
        a tab-separated table indexed by ``matrix_id``.

        The number of permutations is ``PARAMS['sig_testing_nperms']``,
        capped at the value returned by ``PipelineTFM.nCr`` for the
        background and foreground sizes.

        Raises ValueError if that value is below 1000 and IndexError if a
        background or foreground file is missing from ``infiles``.
        '''

        E.info("Running permutation testing for TFBS enrichment between %s" %
               " & ".join([os.path.basename(x) for x in infiles]))

        # table from sql db; the connection is only needed for this one
        # read, so it is closed as soon as the table is in memory
        match_table = "match_result"
        tfbs_state = '''SELECT matrix_id, seq_id FROM %s;''' % match_table
        dbh = sqlite3.connect(PARAMS['database'])
        try:
            tfbs_table = pdsql.read_sql(sql=tfbs_state,
                                        con=dbh,
                                        index_col='matrix_id')
        finally:
            dbh.close()

        def _read_gc_table(path):
            # load one table and re-index it by the first space-separated
            # token of its original index, also stored as 'gene_id'
            table = pandas.read_table(path,
                                      sep="\t",
                                      index_col=0,
                                      header=0)
            gene_ids = [x.split(" ")[0] for x in table.index.tolist()]
            table['gene_id'] = gene_ids
            table.index = gene_ids
            return table

        # get foreground and background gene files
        # setup gc content dataframes

        background = [inf for inf in infiles if re.search("background",
                                                          inf)][0]
        foreground = [inf for inf in infiles if re.search("foreground",
                                                          inf)][0]

        back_gc = _read_gc_table(background)
        fore_gc = _read_gc_table(foreground)

        # run permutation significance testing
        # check if there are sufficient genes in the background
        # to do a permutation test.  If not, refuse to run.
        # if yes, but less than the requested number of permutations,
        # limit to the possible number.

        perms = int(PARAMS['sig_testing_nperms'])
        poss_perms = PipelineTFM.nCr(n=len(back_gc.index),
                                     r=len(fore_gc.index))
        if poss_perms < 1000:
            E.warn("Insufficient background genes to perform "
                   "permutations.  Please use Fisher's Exact test")
            raise ValueError("Insufficient background size. "
                             "Cannot use permutation test")

        if poss_perms < perms:
            # never ask for more permutations than can exist; this also
            # covers poss_perms == 1000
            E.info("Maximum possible permutations with this background"
                   " set is %i.  Running %i permutations only" % (poss_perms,
                                                                  poss_perms))
            n_perms = poss_perms
        else:
            n_perms = perms

        # both the capped and the uncapped run use the configured statistic
        bg_stat = PARAMS["background_match_stat"]
        out_dict = PipelineTFM.permuteTFBSEnrich(tfbs_table=tfbs_table,
                                                 fg_gc=fore_gc,
                                                 bg_gc=back_gc,
                                                 nPerms=n_perms,
                                                 bg_stat=bg_stat)

        out_frame = pandas.DataFrame(out_dict).T
        # p.adjust is called without a method argument, so R's default
        # adjustment method applies
        pyadjust = R['p.adjust']
        pvs = robjects.FloatVector([p for p in out_frame['pvalue']])
        out_frame['qvalue'] = pyadjust(pvs)

        out_frame.to_csv(outfile, sep="\t", index_label='matrix_id')
Пример #30
0
    def estimateEnrichmentOfTFBS(infiles, outfile):
        '''
        Test for enrichment of TFBS within a gene set by permutation.

        The (matrix_id, seq_id) pairs are read from the ``match_result``
        table of the configured database.  The background and foreground
        tables are the entries of ``infiles`` whose names contain
        "background" and "foreground".  ``PipelineTFM.permuteTFBSEnrich`` is
        run and its transposed result, extended by a ``qvalue`` column, is
        written to ``outfile`` as a tab-separated table indexed by
        ``matrix_id``.

        ``PARAMS['sig_testing_nperms']`` permutations are requested, but
        never more than ``PipelineTFM.nCr`` reports for the background and
        foreground sizes.

        Raises ValueError if that value is below 1000 and IndexError if a
        background or foreground file is missing from ``infiles``.
        '''

        E.info("Running permutation testing for TFBS enrichment between %s" %
               " & ".join([os.path.basename(x) for x in infiles]))

        # table from sql db; close the connection once the table has been
        # read so the handle is not leaked
        match_table = "match_result"
        tfbs_state = '''SELECT matrix_id, seq_id FROM %s;''' % match_table
        dbh = sqlite3.connect(PARAMS['database'])
        try:
            tfbs_table = pdsql.read_sql(sql=tfbs_state,
                                        con=dbh,
                                        index_col='matrix_id')
        finally:
            dbh.close()

        def _read_gc_table(path):
            # load one table; its index becomes the first space-separated
            # token of the original index, which is also kept as 'gene_id'
            table = pandas.read_table(path,
                                      sep="\t",
                                      index_col=0,
                                      header=0)
            gene_ids = [x.split(" ")[0] for x in table.index.tolist()]
            table['gene_id'] = gene_ids
            table.index = gene_ids
            return table

        # get foreground and background gene files
        # setup gc content dataframes

        background = [inf for inf in infiles
                      if re.search("background", inf)][0]
        foreground = [inf for inf in infiles
                      if re.search("foreground", inf)][0]

        back_gc = _read_gc_table(background)
        fore_gc = _read_gc_table(foreground)

        # run permutation significance testing
        # check if there are sufficient genes in the background
        # to do a permutation test.  If not, refuse to run.
        # if yes, but less than the requested number of permutations,
        # limit to the possible number.

        perms = int(PARAMS['sig_testing_nperms'])
        poss_perms = PipelineTFM.nCr(n=len(back_gc.index),
                                     r=len(fore_gc.index))
        if poss_perms < 1000:
            E.warn("Insufficient background genes to perform "
                   "permutations.  Please use Fisher's Exact test")
            raise ValueError("Insufficient background size. "
                             "Cannot use permutation test")

        if poss_perms < perms:
            # cap at what is possible; this also covers poss_perms == 1000
            E.info("Maximum possible permutations with this background"
                   " set is %i.  Running %i permutations only" %
                   (poss_perms, poss_perms))
            n_perms = poss_perms
        else:
            n_perms = perms

        # the capped and the uncapped run share the configured statistic
        bg_stat = PARAMS["background_match_stat"]
        out_dict = PipelineTFM.permuteTFBSEnrich(tfbs_table=tfbs_table,
                                                 fg_gc=fore_gc,
                                                 bg_gc=back_gc,
                                                 nPerms=n_perms,
                                                 bg_stat=bg_stat)

        out_frame = pandas.DataFrame(out_dict).T
        # p.adjust is called without a method argument, so R's default
        # adjustment method applies
        pyadjust = R['p.adjust']
        pvs = robjects.FloatVector([p for p in out_frame['pvalue']])
        out_frame['qvalue'] = pyadjust(pvs)

        out_frame.to_csv(outfile, sep="\t", index_label='matrix_id')
Пример #31
0
def calculateGCContent(infiles, outfile):
    '''
    Calculate the GC content across foreground and background sets.

    ``PipelineTFM.calculateSequenceComposition`` receives the first two
    entries of ``infiles``, ``outfile`` and the configured
    ``genesets_header``.
    '''
    compose = PipelineTFM.calculateSequenceComposition
    call_args = (infiles[0], infiles[1], outfile, PARAMS["genesets_header"])
    compose(*call_args)