Пример #1
0
def buildGeneListMatrix(infiles, outfile):
    '''build a gene list matrix for simple pathway analysis
    based on hypergeometric test.

    A gene list is derived from a gene set by
    applying thresholds to the input data set. The
    thresholds are defined in the configuration file.

    For every input table a foreground and a background gene list are
    derived; the track name is the base name of the input file with
    the ``.tsv.gz`` suffix removed via ``P.snip``.

    Arguments
    ---------
    infiles : list
        Filenames of tab-separated tables. The first column is used
        as the index and supplies the gene identifiers.
    outfile : string
        Filename of the foreground gene list matrix. Three further
        files are written next to it: ``outfile + ".bg.tsv.gz"``
        (background gene lists), ``outfile + ".matrix.gz"`` and
        ``outfile + ".bg.matrix.gz"`` (union/intersection matrices of
        the foreground and background sets, respectively).
    '''

    genesets = []
    backgrounds = []
    headers = []
    for infile in infiles:
        # pandas does not close handles opened by the caller, so close
        # the input explicitly once the table has been loaded.
        with IOTools.openFile(infile) as inf:
            genelist = pandas.read_csv(inf,
                                       index_col=0,
                                       sep='\t')

        track = P.snip(os.path.basename(infile), ".tsv.gz")
        headers.append(track)

        genesets.append(
            _selectGenesByThreshold(genelist, track, "foreground"))
        backgrounds.append(
            _selectGenesByThreshold(genelist, track, "background"))

        E.info("%s: fg=%i, bg=%i" %
               (track, len(genesets[-1]), len(backgrounds[-1])))

    E.info("writing gene list matrix")
    with IOTools.openFile(outfile, "w") as outf:
        SetTools.writeSets(outf, genesets, labels=headers)
    with IOTools.openFile(outfile + ".bg.tsv.gz", "w") as outf:
        SetTools.writeSets(outf, backgrounds, labels=headers)

    E.info("writing intersection/union matrix")
    # build set intersection matrix
    matrix = SetTools.unionIntersectionMatrix(genesets)
    with IOTools.openFile(outfile + ".matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)
    matrix = SetTools.unionIntersectionMatrix(backgrounds)
    with IOTools.openFile(outfile + ".bg.matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)


def _selectGenesByThreshold(genelist, track, kind):
    '''return the set of index labels of `genelist` within a range.

    The column to test and the inclusive lower/upper bounds are read
    from ``PARAMS`` using the keys ``<track>_<kind>_field``,
    ``<track>_<kind>_min_threshold`` and
    ``<track>_<kind>_max_threshold``, each resolved through
    ``P.matchParameter``. The selected range is logged.

    Arguments
    ---------
    genelist : pandas.DataFrame
        Table whose index holds the gene identifiers.
    track : string
        Track name used as prefix of the parameter names.
    kind : string
        ``"foreground"`` or ``"background"``.

    Returns
    -------
    set
        Index labels of all rows with
        ``min_threshold <= genelist[field] <= max_threshold``.
    '''
    field = PARAMS[P.matchParameter("%s_%s_field" % (track, kind))]
    min_threshold = PARAMS[P.matchParameter(
        "%s_%s_min_threshold" % (track, kind))]
    max_threshold = PARAMS[P.matchParameter(
        "%s_%s_max_threshold" % (track, kind))]

    E.info('%s: %s: %f <= %s <= %f' %
           (track, kind, min_threshold, field, max_threshold))

    values = genelist[field]
    in_range = (values >= min_threshold) & (values <= max_threshold)
    return set(genelist[in_range].index)