def buildGeneListMatrix(infiles, outfile):
    '''build a gene list matrix for simple pathway analysis
    based on hypergeometric test.

    A gene list is derived from a gene set by applying thresholds to
    the input data set. The thresholds are defined in the
    configuration file: for every track the parameters
    ``<track>_foreground_{field,min_threshold,max_threshold}`` and
    ``<track>_background_{field,min_threshold,max_threshold}`` are
    looked up in PARAMS via P.matchParameter.

    Arguments
    ---------
    infiles : list
        Filenames of tab-separated tables. The first column is used
        as the index; the index labels of the selected rows form the
        gene lists. The track name is obtained from the basename of
        each file via ``P.snip(..., ".tsv.gz")``.
    outfile : string
        Filename the foreground sets are written to. Three companion
        files are derived from it:
        ``outfile + ".bg.tsv.gz"`` (background sets),
        ``outfile + ".matrix.gz"`` (foreground union/intersection
        matrix) and ``outfile + ".bg.matrix.gz"`` (background
        union/intersection matrix).
    '''

    genesets = []
    backgrounds = []
    headers = []
    for infile in infiles:
        # Close the input handle as soon as the table is loaded;
        # read_csv reads the complete table before returning.
        with IOTools.openFile(infile) as inf:
            genelist = pandas.read_csv(inf, index_col=0, sep='\t')

        track = P.snip(os.path.basename(infile), ".tsv.gz")
        headers.append(track)

        genesets.append(
            _selectGenesByThreshold(genelist, track, "foreground"))
        backgrounds.append(
            _selectGenesByThreshold(genelist, track, "background"))

        E.info("%s: fg=%i, bg=%i" %
               (track, len(genesets[-1]), len(backgrounds[-1])))

    E.info("writing gene list matrix")
    with IOTools.openFile(outfile, "w") as outf:
        SetTools.writeSets(outf, genesets, labels=headers)
    with IOTools.openFile(outfile + ".bg.tsv.gz", "w") as outf:
        SetTools.writeSets(outf, backgrounds, labels=headers)

    E.info("writing intersection/union matrix")
    # build set intersection matrix, once for the foreground sets and
    # once for the background sets
    matrix = SetTools.unionIntersectionMatrix(genesets)
    with IOTools.openFile(outfile + ".matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)
    matrix = SetTools.unionIntersectionMatrix(backgrounds)
    with IOTools.openFile(outfile + ".bg.matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)


def _selectGenesByThreshold(genelist, track, section):
    '''return the index labels of rows in *genelist* that lie within
    the thresholds configured for *track* and *section*.

    Arguments
    ---------
    genelist : pandas.DataFrame
        Table to select rows from.
    track : string
        Track name used as prefix of the configuration parameters.
    section : string
        Either ``foreground`` or ``background``; selects which group
        of configuration parameters is used.

    Returns
    -------
    set
        Index labels of all rows for which
        ``min_threshold <= genelist[field] <= max_threshold``
        (both bounds inclusive).
    '''
    field = PARAMS[P.matchParameter(
        "%s_%s_field" % (track, section))]
    min_threshold = PARAMS[P.matchParameter(
        "%s_%s_min_threshold" % (track, section))]
    max_threshold = PARAMS[P.matchParameter(
        "%s_%s_max_threshold" % (track, section))]

    # Report the thresholds before filtering so that they appear in
    # the log even if the selection below fails (e.g. unknown field).
    E.info('%s: %s: %f <= %s <= %f' %
           (track, section, min_threshold, field, max_threshold))

    values = genelist[field]
    within = (values >= min_threshold) & (values <= max_threshold)
    return set(genelist[within].index)