예제 #1
0
파일: sdmsegs.py 프로젝트: anzev/sdmtoolkit
    def run(self,
            inputData,
            mapping,
            ont1,
            ont2 = None,
            ont3 = None,
            ont4 = None,
            interactions = None,
            generalTerms = None,
            legacy = False,
            posClassVal = None,
            cutoff = None,
            wracc_k = defaults[WRACC_K],
            minimalSetSize = defaults[MIN_SET_SIZE],
            maxNumTerms = defaults[MAX_NUM_TERMS],
            maxReported = defaults[MAX_REPORTED],
            maximalPvalue = defaults[MAX_P_VALUE],
            weightFisher = defaults[WEIGHT_FISHER],
            weightGSEA = defaults[WEIGHT_GSEA],
            weightPAGE = defaults[WEIGHT_PAGE],
            summarizeDescriptions = defaults[SUMMARIZE],
            randomSeed = defaults[RANDOM_SEED],
            level_ont1 = defaults[LEVEL_ONT1],
            level_ont2 = defaults[LEVEL_ONT2],
            level_ont3 = defaults[LEVEL_ONT3],
            level_ont4 = defaults[LEVEL_ONT4],
            dataFormat = StructuredFormat.FORMAT_TAB,
            progressFname = 'progress.txt',
            ):
        """Run the SEGS subsystem on the given examples and return its rules.

        Parameters
        ----------
        inputData
            List of the form [..., (id_i, rank_i or label_i), ...], or a
            string that is parsed with ``StructuredFormat.parseInput`` using
            ``dataFormat``. Every id is converted with ``int()``.
        mapping
            List of the form [..., (id_i, URI1, URI2, ...), ...] where id_i
            is annotated with the listed URIs, or a string parsed with
            ``StructuredFormat.parseMapping``. In legacy mode only the first
            two items of each entry are used.
        ont1, ont2, ont3, ont4
            OWL ontologies as strings; falsy entries are ignored. In legacy
            mode only ``ont1`` is read, and each of its lines is evaluated
            as a Python expression yielding one ontology entry.
        interactions
            List of the form [..., (id_i, id_j), ...] where id_i interacts
            with id_j, or a string parsed with
            ``StructuredFormat.parseInteractions``. ``None`` (the default)
            means no interactions.
        generalTerms
            List of general terms, or a string parsed with
            ``StructuredFormat.parseGeneralTerms``; forwarded to
            ``runSEGS``. ``None`` (the default) means an empty list.
        legacy
            When true, ``segs_legacy`` is used instead of ``segs`` and the
            inputs are assumed to be already in the legacy SEGS format.
        posClassVal
            When truthy, the data is treated as labelled: examples whose
            label equals this value are listed first, ``cutoff`` is set to
            their count (overriding any value passed in) and every rank is
            set to 0.5.
        cutoff
            Must be given (truthy) when ``posClassVal`` is not.
        dataFormat
            Format passed to ``StructuredFormat.parseInput`` for string
            input.
        progressFname
            Forwarded to ``runSEGS``.

        The remaining keyword parameters are forwarded unchanged to
        ``runSEGS``, except that ``maximalPvalue`` is only forwarded in
        legacy mode; otherwise ``p_value`` is 1.

        Returns
        -------
        list of dict
            One dict per rule in ``segs_result['A']['WRAcc']`` whose
            ``wracc`` score is greater than 0, with the keys ``support``,
            ``coverage``, ``scores``, ``terms`` and ``interacting_terms``.

        Raises
        ------
        MissingParameterException
            If neither ``posClassVal`` nor ``cutoff`` is given.
        """
        logger.info("Starting SDM-SEGS.")

        # Fresh lists per call: the previous ``[]`` defaults were shared
        # between calls and generalTerms is handed to external code.
        if interactions is None:
            interactions = []
        if generalTerms is None:
            generalTerms = []

        # Check if we have properly structured inputs or strings
        if isinstance(inputData, (str, unicode)):
            inputData = StructuredFormat.parseInput(inputData, dataFormat)
        if isinstance(interactions, (str, unicode)):
            interactions = StructuredFormat.parseInteractions(interactions)
        if isinstance(mapping, (str, unicode)):
            mapping = StructuredFormat.parseMapping(mapping)
        if isinstance(generalTerms, (str, unicode)):
            generalTerms = StructuredFormat.parseGeneralTerms(generalTerms)

        if posClassVal:
            # Labelled data: positive class instances must be listed first,
            # and the cutoff is the number of positive instances.
            posIds, negIds = [], []
            for iid, label in inputData:
                if label == posClassVal:
                    posIds.append(int(iid))
                else:
                    negIds.append(int(iid))
            cutoff = len(posIds)
            ids = posIds + negIds
            ranks = [0.5] * len(ids)
        else:
            # Assume ranked data
            if not cutoff:
                raise MissingParameterException("Cutoff needs to be specified for ranked data by the user!")
            ids, ranks = [], []
            for iid, rank in inputData:
                ids.append(int(iid))
                ranks.append(rank)
        inputData = [ids, ranks]

        # Group interaction partners by source id, ordered by id.
        idToList = dict()
        for id1, id2 in interactions:
            idToList.setdefault(id1, []).append(id2)
        g2g = [[iid, idToList[iid]] for iid in sorted(idToList)]

        if not legacy:
            import segs
            # Only the ontologies that were actually supplied take part.
            ontologies = [o for o in (ont1, ont2, ont3, ont4) if o]
            ont, g2ont = OWL2X.get_segs_input(ontologies, mapping)
            numOfOnt = len(ontologies)
        else:
            import segs_legacy as segs
            # Legacy input of segs - we assume it is already properly formatted
            g2ont = [[entry[0], entry[1]] for entry in mapping]
            # NOTE(review): eval() executes arbitrary expressions from the
            # ontology text; only safe if ont1 comes from a trusted source.
            ont = [eval(entry) for entry in StringIO.StringIO(ont1)]
            numOfOnt = 4

        # Create a map from go terms to human-readable descriptions
        ontDict = dict((entry[0], entry[1][1]) for entry in ont)

        logger.info("Running SEGS subsystem.")
        segs_result = segs.runSEGS(
            generalTerms = generalTerms,
            ontology = ont,
            g2g = g2g,
            g2ont = g2ont,
            progressFname = progressFname,
            inputData = inputData,
            useMolFunctions = True,
            useBioProcesses = numOfOnt > 1,
            useCellComponents = numOfOnt > 2,
            useKEGG = numOfOnt > 3,
            useGeneInteractions = 1 if interactions else 0,
            summarize = summarizeDescriptions,
            cutoff = cutoff,
            minSizeGS = minimalSetSize,
            maxNumTerms = maxNumTerms,
            GSEAfactor = SDMSEGS.locked[SDMSEGS.GSEA_FACTOR],
            numIters = 0,
            PrintTopGS = maxReported,
            p_value = maximalPvalue if legacy else 1,
            weightFisher = weightFisher,
            weightGSEA = weightGSEA,
            weightPAGE = weightPAGE,
            randomSeed = randomSeed,
            wracc_k = wracc_k,
            level_ont1 = level_ont1,
            level_ont2 = level_ont2,
            level_ont3 = level_ont3,
            level_ont4 = level_ont4)
        # Drop the local binding of the imported segs module.
        del segs
        logger.info("SDM-SEGS finished.")

        # Keep only rules with a positive WRAcc score and translate their
        # term ids into human-readable names.
        rules = []
        for segs_rule in segs_result['A']['WRAcc'].values():
            if segs_rule['scores']['wracc'] <= 0:
                continue
            ruleTerms = segs_rule['terms']
            rules.append({
                'support' : segs_rule['topGenes'],
                'coverage' : segs_rule['allGenes'],
                'scores' : segs_rule['scores'],
                # Plain string entries are single ontology terms ...
                'terms' : [ontDict[term] for term in ruleTerms if isinstance(term, str)],
                # ... while list entries hold the terms of interacting examples.
                'interacting_terms' : [ontDict[term] for termList in ruleTerms if isinstance(termList, list) for term in termList],
            })
        return rules