示例#1
0
class MultiEdgeExampleBuilder(ExampleBuilder):
    """
    This example builder makes edge examples, i.e. examples describing
    the event arguments.
    """
    def __init__(self, style=None, length=None, types=[], featureSet=None, classSet=None):
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        else:
            classSet = classSet
        assert( classSet.getId("neg") == 1 or (len(classSet.Ids)== 2 and classSet.getId("neg") == -1) )
        
        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        
        self.styles = self.getParameters(style, [
            "typed", "directed", "headsOnly", "graph_kernel", "noAnnType", "noMasking", "maxFeatures",
            "genia_limits", "epi_limits", "id_limits", "rel_limits", "bb_limits", "bi_limits", "co_limits",
            "genia_task1", "ontology", "nodalida", "bacteria_renaming", "trigger_features", "rel_features",
            "ddi_features", "evex", "giuliano", "random", "themeOnly", "causeOnly", "no_path", "entities", 
            "skip_extra_triggers", "headsOnly", "graph_kernel", "trigger_features", "no_task", "no_dependency", 
            "disable_entity_features", "disable_terminus_features", "disable_single_element_features", 
            "disable_ngram_features", "disable_path_edge_features", "no_linear", "subset", "binary", "pos_only",
            "entity_type"
        ])
        if style == None: # no parameters given
            style["typed"] = style["directed"] = style["headsOnly"] = True
#        self.styles = style
#        if "selftrain_group" in self.styles:
#            self.selfTrainGroups = set()
#            if "selftrain_group-1" in self.styles:
#                self.selfTrainGroups.add("-1")
#            if "selftrain_group0" in self.styles:
#                self.selfTrainGroups.add("0")
#            if "selftrain_group1" in self.styles:
#                self.selfTrainGroups.add("1")
#            if "selftrain_group2" in self.styles:
#                self.selfTrainGroups.add("2")
#            if "selftrain_group3" in self.styles:
#                self.selfTrainGroups.add("3")
#            print >> sys.stderr, "Self-train-groups:", self.selfTrainGroups
        
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        # NOTE Temporarily re-enabling predicted range
        #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None)
        if self.styles["graph_kernel"]:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
        if self.styles["noAnnType"]:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if self.styles["noMasking"]:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if self.styles["maxFeatures"]:
			self.multiEdgeFeatureBuilder.maximum = True
        if self.styles["genia_task1"]:
            self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity")
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        if self.styles["ontology"]:
            self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
        if self.styles["nodalida"]:
            self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
        if self.styles["bacteria_renaming"]:
            self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(self.featureSet)
        if self.styles["trigger_features"]:
            self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
            self.triggerFeatureBuilder.useNonNameEntities = True
            if self.styles["genia_task1"]:
                self.triggerFeatureBuilder.filterAnnTypes.add("Entity")
            #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(featureSet)
        if self.styles["ddi_features"]:
            self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
        if self.styles["evex"]:
            self.evexFeatureBuilder = EVEXFeatureBuilder(featureSet)
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
        self.pathLengths = length
        assert(self.pathLengths == None)
        self.types = types
        if self.styles["random"]:
            from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
            self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
    
    def definePredictedValueRange(self, sentences, elementName):
        self.multiEdgeFeatureBuilder.definePredictedValueRange(sentences, elementName)                        
    
    def getPredictedValueRange(self):
        return self.multiEdgeFeatureBuilder.predictedRange
    
    def filterEdgesByType(self, edges, typesToInclude):
        if len(typesToInclude) == 0:
            return edges
        edgesToKeep = []
        for edge in edges:
            if edge.get("type") in typesToInclude:
                edgesToKeep.append(edge)
        return edgesToKeep
    
    def getCategoryNameFromTokens(self, sentenceGraph, t1, t2, directed=True):
        """
        Example class. Multiple overlapping edges create a merged type.
        """
        types = set()
#        if sentenceGraph.interactionGraph.has_edge(t1, t2):
#            intEdges = sentenceGraph.interactionGraph.get_edge_data(t1, t2, default={})
#            # NOTE: Only works if keys are ordered integers
#            for i in range(len(intEdges)):
#                types.add(intEdges[i]["element"].get("type"))
#        if (not directed) and sentenceGraph.interactionGraph.has_edge(t2, t1):
#            intEdges = sentenceGraph.interactionGraph.get_edge(t2, t1, default={})
#            # NOTE: Only works if keys are ordered integers
#            for i in range(len(intEdges)):
#                types.add(intEdges[i]["element"].get("type"))
        intEdges = sentenceGraph.interactionGraph.getEdges(t1, t2)
        if (not directed):
            intEdges = intEdges + sentenceGraph.interactionGraph.getEdges(t2, t1)
        for intEdge in intEdges:
            types.add(intEdge[2].get("type"))
        types = list(types)
        types.sort()
        categoryName = ""
        for name in types:
            if categoryName != "":
                categoryName += "---"
            categoryName += name
        if categoryName != "":
            return categoryName
        else:
            return "neg"
        
    def getCategoryName(self, sentenceGraph, e1, e2, directed=True, duplicateEntities=None):
        """
        Example class. Multiple overlapping edges create a merged type.
        """
#        interactions = []
#        e1s = [e1]
#        if duplicateEntities != None and e1 in duplicateEntities:
#            e1s += duplicateEntities[e1]
#        e2s = [e2]
#        if duplicateEntities != None and e2 in duplicateEntities:
#            e2s += duplicateEntities[e2]
#        for entity1 in e1s:
#            for entity2 in e2s:
#                interactions = interactions + sentenceGraph.getInteractions(entity1, entity2)
#                if not directed:
#                    interactions = interactions + sentenceGraph.getInteractions(entity2, entity1)
        interactions = sentenceGraph.getInteractions(e1, e2, True)
        #print interactions
        
        types = set()
        for interaction in interactions:
            types.add(interaction[2].get("type"))
        types = list(types)
        types.sort()
        categoryName = ""
        for name in types:
            if self.styles["causeOnly"] and name != "Cause":
                continue
            if self.styles["themeOnly"] and name != "Theme":
                continue
            if categoryName != "":
                categoryName += "---"
            categoryName += name
        if categoryName != "":
            return categoryName
        else:
            return "neg"
    
    def isPotentialRELInteraction(self, e1, e2):
        if e1.get("type") == "Protein" and e2.get("type") == "Entity":
            return True
        else:
            return False

    def isPotentialBBInteraction(self, e1, e2, sentenceGraph):
        #if e1.get("type") == "Bacterium" and e2.get("type") in ["Host", "HostPart", "Geographical", "Environmental", "Food", "Medical", "Soil", "Water"]:
        # Note: "Environment" type is misspelled as "Environmental" in the BB-task documentation
        if e1.get("type") == "Bacterium" and e2.get("type") in ["Host", "HostPart", "Geographical", "Environment", "Food", "Medical", "Soil", "Water"]:
            return True
        elif e1.get("type") == "Host" and e2.get("type") == "HostPart":
            return True
        else:
            return False
    
    def getBISuperType(self, eType):
        if eType in ["GeneProduct", "Protein", "ProteinFamily", "PolymeraseComplex"]:
            return "ProteinEntity"
        elif eType in ["Gene", "GeneFamily", "GeneComplex", "Regulon", "Site", "Promoter"]:
            return "GeneEntity"
        else:
            return None
    
    def isPotentialBIInteraction(self, e1, e2, sentenceGraph, stats):
        e1Type = e1.get("type")
        e1SuperType = self.getBISuperType(e1Type)
        e2Type = e2.get("type")
        e2SuperType = self.getBISuperType(e2Type)
        
        tag = "(" + e1Type + "/" + e2Type + ")"
        if e1Type == "Regulon":
            if e2SuperType in ["GeneEntity", "ProteinEntity"]:
                return True
        if e1SuperType == "ProteinEntity":
            if e2Type in ["Site", "Promoter", "Gene", "GeneComplex"]:
                return True
        if e1Type in ["Action", "Transcription", "Expression"]:
            return True
        if e1Type == "Site":
            if e2SuperType == "GeneEntity":
                return True
        if e1Type == "Promoter":
            if e2SuperType in ["GeneEntity", "ProteinEntity"]:
                return True
        if e1SuperType in ["GeneEntity", "ProteinEntity"]:
            if e2SuperType in ["GeneEntity", "ProteinEntity"]:
                return True
        stats.filter("bi_limits") #+tag)
        return False

    def isPotentialEPIInteraction(self, e1, e2, sentenceGraph):
        if e1.get("type") != "Catalysis":
            if e1.get("type") in ["Protein", "Entity"]:
                return False
            elif e2.get("type") in ["Protein", "Entity"]:
                return True
            else:
                return False
        else: # Catalysis
            if e2.get("type") != "Entity":
                return True
            else:
                return False
        assert False, (e1.get("type"), e2.get("type"))

    def isPotentialIDInteraction(self, e1, e2, sentenceGraph):
        e1Type = e1.get("type")
        e2Type = e2.get("type")
        e1IsCore = e1Type in ["Protein", "Regulon-operon", "Two-component-system", "Chemical", "Organism"]
        e2IsCore = e2Type in ["Protein", "Regulon-operon", "Two-component-system", "Chemical", "Organism"]
        if e1IsCore:
            return False
        elif e1Type in ["Gene_expression", "Transcription"]:
            if e2Type in ["Protein", "Regulon-operon"]:
                return True
            else:
                return False
        elif e1Type in ["Protein_catabolism", "Phosphorylation"]:
            if e2Type == "Protein":
                return True
            else:
                return False
        elif e1Type == "Localization":
            if e2IsCore or e2Type == "Entity":
                return True
            else:
                return False
        elif e1Type in ["Binding", "Process"]:
            if e2IsCore:
                return True
            else:
                return False
        elif "egulation" in e1Type:
            if e2Type != "Entity":
                return True
            else:
                return False
        elif e1Type == "Entity":
            if e2IsCore:
                return True
            else:
                return False
        assert False, (e1Type, e2Type)
    
    def isPotentialCOInteraction(self, e1, e2, sentenceGraph):
        if e1.get("type") == "Exp" and e2.get("type") == "Exp":
            anaphoraTok = sentenceGraph.entityHeadTokenByEntity[e1]
            antecedentTok = sentenceGraph.entityHeadTokenByEntity[e2]
            antecedentTokenFound = False
            for token in sentenceGraph.tokens:
                if token == antecedentTok:
                    antecedentTokenFound = True
                if token == anaphoraTok: # if, not elif, to take into accoutn cases where e1Tok == e2Tok
                    if antecedentTokenFound:
                        return True
                    else:
                        return False
            assert False
        elif e1.get("type") == "Exp" and e2.get("type") == "Protein":
            return True
        else:
            return False
    
    def isPotentialGeniaInteraction(self, e1, e2):
        e1Type = e1.get("type")
        e2Type = e2.get("type")
        if e1Type == "Protein":
            return False
        elif e1Type in ["Entity", "Gene_expression", "Transcription", "Protein_catabolism", "Phosphorylation", "Binding"]:
            if e2Type == "Protein":
                return True
            else:
                return False
        elif e1Type == "Localization":
            if e2Type in ["Protein", "Entity"]:
                return True
            else:
                return False
        elif "egulation" in e1Type:
            if e2Type != "Entity":
                return True
            else:
                return False
        assert False, (e1Type, e2Type)

    def getGoldCategoryName(self, goldGraph, entityToGold, e1, e2, directed=True):
        if len(entityToGold[e1]) > 0 and len(entityToGold[e2]) > 0:
            return self.getCategoryName(goldGraph, entityToGold[e1][0], entityToGold[e2][0], directed=directed)
        else:
            return "neg"
                
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph = None):
        """
        Build examples for a single sentence. Returns a list of examples.
        See Core/ExampleUtils for example format.
        """
        #examples = []
        exampleIndex = 0
        
        if self.styles["trigger_features"]: 
            self.triggerFeatureBuilder.initSentence(sentenceGraph)
        if self.styles["evex"]: 
            self.evexFeatureBuilder.initSentence(sentenceGraph)
            
        # Filter entities, if needed
        #mergedIds = None
        #duplicateEntities = None
        #entities = sentenceGraph.entities
        #entities, mergedIds, duplicateEntities = self.mergeEntities(sentenceGraph, False) # "no_duplicates" in self.styles)
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        entityToDuplicates = sentenceGraph.mergedEntityToDuplicates
        self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
        
        # Connect to optional gold graph
        if goldGraph != None:
            entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)
        
        paths = None
        if not self.styles["no_path"]:
            ##undirected = sentenceGraph.getUndirectedDependencyGraph()
            #undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
            ###undirected = sentenceGraph.dependencyGraph.to_undirected()
            ####undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work
            undirected = sentenceGraph.dependencyGraph.toUndirected()
            #paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
            paths = undirected
        
        #for edge in sentenceGraph.dependencyGraph.edges:
        #    assert edge[2] != None
        #for edge in undirected.edges:
        #    assert edge[2] != None
        #if sentenceGraph.sentenceElement.get("id") == "GENIA.d70.s5":
        #    print [(x[0].get("id"), x[1].get("id"), x[2].get("id")) for x in sentenceGraph.dependencyGraph.edges]
        
        # Generate examples based on interactions between entities or interactions between tokens
        if self.styles["entities"]:
            loopRange = len(entities)
        else:
            loopRange = len(sentenceGraph.tokens)
        for i in range(loopRange-1):
            for j in range(i+1,loopRange):
                eI = None
                eJ = None
                if self.styles["entities"]:
                    eI = entities[i]
                    eJ = entities[j]
                    tI = sentenceGraph.entityHeadTokenByEntity[eI]
                    tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                    #if "no_ne_interactions" in self.styles and eI.get("isName") == "True" and eJ.get("isName") == "True":
                    #    continue
                    if eI.get("type") == "neg" or eJ.get("type") == "neg":
                        continue
                    if self.styles["skip_extra_triggers"]:
                        if eI.get("source") != None or eJ.get("source") != None:
                            continue
                else:
                    tI = sentenceGraph.tokens[i]
                    tJ = sentenceGraph.tokens[j]
                # only consider paths between entities (NOTE! entities, not only named entities)
                if self.styles["headsOnly"]:
                    if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                        continue
                
                if self.styles["directed"]:
                    # define forward
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(sentenceGraph, eI, eJ, True)
                        if goldGraph != None:
                            categoryName = self.getGoldCategoryName(goldGraph, entityToGold, eI, eJ, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, True)
                    # make forward
                    self.exampleStats.beginExample(categoryName)
                    makeExample = True
                    if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(eI, eJ):
                        makeExample = False
                        self.exampleStats.filter("genia_limits")
                    if self.styles["genia_task1"] and (eI.get("type") == "Entity" or eJ.get("type") == "Entity"):
                        makeExample = False
                        self.exampleStats.filter("genia_task1")
                    if self.styles["rel_limits"] and not self.isPotentialRELInteraction(eI, eJ):
                        makeExample = False
                        self.exampleStats.filter("rel_limits")
                    if self.styles["co_limits"] and not self.isPotentialCOInteraction(eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("co_limits")
                    if self.styles["bb_limits"] and not self.isPotentialBBInteraction(eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("bb_limits")
                        if categoryName != "neg":
                            self.exampleStats.filter("bb_limits(" + categoryName + ":" + eI.get("type") + "/" + eJ.get("type") + ")")
                    if self.styles["bi_limits"] and not self.isPotentialBIInteraction(eI, eJ, sentenceGraph, self.exampleStats):
                        makeExample = False
                        #self.exampleStats.filter("bi_limits")
                    if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("epi_limits")
                    if self.styles["id_limits"] and not self.isPotentialIDInteraction(eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("id_limits")
#                    if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_limits")
#                    if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_group")
                    if self.styles["pos_only"] and categoryName == "neg":
                        makeExample = False
                        self.exampleStats.filter("pos_only")
                    if makeExample:
                        #examples.append( self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ) )
                        ExampleUtils.appendExamples([self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ)], outfile)
                        exampleIndex += 1
                    self.exampleStats.endExample()
                    
                    # define reverse
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(sentenceGraph, eJ, eI, True)
                        if goldGraph != None:
                            categoryName = self.getGoldCategoryName(goldGraph, entityToGold, eJ, eI, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tJ, tI, True)
                    # make reverse
                    self.exampleStats.beginExample(categoryName)
                    makeExample = True
                    if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(eJ, eI):
                        makeExample = False
                        self.exampleStats.filter("genia_limits")
                    if self.styles["genia_task1"] and (eI.get("type") == "Entity" or eJ.get("type") == "Entity"):
                        makeExample = False
                        self.exampleStats.filter("genia_task1")
                    if self.styles["rel_limits"] and not self.isPotentialRELInteraction(eJ, eI):
                        makeExample = False
                        self.exampleStats.filter("rel_limits")
                    if self.styles["co_limits"] and not self.isPotentialCOInteraction(eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("co_limits")
                    if self.styles["bb_limits"] and not self.isPotentialBBInteraction(eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("bb_limits")
                        if categoryName != "neg":
                            self.exampleStats.filter("bb_limits(" + categoryName + ":" + eJ.get("type") + "/" + eI.get("type") + ")")
                    if self.styles["bi_limits"] and not self.isPotentialBIInteraction(eJ, eI, sentenceGraph, self.exampleStats):
                        makeExample = False
                        #self.exampleStats.filter("bi_limits")
                    if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("epi_limits")
                    if self.styles["id_limits"] and not self.isPotentialIDInteraction(eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("id_limits")
#                    if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_limits")
#                    if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_group")
                    if self.styles["pos_only"] and categoryName == "neg":
                        makeExample = False
                        self.exampleStats.filter("pos_only")
                    if makeExample:
                        #examples.append( self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI) )
                        ExampleUtils.appendExamples([self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI)], outfile)
                        exampleIndex += 1
                    self.exampleStats.endExample()
                else:
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(sentenceGraph, eI, eJ, False)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, False)
                    self.exampleStats.beginExample(categoryName)
                    forwardExample = self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ)
                    if not self.styles["graph_kernel"]:
                        reverseExample = self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI)
                        forwardExample[2].update(reverseExample[2])
                    #examples.append(forwardExample)
                    ExampleUtils.appendExamples([forwardExample], outfile)
                    exampleIndex += 1
                    self.exampleStats.endExample()
        
        #return examples
        return exampleIndex
    
    def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, exampleIndex, entity1=None, entity2=None):
        """
        Build a single directed example for the potential edge between token1 and token2
        """
        # dummy return for speed testing
        #return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),1,{},{})
    
        # define features
        features = {}
        if True: #token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2):
            #if token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2):
            #    path = paths[token1][token2]
            #else:
            #    path = [token1, token2]
            if not self.styles["no_path"]:
                # directedPath reduces performance by 0.01 pp
                #directedPath = sentenceGraph.dependencyGraph.getPaths(token1, token2)
                #if len(directedPath) == 0:
                #    directedPath = sentenceGraph.dependencyGraph.getPaths(token2, token1)
                #    for dp in directedPath:
                #        dp.reverse()
                #if len(directedPath) == 0:
                #    path = paths.getPaths(token1, token2)
                #else:
                #    path = directedPath
                
                path = paths.getPaths(token1, token2)
                if len(path) > 0:
                    #if len(path) > 1:
                    #    print len(path)
                    path = path[0]
                    pathExists = True
                else:
                    path = [token1, token2]
                    pathExists = False
            else:
                path = [token1, token2]
                pathExists = False
            #print token1.get("id"), token2.get("id")
            assert(self.pathLengths == None)
            if self.pathLengths == None or len(path)-1 in self.pathLengths:
#                if not "no_ontology" in self.styles:
#                    self.ontologyFeatureBuilder.setFeatureVector(features)
#                    self.ontologyFeatureBuilder.buildOntologyFeaturesForPath(sentenceGraph, path)
#                    self.ontologyFeatureBuilder.setFeatureVector(None)
                if self.styles["trigger_features"]: # F 85.52 -> 85.55
                    self.triggerFeatureBuilder.setFeatureVector(features)
                    self.triggerFeatureBuilder.tag = "trg1_"
                    self.triggerFeatureBuilder.buildFeatures(token1)
                    self.triggerFeatureBuilder.tag = "trg2_"
                    self.triggerFeatureBuilder.buildFeatures(token2)
                    self.triggerFeatureBuilder.setFeatureVector(None)
                # REL features
                if self.styles["rel_features"] and not self.styles["no_task"]:
                    self.relFeatureBuilder.setFeatureVector(features)
                    self.relFeatureBuilder.tag = "rel1_"
                    self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token1))
                    self.relFeatureBuilder.tag = "rel2_"
                    self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token2))
                    self.relFeatureBuilder.setFeatureVector(None)
                if self.styles["bacteria_renaming"] and not self.styles["no_task"]:
                    self.bacteriaRenamingFeatureBuilder.setFeatureVector(features)
                    self.bacteriaRenamingFeatureBuilder.buildPairFeatures(entity1, entity2)
                    #self.bacteriaRenamingFeatureBuilder.buildSubstringFeatures(entity1, entity2) # decreases perf. 74.76 -> 72.41
                    self.bacteriaRenamingFeatureBuilder.setFeatureVector(None)
                if self.styles["co_limits"] and not self.styles["no_task"]:
                    e1Offset = Range.charOffsetToSingleTuple(entity1.get("charOffset"))
                    e2Offset = Range.charOffsetToSingleTuple(entity2.get("charOffset"))
                    if Range.contains(e1Offset, e2Offset):
                        features[self.featureSet.getId("e1_contains_e2")] = 1
                        if entity2.get("isName") == "True":
                            features[self.featureSet.getId("e1_contains_e2name")] = 1
                    if Range.contains(e2Offset, e1Offset):
                        features[self.featureSet.getId("e2_contains_e1")] = 1
                        if entity1.get("isName") == "True":
                            features[self.featureSet.getId("e2_contains_e1name")] = 1
                if self.styles["ddi_features"]:
                    self.drugFeatureBuilder.setFeatureVector(features)
                    self.drugFeatureBuilder.tag = "ddi_"
                    self.drugFeatureBuilder.buildPairFeatures(entity1, entity2)  
                    if self.styles["ddi_mtmx"]:
                        self.drugFeatureBuilder.buildMTMXFeatures(entity1, entity2)
                    self.drugFeatureBuilder.setFeatureVector(None)
                #if "graph_kernel" in self.styles or not "no_dependency" in self.styles:
                #    #print "Getting edges"
                #    if token1 != token2 and pathExists:
                #        #print "g1"
                #        edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path)
                #        #print "g2"
                #    else:
                #        edges = None
                if self.styles["graph_kernel"]:
                    self.graphKernelFeatureBuilder.setFeatureVector(features, entity1, entity2)
                    self.graphKernelFeatureBuilder.buildGraphKernelFeatures(sentenceGraph, path)
                    self.graphKernelFeatureBuilder.setFeatureVector(None)
                if self.styles["entity_type"]:
                    features[self.featureSet.getId("e1_"+entity1.get("type"))] = 1
                    features[self.featureSet.getId("e2_"+entity2.get("type"))] = 1
                    features[self.featureSet.getId("distance_"+str(len(path)))] = 1
                if not self.styles["no_dependency"]:
                    #print "Dep features"
                    self.multiEdgeFeatureBuilder.setFeatureVector(features, entity1, entity2)
                    #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast
                    if not self.styles["disable_entity_features"]:
                        self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
                    self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
                    if not self.styles["disable_terminus_features"]:
                        self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast
                    if not self.styles["disable_single_element_features"]:
                        self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, sentenceGraph)
                    if not self.styles["disable_ngram_features"]:
                        #print "NGrams"
                        self.multiEdgeFeatureBuilder.buildPathGrams(2, path, sentenceGraph) # remove for fast
                        self.multiEdgeFeatureBuilder.buildPathGrams(3, path, sentenceGraph) # remove for fast
                        self.multiEdgeFeatureBuilder.buildPathGrams(4, path, sentenceGraph) # remove for fast
                    #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast
                    #if edges != None:
                    #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast
                    #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast
                    if not self.styles["disable_path_edge_features"]:
                        self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, sentenceGraph)
                    self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
                    self.multiEdgeFeatureBuilder.setFeatureVector(None)
                if self.styles["nodalida"]:
                    self.nodalidaFeatureBuilder.setFeatureVector(features, entity1, entity2)
                    shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(sentenceGraph.dependencyGraph, path)
                    print shortestPaths
                    if len(shortestPaths) > 0:
                        self.nodalidaFeatureBuilder.buildNGrams(shortestPaths, sentenceGraph)
                    self.nodalidaFeatureBuilder.setFeatureVector(None)
                if not self.styles["no_linear"]:
                    self.tokenFeatureBuilder.setFeatureVector(features)
                    for i in range(len(sentenceGraph.tokens)):
                        if sentenceGraph.tokens[i] == token1:
                            token1Index = i
                        if sentenceGraph.tokens[i] == token2:
                            token2Index = i
                    linearPreTag = "linfw_"
                    if token1Index > token2Index: 
                        token1Index, token2Index = token2Index, token1Index
                        linearPreTag = "linrv_"
                    self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1")
                    self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2")
                    # Before, middle, after
    #                self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf")
    #                self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw")
    #                self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af")
                    # before-middle, middle, middle-after
#                    self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2)
#                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2)
#                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2)
                    self.tokenFeatureBuilder.setFeatureVector(None)
                if self.styles["random"]:
                    self.randomFeatureBuilder.setFeatureVector(features)
                    self.randomFeatureBuilder.buildRandomFeatures(100, 0.01)
                    self.randomFeatureBuilder.setFeatureVector(None)
                if self.styles["genia_limits"] and not self.styles["no_task"]:
                    e1Type = entity1.get("type")
                    e2Type = entity2.get("type")
                    assert(entity1.get("isName") == "False")
                    if entity2.get("isName") == "True":
                        features[self.featureSet.getId("GENIA_target_protein")] = 1
                    else:
                        features[self.featureSet.getId("GENIA_nested_event")] = 1
                    if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization
                        if entity2.get("isName") == "True":
                            features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1
                        else:
                            features[self.featureSet.getId("GENIA_regulation_of_event")] = 1
                if self.styles["bi_limits"]:
                    # Make features based on entity types
                    e1Type = entity1.get("type")
                    e2Type = entity2.get("type")
                    e1SuperType = str(self.getBISuperType(e1Type))
                    e2SuperType = str(self.getBISuperType(e2Type))
                    features[self.featureSet.getId("BI_e1_"+e1Type)] = 1
                    features[self.featureSet.getId("BI_e2_"+e2Type)] = 1
                    features[self.featureSet.getId("BI_e1sup_"+e1SuperType)] = 1
                    features[self.featureSet.getId("BI_e2sup_"+e2SuperType)] = 1
                    features[self.featureSet.getId("BI_e1e2_"+e1Type+"_"+e2Type)] = 1
                    features[self.featureSet.getId("BI_e1e2sup_"+e1SuperType+"_"+e2SuperType)] = 1
                if self.styles["evex"]:
                    self.evexFeatureBuilder.setFeatureVector(features, entity1, entity2)
                    self.evexFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph)
                    self.evexFeatureBuilder.setFeatureVector(None)
                if self.styles["giuliano"]:
                    self.giulianoFeatureBuilder.setFeatureVector(features, entity1, entity2)
                    self.giulianoFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph)
                    self.giulianoFeatureBuilder.setFeatureVector(None)
            else:
                features[self.featureSet.getId("always_negative")] = 1
                if self.styles["subset"]:
                    features[self.featureSet.getId("out_of_scope")] = 1
        else:
            features[self.featureSet.getId("always_negative")] = 1
            if self.styles["subset"]:
                features[self.featureSet.getId("out_of_scope")] = 1
            path = [token1, token2]
        # define extra attributes
        #if int(path[0].get("id").split("_")[-1]) < int(path[-1].get("id").split("_")[-1]):
        if int(path[0].get("charOffset").split("-")[0]) < int(path[-1].get("charOffset").split("-")[0]):
            #extra = {"xtype":"edge","type":"i","t1":path[0],"t2":path[-1]}
            extra = {"xtype":"edge","type":"i","t1":path[0].get("id"),"t2":path[-1].get("id")}
            extra["deprev"] = False
        else:
            #extra = {"xtype":"edge","type":"i","t1":path[-1],"t2":path[0]}
            extra = {"xtype":"edge","type":"i","t1":path[-1].get("id"),"t2":path[0].get("id")}
            extra["deprev"] = True
        if entity1 != None:
            #extra["e1"] = entity1
            extra["e1"] = entity1.get("id")
            if sentenceGraph.mergedEntityToDuplicates != None:
                #extra["e1GoldIds"] = mergedEntityIds[entity1]
                extra["e1DuplicateIds"] = ",".join([x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity1]])
        if entity2 != None:
            #extra["e2"] = entity2
            extra["e2"] = entity2.get("id")
            if sentenceGraph.mergedEntityToDuplicates != None:
                extra["e2DuplicateIds"] = ",".join([x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity2]])
                #extra["e2GoldIds"] = mergedEntityIds[entity2]
        extra["categoryName"] = categoryName
        if self.styles["bacteria_renaming"]:
            if entity1.get("text") != None and entity1.get("text") != "":
                extra["e1t"] = entity1.get("text").replace(" ", "---").replace(":","-COL-")
            if entity2.get("text") != None and entity2.get("text") != "":
                extra["e2t"] = entity2.get("text").replace(" ", "---").replace(":","-COL-")
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId != None:
            extra["SOID"] = sentenceOrigId       
        # make example
        if self.styles["binary"]:
            if categoryName != "neg":
                category = 1
            else:
                category = -1
            categoryName = "i"
        else:
            category = self.classSet.getId(categoryName)
        
        # NOTE: temporarily disable for replicating 110310 experiment
        #features[self.featureSet.getId("extra_constant")] = 1
        return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)
示例#2
0
class EntityExampleBuilder(ExampleBuilder):
    def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
        if classSet == None:
            classSet = IdSet(1)
        if featureSet == None:
            featureSet = IdSet()
        
        ExampleBuilder.__init__(self, classSet, featureSet)
        assert( classSet.getId("neg") == 1 )
        #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
        if gazetteerFileName!=None:
            self.gazetteer=Gazetteer.loadGztr(gazetteerFileName)
            print >> sys.stderr, "Loaded gazetteer from",gazetteerFileName
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer=None
        self._setDefaultParameters(["rel_features", "wordnet", "bb_features", "giuliano", 
                                  "epi_merge_negated", "limit_merged_types", "genia_task1",
                                  "names", "build_for_nameless", "skip_for_nameless",
                                  "pos_only", "all_tokens", "pos_pairs", "linear_ngrams", 
                                  "phospho", "drugbank_features", "ddi13_features", "metamap"])
        self.styles = self.getParameters(style)
#        if "selftrain_group" in self.styles:
#            self.selfTrainGroups = set()
#            if "selftrain_group-1" in self.styles:
#                self.selfTrainGroups.add("-1")
#            if "selftrain_group0" in self.styles:
#                self.selfTrainGroups.add("0")
#            if "selftrain_group1" in self.styles:
#                self.selfTrainGroups.add("1")
#            if "selftrain_group2" in self.styles:
#                self.selfTrainGroups.add("2")
#            if "selftrain_group3" in self.styles:
#                self.selfTrainGroups.add("3")
#            print >> sys.stderr, "Self-train-groups:", self.selfTrainGroups
        
        self.skiplist = set()
        if skiplist != None:
            f = open(skiplist, "rt")
            for line in f.readlines():
                self.skiplist.add(line.strip())
            f.close()
        
        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(featureSet)
        if self.styles["wordnet"]:
            self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
        if self.styles["bb_features"]:
            self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens()
            #self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens(PhraseTriggerExampleBuilder.getBacteriaNames())
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
        if self.styles["drugbank_features"]:
            self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
    
    def getMergedEntityType(self, entities):
        """
        If a single token belongs to multiple entities of different types,
        a new, composite type is defined. This type is the alphabetically
        ordered types of these entities joined with '---'.
        """
        types = set()
        entityIds = set()
        for entity in entities:
            if entity.get("given") == "True" and self.styles["all_tokens"]:
                continue
            if entity.get("type") == "Entity" and self.styles["genia_task1"]:
                continue
            if self.styles["epi_merge_negated"]:
                types.add(Utils.InteractionXML.ResolveEPITriggerTypes.getEPIBaseType(entity.get("type")))
                entityIds.add(entity.get("id"))
            else:
                types.add(entity.get("type"))
                entityIds.add(entity.get("id"))
        types = list(types)
        types.sort()
        typeString = ""
        for type in types:
            #if type == "Protein" and "all_tokens" in self.styles:
            #    continue
            if typeString != "":
                typeString += "---"
            typeString += type
        
        if typeString == "":
            return "neg", None
        
        idString = "/".join(sorted(list(entityIds)))
        
        if self.styles["limit_merged_types"]:
            if typeString.find("---") != -1:
                if typeString == "Gene_expression---Positive_regulation":
                    return typeString, idString
                else:
                    return typeString.split("---")[0], idString # ids partially incorrect
            else:
                return typeString, idString
        return typeString, idString
    
    def getMetaMapFeatures(self, token, sentenceGraph, features):
        analyses = sentenceGraph.sentenceElement.find("analyses")
        if analyses == None:
            return
        metamap = analyses.find("metamap")
        if metamap == None:
            return
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        skipAttr = set(["charOffset", "text"])
        for phrase in metamap.findall("phrase"):
            phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
            if Range.overlap(tokenOffset, phraseOffset):
                attr = phrase.attrib
                attrNames = sorted(attr.keys())
                for attrName in attrNames:
                    if attrName in skipAttr:
                        continue
                    elif attrName == "score":
                        features["_metamap_score"] = 0.001 * abs(int(attr[attrName]))
                    else:
                        attrValues = attr[attrName].split(",")
                        for attrValue in attrValues: 
                            features["_metamap_"+attrName+"_"+attrValue.replace(" ", "-")] = 1
    
    def getTokenFeatures(self, token, sentenceGraph):
        """
        Returns a list of features based on the attributes of a token.
        These can be used to define more complex features.
        """
        # These features are cached when this method is first called
        # for a token.
        if self.tokenFeatures.has_key(token):
            return self.tokenFeatures[token], self.tokenFeatureWeights[token]
        tokTxt=sentenceGraph.getTokenText(token)
        features = {}
        features["_txt_"+tokTxt]=1
        features["_POS_"+token.get("POS")]=1
        if sentenceGraph.tokenIsName[token] and not self.styles["names"]:
            features["_given"]=1
            for entity in sentenceGraph.tokenIsEntityHead[token]:
                if entity.get("given") == "True":
                    features["_annType_"+entity.get("type")]=1
        if self.styles["metamap"]:
            self.getMetaMapFeatures(token, sentenceGraph, features)
#        # Filip's gazetteer based features (can be used separately from exclude_gazetteer)
#        if "gazetteer_features" in self.styles:
#            tokTxtLower = tokTxt.lower()
#            if "stem_gazetteer" in self.styles:
#                tokTxtLower = PorterStemmer.stem(tokTxtLower)
#            if self.gazetteer and tokTxtLower in self.gazetteer:
#                for label,weight in self.gazetteer[tokTxtLower].items():
#                    features["_knownLabel_"+label]=weight # 1 performs slightly worse
        ## BANNER features
        #if sentenceGraph.entityHintsByToken.has_key(token):
        #    features["BANNER-entity"] = 1
        # Wordnet features
        #if "wordnet" in self.styles:
        #    for wordNetFeature in self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, token.get("POS")):
        #        features["_WN_"+wordNetFeature] = 1
        self.tokenFeatures[token] = sorted(features.keys())
        self.tokenFeatureWeights[token] = features
        return self.tokenFeatures[token], self.tokenFeatureWeights[token]
    
    def buildLinearOrderFeatures(self,sentenceGraph,index,tag,features):
        """
        Linear features are built by marking token features with a tag
        that defines their relative position in the linear order.
        """
        tag = "linear_"+tag
        tokenFeatures, tokenFeatureWeights = self.getTokenFeatures(sentenceGraph.tokens[index], sentenceGraph)
        for tokenFeature in tokenFeatures:
            features[self.featureSet.getId(tag+tokenFeature)] = tokenFeatureWeights[tokenFeature]
    
    def buildLinearNGram(self, i, j, sentenceGraph, features):
        ngram = "ngram"
        for index in range(i, j+1):
            ngram += "_" + sentenceGraph.getTokenText(sentenceGraph.tokens[index]).lower()
        features[self.featureSet.getId(ngram)] = 1
    
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
        """
        Build one example for each token of the sentence
        """       
        if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
            print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get("origId") 
            return 0 #[]
        
        #examples = []
        exampleIndex = 0
        
        self.tokenFeatures = {}
        self.tokenFeatureWeights = {}
        
        # determine (manually or automatically) the setting for whether sentences with no given entities should be skipped
        buildForNameless = False
        if structureAnalyzer and not structureAnalyzer.hasGroupClass("GIVEN", "ENTITY"): # no given entities points to no separate NER program being used
            buildForNameless = True
        if self.styles["build_for_nameless"]: # manually force the setting
            buildForNameless = True
        if self.styles["skip_for_nameless"]: # manually force the setting
            buildForNameless = False
        
        # determine whether sentences with no given entities should be skipped
        namedEntityHeadTokens = []
        if not self.styles["names"]:
            namedEntityCount = 0
            for entity in sentenceGraph.entities:
                if entity.get("given") == "True": # known data which can be used for features
                    namedEntityCount += 1
            namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
            # NOTE!!! This will change the number of examples and omit
            # all triggers (positive and negative) from sentences which
            # have no NE:s, possibly giving a too-optimistic performance
            # value. Such sentences can still have triggers from intersentence
            # interactions, but as such events cannot be recovered anyway,
            # looking for these triggers would be pointless.
            if namedEntityCount == 0 and not buildForNameless: # no names, no need for triggers
                return 0 #[]
            
            if self.styles["pos_pairs"]:
                namedEntityHeadTokens = self.getNamedEntityHeadTokens(sentenceGraph)
        else:
            for key in sentenceGraph.tokenIsName.keys():
                sentenceGraph.tokenIsName[key] = False
        
        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
        bowFeatures = {}
        for k in sorted(bagOfWords.keys()):
            bowFeatures[self.featureSet.getId(k)] = bagOfWords[k]
        
        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            #inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
            #fixedInEdges = []
            #for edge in inEdges:
            #    fixedInEdges.append( (edge[0], edge[1], edge[2]["element"]) )
            #inEdges = fixedInEdges
            inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
            #inEdges.sort(compareDependencyEdgesById)
            self.inEdgesByToken[token] = inEdges
            #outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True)
            #fixedOutEdges = []
            #for edge in outEdges:
            #    fixedOutEdges.append( (edge[0], edge[1], edge[2]["element"]) )
            #outEdges = fixedOutEdges
            outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
            #outEdges.sort(compareDependencyEdgesById)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)
        
        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]

            # CLASS
            if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
                categoryName, entityIds = self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token])
            else:
                categoryName, entityIds = "neg", None
            self.exampleStats.beginExample(categoryName)
            
            # Recognize only non-named entities (i.e. interaction words)
            if sentenceGraph.tokenIsName[token] and not self.styles["names"] and not self.styles["all_tokens"]:
                self.exampleStats.filter("name")
                self.exampleStats.endExample()
                continue
#            if "selftrain_limits" in self.styles:
#                # any predicted entity not part of the self-training set causes example to be rejected
#                filtered = False
#                for entity in sentenceGraph.tokenIsEntityHead[token]:
#                    if entity.get("selftrain") == "False":
#                        self.exampleStats.filter("selftrain_limits")
#                        self.exampleStats.endExample()
#                        filtered = True
#                        break
#                if filtered:
#                    continue
#            if "selftrain_group" in self.styles:
#                # any predicted entity not part of the self-training set causes example to be rejected
#                filtered = False
#                for entity in sentenceGraph.tokenIsEntityHead[token]:
#                    if entity.get("selftraingroup") not in self.selfTrainGroups:
#                        self.exampleStats.filter("selftrain_group")
#                        self.exampleStats.endExample()
#                        filtered = True
#                        break
#                if filtered:
#                    continue
            if self.styles["pos_only"] and categoryName == "neg":
                self.exampleStats.filter("pos_only")
                self.exampleStats.endExample()
                continue

            category = self.classSet.getId(categoryName)
            if category == None:
                self.exampleStats.filter("undefined_class")
                self.exampleStats.endExample()
                continue           
            
            tokenText = token.get("text").lower()
#            if "stem_gazetteer" in self.styles:
#                tokenText = PorterStemmer.stem(tokenText)
#            if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer:
#                features = {}
#                features[self.featureSet.getId("exclude_gazetteer")] = 1
#                extra = {"xtype":"token","t":token.get("id"),"excluded":"True"}
#                if entityIds != None:
#                    extra["goldIds"] = entityIds
#                #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
#                ExampleUtils.appendExamples([(sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)], outfile)
#                exampleIndex += 1
#                continue
            
            # FEATURES
            features = {}
            
            if not self.styles["names"]:
                features[self.featureSet.getId(namedEntityCountFeature)] = 1
            #for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # pre-calculate bow _features_
            features.update(bowFeatures)
            
#            for j in range(len(sentenceGraph.tokens)):
#                text = "bow_" + sentenceGraph.tokens[j].get("text")
#                if j < i:
#                    features[self.featureSet.getId("bf_" + text)] = 1
#                elif j > i:
#                    features[self.featureSet.getId("af_" + text)] = 1
        
            # Main features
            text = token.get("text")
            features[self.featureSet.getId("txt_"+text)] = 1
            features[self.featureSet.getId("POS_"+token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_"+stem)] = 1
            features[self.featureSet.getId("nonstem_"+text[len(stem):])] = 1

            # Normalized versions of the string (if same as non-normalized, overlap without effect)
            normalizedText = text.replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower()
            if normalizedText == "bound": # should be for all irregular verbs
                normalizedText = "bind"
            features[self.featureSet.getId("txt_"+normalizedText)] = 1
            norStem = PorterStemmer.stem(normalizedText)
            features[self.featureSet.getId("stem_"+norStem)] = 1
            features[self.featureSet.getId("nonstem_"+normalizedText[len(norStem):])] = 1
            
            ## Subspan features
            #textLower = text.lower()
            #for i in range(1, len(textLower)):
            #    features[self.featureSet.getId("subspanbegin"+str(i)+"_"+textLower[0:i])] = 1
            #    features[self.featureSet.getId("subspanend"+str(i)+"_"+textLower[-i:])] = 1
            
            # Substring features
            for string in text.split("-"):
                stringLower = string.lower()
                features[self.featureSet.getId("substring_"+stringLower)] = 1
                features[self.featureSet.getId("substringstem_"+PorterStemmer.stem(stringLower))] = 1
            
            # Linear order features
            for index in [-3,-2,-1,1,2,3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)

            # Linear n-grams
            if self.styles["linear_ngrams"]:
                self.buildLinearNGram(max(0, i-1), i, sentenceGraph, features)
                self.buildLinearNGram(max(0, i-2), i, sentenceGraph, features)
            
            if self.styles["phospho"]:
                if text.find("hospho") != -1:
                    features[self.featureSet.getId("phospho_found")] = 1
                features[self.featureSet.getId("begin_"+text[0:2].lower())] = 1
                features[self.featureSet.getId("begin_"+text[0:3].lower())] = 1
                
            if self.styles["bb_features"]:
                if text.lower() in self.bacteriaTokens:
                    features[self.featureSet.getId("lpsnBacToken")] = 1

            # Content
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j-1] == "-":
                        features[self.featureSet.getId("has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_"+text[j-1:j+1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_"+text[j-2:j+1].lower())] = 1
                # quadruplets (don't work, slight decrease (0.5 pp) on f-score
                #if j > 2:
                #    features[self.featureSet.getId("qt_"+text[j-3:j+1].lower())] = 1
            
            # Attached edges (Hanging in and out edges)
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_"+edgeType)] = 1
                features[self.featureSet.getId("t1HIn_"+edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_"+tokenText)] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenText)] = 1
                tokenStem = PorterStemmer.stem(tokenText)
                features[self.featureSet.getId("t1HIn_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HIn_"+norStem+"_"+edgeType+"_"+tokenStem)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_"+edgeType)] = 1
                features[self.featureSet.getId("t1HOut_"+edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_"+tokenText)] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenText)] = 1
                tokenStem = PorterStemmer.stem(tokenText)
                features[self.featureSet.getId("t1HOut_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HOut_"+norStem+"_"+edgeType+"_"+tokenStem)] = 1
            
            # REL features
            if self.styles["rel_features"]:
                self.relFeatureBuilder.setFeatureVector(features)
                self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, i)
                self.relFeatureBuilder.setFeatureVector(None)
            
            # DDI13 features
            if self.styles["ddi13_features"]:
                for index in range(len(normalizedText)):
                    features[self.featureSet.getId("ddi13_fromstart" + str(index) + "_" + normalizedText[:index+1])] = 1
                    features[self.featureSet.getId("ddi13_fromend" + str(index) + "_" + normalizedText[index:])] = 1
            if self.styles["drugbank_features"]:
                self.drugFeatureBuilder.setFeatureVector(features)
                self.drugFeatureBuilder.tag = "ddi_"
                self.drugFeatureBuilder.buildDrugFeatures(token)  
                self.drugFeatureBuilder.setFeatureVector(None)
            
            #self.wordNetFeatureBuilder.getTokenFeatures("show", "VBP")
            #tokTxt = token.get("text")
            #tokPOS = token.get("POS")
            #wordNetFeatures = []
            #wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            #self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            if self.styles["wordnet"]:
                tokTxt = token.get("text")
                tokPOS = token.get("POS")
                wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
                for wordNetFeature in wordNetFeatures:
                    #print wordNetFeature,
                    features[self.featureSet.getId("WN_"+wordNetFeature)] = 1
                #print
            
            if self.styles["giuliano"]:
                self.giulianoFeatureBuilder.setFeatureVector(features)
                self.giulianoFeatureBuilder.buildTriggerFeatures(token, sentenceGraph)
                self.giulianoFeatureBuilder.setFeatureVector(None)
                             
            extra = {"xtype":"token","t":token.get("id")}
            if self.styles["bb_features"]:
                extra["trigex"] = "bb" # Request trigger extension in ExampleWriter
            if self.styles["epi_merge_negated"]:
                extra["unmergeneg"] = "epi" # Request trigger type unmerging
            if entityIds != None:
                extra["goldIds"] = entityIds # The entities to which this example corresponds
            #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
            
            # chains
            self.buildChains(token, sentenceGraph, features)
            
            if self.styles["pos_pairs"]:
                self.buildPOSPairs(token, namedEntityHeadTokens, features)
            
            example = (sentenceGraph.getSentenceId()+".x"+str(exampleIndex), category, features, extra)
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1
            self.exampleStats.endExample()
        #return examples
        return exampleIndex
    
    def buildChains(self,token,sentenceGraph,features,depthLeft=3,chain="",visited=None):
        if depthLeft == 0:
            return
        strDepthLeft = "dist_" + str(depthLeft)
        
        if visited == None:
            visited = set()

        inEdges = self.inEdgesByToken[token]
        outEdges = self.outEdgesByToken[token]
        edgeSet = visited.union(self.edgeSetByToken[token])
        for edge in inEdges:
            if not edge in visited:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("dep_"+strDepthLeft+edgeType)] = 1

                nextToken = edge[0]
                tokenFeatures, tokenWeights = self.getTokenFeatures(nextToken, sentenceGraph)
                for tokenFeature in tokenFeatures:
                    features[self.featureSet.getId(strDepthLeft + tokenFeature)] = tokenWeights[tokenFeature]
#                for entity in sentenceGraph.tokenIsEntityHead[nextToken]:
#                    if entity.get("given") == "True":
#                        features[self.featureSet.getId("name_dist_"+strDepthLeft)] = 1
#                        features[self.featureSet.getId("name_dist_"+strDepthLeft+entity.get("type"))] = 1
#                features[self.featureSet.getId("POS_dist_"+strDepthLeft+nextToken.get("POS"))] = 1
#                tokenText = sentenceGraph.getTokenText(nextToken)
#                features[self.featureSet.getId("text_dist_"+strDepthLeft+tokenText)] = 1
                
                if sentenceGraph.tokenIsName[nextToken] and not self.styles["names"]:
                    features[self.featureSet.getId("name_chain_dist_"+strDepthLeft+chain+"-frw_"+edgeType)] = 1
                features[self.featureSet.getId("chain_dist_"+strDepthLeft+chain+"-frw_"+edgeType)] = 1
                self.buildChains(nextToken,sentenceGraph,features,depthLeft-1,chain+"-frw_"+edgeType,edgeSet)

        for edge in outEdges:
            if not edge in visited:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("dep_dist_"+strDepthLeft+edgeType)] = 1

                nextToken = edge[1]
                tokenFeatures, tokenWeights = self.getTokenFeatures(nextToken, sentenceGraph)
                for tokenFeature in tokenFeatures:
                    features[self.featureSet.getId(strDepthLeft + tokenFeature)] = tokenWeights[tokenFeature]
#                for entity in sentenceGraph.tokenIsEntityHead[nextToken]:
#                    if entity.get("given") == "True":
#                        features[self.featureSet.getId("name_dist_"+strDepthLeft)] = 1
#                        features[self.featureSet.getId("name_dist_"+strDepthLeft+entity.get("type"))] = 1
#                features[self.featureSet.getId("POS_dist_"+strDepthLeft+nextToken.get("POS"))] = 1
#                tokenText = sentenceGraph.getTokenText(nextToken)
#                features[self.featureSet.getId("text_dist_"+strDepthLeft+tokenText)] = 1
                if sentenceGraph.tokenIsName[nextToken] and not self.styles["names"]:
                    features[self.featureSet.getId("name_chain_dist_"+strDepthLeft+chain+"-rev_"+edgeType)] = 1
                
                features[self.featureSet.getId("chain_dist_"+strDepthLeft+chain+"-rev_"+edgeType)] = 1
                self.buildChains(nextToken,sentenceGraph,features,depthLeft-1,chain+"-rev_"+edgeType,edgeSet)
    
    def getNamedEntityHeadTokens(self, sentenceGraph):
        headTokens = []
        for entity in sentenceGraph.entities:
            if entity.get("given") == "True": # known data which can be used for features
                headTokens.append(sentenceGraph.entityHeadTokenByEntity[entity])
        return headTokens
                
    def buildPOSPairs(self, token, namedEntityHeadTokens, features):
        tokenPOS = token.get("POS")
        assert tokenPOS != None
        for headToken in namedEntityHeadTokens:
            headPOS = headToken.get("POS")
            features[self.featureSet.getId("POS_pair_NE_"+tokenPOS+"-"+headPOS)] = 1
示例#3
0
class EdgeExampleBuilder(ExampleBuilder):
    """
    This example builder makes edge examples, i.e. examples describing
    the event arguments.
    """
    def __init__(self, style=None, types=[], featureSet=None, classSet=None):
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        else:
            classSet = classSet
        
        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        assert( classSet.getId("neg") == 1 or (len(classSet.Ids)== 2 and classSet.getId("neg") == -1) )
        
        # Basic style = trigger_features:typed:directed:no_linear:entities:auto_limits:noMasking:maxFeatures
        self._setDefaultParameters([
            "directed", "undirected", "headsOnly", "graph_kernel", "noAnnType", "mask_nodes", "limit_features",
            "no_auto_limits", "co_features", "genia_features", "bi_features", #"genia_limits", "epi_limits", "id_limits", "rel_limits", "bb_limits", "bi_limits", "co_limits",
            "genia_task1", "ontology", "nodalida", "bacteria_renaming", "no_trigger_features", "rel_features",
            "drugbank_features", "ddi_mtmx", "evex", "giuliano", "random", "themeOnly", "causeOnly", "no_path", "token_nodes", 
            "skip_extra_triggers", "headsOnly", "graph_kernel", "no_task", "no_dependency", 
            "disable_entity_features", "disable_terminus_features", "disable_single_element_features", 
            "disable_ngram_features", "disable_path_edge_features", "linear_features", "subset", "binary", "pos_only",
            "entity_type", "filter_shortest_path", "maskTypeAsProtein", "keep_neg", "metamap", 
            "sdb_merge", "sdb_features", "ontobiotope_features", "no_self_loops", "full_entities",
            "no_features", "wordnet", "wordvector", "se10t8_undirected", "filter_types", "doc_extra",
            "entity_extra"])
        self.styles = self.getParameters(style)
        #if style == None: # no parameters given
        #    style["typed"] = style["directed"] = style["headsOnly"] = True
        
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet, self.styles)
        # NOTE Temporarily re-enabling predicted range
        #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None)
        if self.styles["graph_kernel"]:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
        if self.styles["noAnnType"]:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if self.styles["mask_nodes"]:
            self.multiEdgeFeatureBuilder.maskNamedEntities = True
        else:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if not self.styles["limit_features"]:
            self.multiEdgeFeatureBuilder.maximum = True
        if self.styles["genia_task1"]:
            self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity")
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        if self.styles["ontology"]:
            self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
        if self.styles["ontobiotope_features"]:
            self.ontobiotopeFeatureBuilder = OntoBiotopeFeatureBuilder(self.featureSet)
        if self.styles["nodalida"]:
            self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
        if self.styles["bacteria_renaming"]:
            self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(self.featureSet)
        if not self.styles["no_trigger_features"]:
            self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet, self.styles)
            self.triggerFeatureBuilder.useNonNameEntities = True
            if self.styles["noAnnType"]:
                self.triggerFeatureBuilder.noAnnType = True
            if self.styles["genia_task1"]:
                self.triggerFeatureBuilder.filterAnnTypes.add("Entity")
            #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(featureSet)
        if self.styles["drugbank_features"]:
            self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
        if self.styles["evex"]:
            self.evexFeatureBuilder = EVEXFeatureBuilder(featureSet)
        if self.styles["wordnet"]:
            self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
        if self.styles["wordvector"]:
            self.wordVectorFeatureBuilder = WordVectorFeatureBuilder(featureSet, self.styles)
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
        self.types = types
        if self.styles["random"]:
            from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
            self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
    
    def definePredictedValueRange(self, sentences, elementName):
        self.multiEdgeFeatureBuilder.definePredictedValueRange(sentences, elementName)                        
    
    def getPredictedValueRange(self):
        return self.multiEdgeFeatureBuilder.predictedRange
    
    def filterEdgesByType(self, edges, typesToInclude):
        if len(typesToInclude) == 0:
            return edges
        edgesToKeep = []
        for edge in edges:
            if edge.get("type") in typesToInclude:
                edgesToKeep.append(edge)
        return edgesToKeep
    
    def getCategoryNameFromTokens(self, sentenceGraph, t1, t2, directed=True):
        """
        Example class. Multiple overlapping edges create a merged type.
        """
        types = set()
        intEdges = sentenceGraph.interactionGraph.getEdges(t1, t2)
        if not directed:
            intEdges = intEdges + sentenceGraph.interactionGraph.getEdges(t2, t1)
        for intEdge in intEdges:
            types.add(intEdge[2].get("type"))
        types = list(types)
        types.sort()
        categoryName = ""
        for name in types:
            if categoryName != "":
                categoryName += "---"
            categoryName += name
        if categoryName != "":
            return categoryName
        else:
            return "neg"
        
    def getCategoryName(self, sentenceGraph, e1, e2, directed=True):
        """
        Example class. Multiple overlapping edges create a merged type.
        """
        interactions = sentenceGraph.getInteractions(e1, e2, True)
        if not directed and not self.styles["se10t8_undirected"]:
            interactions = interactions + sentenceGraph.getInteractions(e2, e1, True)
        
        types = set()
        for interaction in interactions:
            types.add(interaction[2].get("type"))
        types = list(types)
        types.sort()
        categoryName = ""
        for name in types:
            if self.styles["causeOnly"] and name != "Cause":
                continue
            if self.styles["themeOnly"] and name != "Theme":
                continue
            if categoryName != "":
                categoryName += "---"
            if self.styles["sdb_merge"]:
                name = self.mergeForSeeDev(name, self.structureAnalyzer)
            categoryName += name
        if categoryName != "":
            return categoryName
        else:
            return "neg"

    def getBISuperType(self, eType):
        if eType in ["GeneProduct", "Protein", "ProteinFamily", "PolymeraseComplex"]:
            return "ProteinEntity"
        elif eType in ["Gene", "GeneFamily", "GeneComplex", "Regulon", "Site", "Promoter"]:
            return "GeneEntity"
        else:
            return None
    
    def getSeeDevSuperTypes(self, eType):
        if eType in ("Gene", "Gene_Family", "Box", "Promoter"):
            return ("DNA", "Molecule")
        elif eType == "RNA":
            return ("RNA", "DNA_Product", "Molecule")
        elif eType in ("Protein", "Protein_Family", "Protein_Complex", "Protein_Domain"):
            return ("Amino_acid_sequence", "DNA_Product", "Molecule")
        elif eType == "Hormone":
            return ("Molecule",)
        elif eType in ("Regulatory_Network", "Pathway"):
            return ("Dynamic_process",)
        elif eType in ("Genotype", "Tissue", "Development_Phase"):
            return ("Biological_context", "Context")
        elif eType == "Environmental_Factor":
            return ("Context",)
        else:
            raise Exception("Unknown SeeDev type '" + str(eType) + "'")
    
    def mergeForSeeDev(self, categoryName, structureAnalyzer):
        if categoryName in structureAnalyzer.typeMap["forward"]:
            return structureAnalyzer.typeMap["forward"][categoryName]
        return categoryName
#         for tag in ("Regulates", "Exists", "Interacts", "Is", "Occurs"):
#             if categoryName.startswith(tag):
#                 categoryName = tag
#                 break
#         return categoryName
    
    def processCorpus(self, input, output, gold=None, append=False, allowNewIds=True, structureAnalyzer=None):
        if self.styles["sdb_merge"]:
            structureAnalyzer.determineNonOverlappingTypes()
            self.structureAnalyzer = structureAnalyzer
        ExampleBuilder.processCorpus(self, input, output, gold, append, allowNewIds, structureAnalyzer)
    
    def isValidInteraction(self, e1, e2, structureAnalyzer,forceUndirected=False):
        return len(structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type"), forceUndirected=forceUndirected)) > 0

    def getGoldCategoryName(self, goldGraph, entityToGold, e1, e2, directed=True):
        if len(entityToGold[e1]) > 0 and len(entityToGold[e2]) > 0:
            return self.getCategoryName(goldGraph, entityToGold[e1][0], entityToGold[e2][0], directed=directed)
        else:
            return "neg"
    
    def filterEdge(self, edge, edgeTypes):
        import types
        assert edgeTypes != None
        if type(edgeTypes) not in [types.ListType, types.TupleType]:
            edgeTypes = [edgeTypes]
        if edge[2].get("type") in edgeTypes:
            return True
        else:
            return False
    
    def keepExample(self, e1, e2, categoryName, isDirected, structureAnalyzer):
        makeExample = True
        if (not self.styles["no_auto_limits"]) and not self.isValidInteraction(e1, e2, structureAnalyzer, forceUndirected=not isDirected):
            makeExample = False
            self.exampleStats.filter("auto_limits")
        if self.styles["genia_task1"] and (e1.get("type") == "Entity" or e2.get("type") == "Entity"):
            makeExample = False
            self.exampleStats.filter("genia_task1")
        if self.styles["pos_only"] and categoryName == "neg":
            makeExample = False
            self.exampleStats.filter("pos_only")
        if self.styles["no_self_loops"] and ((e1 == e2) or (e1.get("headOffset") == e2.get("headOffset"))):
            makeExample = False
            self.exampleStats.filter("no_self_loops")
        return makeExample
    
    def getExampleCategoryName(self, e1=None, e2=None, t1=None, t2=None, sentenceGraph=None, goldGraph=None, entityToGold=None, isDirected=True, structureAnalyzer=None):
        if self.styles["token_nodes"]:
            categoryName = self.getCategoryNameFromTokens(sentenceGraph, t1, t2, isDirected)
        else:
            categoryName = self.getCategoryName(sentenceGraph, e1, e2, isDirected)
            if goldGraph != None:
                categoryName = self.getGoldCategoryName(goldGraph, entityToGold, e1, e2, isDirected)
        if self.styles["filter_types"] != None and categoryName in self.styles["filter_types"]:
            categoryName = "neg"
        if self.styles["se10t8_undirected"]:
            assert e1.get("id").endswith(".e1")
            assert e2.get("id").endswith(".e2")
        #if self.styles["sdb_merge"]:
        #    categoryName = self.mergeForSeeDev(categoryName, structureAnalyzer)
        return categoryName
                
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph = None, structureAnalyzer=None):
        """
        Build examples for a single sentence. Returns a list of examples.
        See Core/ExampleUtils for example format.
        """
        #examples = []
        exampleIndex = 0
        # example directionality
        if self.styles["directed"] == None and self.styles["undirected"] == None: # determine directedness from corpus
            examplesAreDirected = structureAnalyzer.hasDirectedTargets() if structureAnalyzer != None else True
        elif self.styles["directed"]:
            assert self.styles["undirected"] in [None, False]
            examplesAreDirected = True
        elif self.styles["undirected"]:
            assert self.styles["directed"] in [None, False]
            examplesAreDirected = False
        
        if not self.styles["no_trigger_features"]: 
            self.triggerFeatureBuilder.initSentence(sentenceGraph)
        if self.styles["evex"]: 
            self.evexFeatureBuilder.initSentence(sentenceGraph)
#         if self.styles["sdb_merge"]:
#             self.determineNonOverlappingTypes(structureAnalyzer)
            
        # Filter entities, if needed
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        entityToDuplicates = sentenceGraph.mergedEntityToDuplicates
        self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
        
        # Connect to optional gold graph
        entityToGold = None
        if goldGraph != None:
            entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)
        
        paths = None
        if not self.styles["no_path"]:
            undirected = sentenceGraph.dependencyGraph.toUndirected()
            paths = undirected
            if self.styles["filter_shortest_path"] != None: # For DDI use filter_shortest_path=conj_and
                paths.resetAnalyses() # just in case
                paths.FloydWarshall(self.filterEdge, {"edgeTypes":self.styles["filter_shortest_path"]})
        
        # Generate examples based on interactions between entities or interactions between tokens
        if self.styles["token_nodes"]:
            loopRange = len(sentenceGraph.tokens)
        else:
            loopRange = len(entities)
        for i in range(loopRange-1):
            for j in range(i+1,loopRange):
                eI = None
                eJ = None
                if self.styles["token_nodes"]:
                    tI = sentenceGraph.tokens[i]
                    tJ = sentenceGraph.tokens[j]
                else:
                    eI = entities[i]
                    eJ = entities[j]
                    tI = sentenceGraph.entityHeadTokenByEntity[eI]
                    tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                    if eI.get("type") == "neg" or eJ.get("type") == "neg":
                        continue
                    if self.styles["skip_extra_triggers"]:
                        if eI.get("source") != None or eJ.get("source") != None:
                            continue
                # only consider paths between entities (NOTE! entities, not only named entities)
                if self.styles["headsOnly"]:
                    if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                        continue
                
                examples = self.buildExamplesForPair(tI, tJ, paths, sentenceGraph, goldGraph, entityToGold, eI, eJ, structureAnalyzer, examplesAreDirected)
                for categoryName, features, extra in examples:
                    # make example
                    if self.styles["binary"]:
                        if categoryName != "neg":
                            category = 1
                        else:
                            category = -1
                        extra["categoryName"] = "i"
                    else:
                        category = self.classSet.getId(categoryName)
                    example = [sentenceGraph.getSentenceId()+".x"+str(exampleIndex), category, features, extra]
                    ExampleUtils.appendExamples([example], outfile)
                    exampleIndex += 1

        return exampleIndex
    
    def buildExamplesForPair(self, token1, token2, paths, sentenceGraph, goldGraph, entityToGold, entity1=None, entity2=None, structureAnalyzer=None, isDirected=True):
        # define forward
        categoryName = self.getExampleCategoryName(entity1, entity2, token1, token2, sentenceGraph, goldGraph, entityToGold, isDirected, structureAnalyzer=structureAnalyzer)
        # make forward
        forwardExample = None
        self.exampleStats.beginExample(categoryName)
        if self.keepExample(entity1, entity2, categoryName, isDirected, structureAnalyzer):
            forwardExample = self.buildExample(token1, token2, paths, sentenceGraph, categoryName, entity1, entity2, structureAnalyzer, isDirected)
        
        if isDirected: # build a separate reverse example (if that is valid)
            self.exampleStats.endExample() # end forward example
            # define reverse
            categoryName = self.getExampleCategoryName(entity2, entity1, token2, token1, sentenceGraph, goldGraph, entityToGold, True, structureAnalyzer=structureAnalyzer)
            # make reverse
            self.exampleStats.beginExample(categoryName)
            reverseExample = None
            if self.keepExample(entity2, entity1, categoryName, True, structureAnalyzer):
                reverseExample = self.buildExample(token2, token1, paths, sentenceGraph, categoryName, entity2, entity1, structureAnalyzer, isDirected)
            self.exampleStats.endExample()
            return filter(None, [forwardExample, reverseExample])
        elif self.styles["se10t8_undirected"]: # undirected example with a directed type
            self.exampleStats.endExample()
            return [forwardExample]
        elif forwardExample != None: # merge features from the reverse example to the forward one
            reverseExample = self.buildExample(token2, token1, paths, sentenceGraph, categoryName, entity2, entity1, structureAnalyzer, isDirected)
            forwardExample[1].update(reverseExample[1])
            self.exampleStats.endExample() # end merged example
            return [forwardExample]
        else: # undirected example that was filtered
            self.exampleStats.endExample() # end merged example
            return []
    
    def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, entity1=None, entity2=None, structureAnalyzer=None, isDirected=True):
        """
        Build a single directed example for the potential edge between token1 and token2
        """
        # define features
        if not self.styles["no_path"]:
            path = paths.getPaths(token1, token2)
            if len(path) > 0:
                path = path[0]
                #pathExists = True
            else:
                path = [token1, token2]
                #pathExists = False
        else:
            path = [token1, token2]
            #pathExists = False
        
        features = {}
        if not self.styles["no_features"]:
            features = self.buildFeatures(sentenceGraph, entity1, entity2, token1, token2, path)
        
        # define extra attributes
        if int(path[0].get("charOffset").split("-")[0]) < int(path[-1].get("charOffset").split("-")[0]):
            extra = {"xtype":"edge","type":"i","t1":path[0].get("id"),"t2":path[-1].get("id")}
            extra["deprev"] = False
        else:
            extra = {"xtype":"edge","type":"i","t1":path[-1].get("id"),"t2":path[0].get("id")}
            extra["deprev"] = True
        if entity1 != None:
            extra["e1"] = entity1.get("id")
            if sentenceGraph.mergedEntityToDuplicates != None:
                extra["e1DuplicateIds"] = ",".join([x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity1]])
        if entity2 != None:
            extra["e2"] = entity2.get("id")
            if sentenceGraph.mergedEntityToDuplicates != None:
                extra["e2DuplicateIds"] = ",".join([x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity2]])
        extra["categoryName"] = categoryName
        if self.styles["bacteria_renaming"]:
            if entity1.get("text") != None and entity1.get("text") != "":
                extra["e1t"] = entity1.get("text").replace(" ", "---").replace(":","-COL-")
            if entity2.get("text") != None and entity2.get("text") != "":
                extra["e2t"] = entity2.get("text").replace(" ", "---").replace(":","-COL-")
        if self.styles["doc_extra"]:
            if hasattr(sentenceGraph, "documentElement") and sentenceGraph.documentElement.get("origId") != None:
                extra["DOID"] = sentenceGraph.documentElement.get("origId")
        if self.styles["entity_extra"]:
            if entity1.get("origId") != None: extra["e1OID"] = entity1.get("origId")
            if entity2.get("origId") != None: extra["e2OID"] = entity2.get("origId")
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId != None:
            extra["SOID"] = sentenceOrigId 
        extra["directed"] = str(isDirected)
        if self.styles["sdb_merge"]:
            extra["sdb_merge"] = "True"
            #print extra
        
        return (categoryName, features, extra)
        
    
    def buildFeatures(self, sentenceGraph, entity1, entity2, token1, token2, path):
        features = {} 
        if not self.styles["no_trigger_features"]: # F 85.52 -> 85.55
            self.triggerFeatureBuilder.setFeatureVector(features)
            self.triggerFeatureBuilder.tag = "trg1_"
            self.triggerFeatureBuilder.buildFeatures(token1)
            self.triggerFeatureBuilder.tag = "trg2_"
            self.triggerFeatureBuilder.buildFeatures(token2)
            self.triggerFeatureBuilder.setFeatureVector(None)
        # REL features
        if self.styles["rel_features"] and not self.styles["no_task"]:
            self.relFeatureBuilder.setFeatureVector(features)
            self.relFeatureBuilder.tag = "rel1_"
            self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token1))
            self.relFeatureBuilder.tag = "rel2_"
            self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token2))
            self.relFeatureBuilder.setFeatureVector(None)
        if self.styles["bacteria_renaming"] and not self.styles["no_task"]:
            self.bacteriaRenamingFeatureBuilder.setFeatureVector(features)
            self.bacteriaRenamingFeatureBuilder.buildPairFeatures(entity1, entity2)
            #self.bacteriaRenamingFeatureBuilder.buildSubstringFeatures(entity1, entity2) # decreases perf. 74.76 -> 72.41
            self.bacteriaRenamingFeatureBuilder.setFeatureVector(None)
        if self.styles["co_features"] and not self.styles["no_task"]:
            e1Offset = Range.charOffsetToSingleTuple(entity1.get("charOffset"))
            e2Offset = Range.charOffsetToSingleTuple(entity2.get("charOffset"))
            if Range.contains(e1Offset, e2Offset):
                features[self.featureSet.getId("e1_contains_e2")] = 1
                if entity2.get("given") == "True":
                    features[self.featureSet.getId("e1_contains_e2name")] = 1
            if Range.contains(e2Offset, e1Offset):
                features[self.featureSet.getId("e2_contains_e1")] = 1
                if entity1.get("given") == "True":
                    features[self.featureSet.getId("e2_contains_e1name")] = 1
        if self.styles["drugbank_features"]:
            self.drugFeatureBuilder.setFeatureVector(features)
            self.drugFeatureBuilder.tag = "ddi_"
            self.drugFeatureBuilder.buildPairFeatures(entity1, entity2)  
            if self.styles["ddi_mtmx"]:
                self.drugFeatureBuilder.buildMTMXFeatures(entity1, entity2)
            self.drugFeatureBuilder.setFeatureVector(None)
        if self.styles["graph_kernel"]:
            self.graphKernelFeatureBuilder.setFeatureVector(features, entity1, entity2)
            self.graphKernelFeatureBuilder.buildGraphKernelFeatures(sentenceGraph, path)
            self.graphKernelFeatureBuilder.setFeatureVector(None)
        if self.styles["entity_type"]:
            e1Type = self.multiEdgeFeatureBuilder.getEntityType(entity1)
            e2Type = self.multiEdgeFeatureBuilder.getEntityType(entity2)
            features[self.featureSet.getId("e1_"+e1Type)] = 1
            features[self.featureSet.getId("e2_"+e2Type)] = 1
            features[self.featureSet.getId("distance_"+str(len(path)))] = 1
        if not self.styles["no_dependency"]:
            #print "Dep features"
            self.multiEdgeFeatureBuilder.setFeatureVector(features, entity1, entity2)
            #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast
            if not self.styles["disable_entity_features"]:
                self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
            self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
            if not self.styles["disable_terminus_features"]:
                self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast
            if not self.styles["disable_single_element_features"]:
                self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, sentenceGraph)
            if not self.styles["disable_ngram_features"]:
                #print "NGrams"
                self.multiEdgeFeatureBuilder.buildPathGrams(2, path, sentenceGraph) # remove for fast
                self.multiEdgeFeatureBuilder.buildPathGrams(3, path, sentenceGraph) # remove for fast
                self.multiEdgeFeatureBuilder.buildPathGrams(4, path, sentenceGraph) # remove for fast
            #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast
            #if edges != None:
            #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast
            #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast
            if not self.styles["disable_path_edge_features"]:
                self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, sentenceGraph)
            self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
            self.multiEdgeFeatureBuilder.setFeatureVector(None)
        if self.styles["nodalida"]:
            self.nodalidaFeatureBuilder.setFeatureVector(features, entity1, entity2)
            shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(sentenceGraph.dependencyGraph, path)
            print shortestPaths
            if len(shortestPaths) > 0:
                self.nodalidaFeatureBuilder.buildNGrams(shortestPaths, sentenceGraph)
            self.nodalidaFeatureBuilder.setFeatureVector(None)
        if self.styles["linear_features"]:
            self.tokenFeatureBuilder.setFeatureVector(features)
            for i in range(len(sentenceGraph.tokens)):
                if sentenceGraph.tokens[i] == token1:
                    token1Index = i
                if sentenceGraph.tokens[i] == token2:
                    token2Index = i
            linearPreTag = "linfw_"
            if token1Index > token2Index: 
                token1Index, token2Index = token2Index, token1Index
                linearPreTag = "linrv_"
            self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1")
            self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2")
            # Before, middle, after
#                self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf")
#                self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw")
#                self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af")
            # before-middle, middle, middle-after
#                    self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2)
#                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2)
#                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2)
            self.tokenFeatureBuilder.setFeatureVector(None)
        if self.styles["random"]:
            self.randomFeatureBuilder.setFeatureVector(features)
            self.randomFeatureBuilder.buildRandomFeatures(100, 0.01)
            self.randomFeatureBuilder.setFeatureVector(None)
        if self.styles["genia_features"] and not self.styles["no_task"]:
            e1Type = entity1.get("type")
            e2Type = entity2.get("type")
            assert(entity1.get("given") in (None, "False"))
            if entity2.get("given") == "True":
                features[self.featureSet.getId("GENIA_target_protein")] = 1
            else:
                features[self.featureSet.getId("GENIA_nested_event")] = 1
            if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization
                if entity2.get("given") == "True":
                    features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1
                else:
                    features[self.featureSet.getId("GENIA_regulation_of_event")] = 1
        if self.styles["bi_features"]:
            # Make features based on entity types
            e1Type = entity1.get("type")
            e2Type = entity2.get("type")
            e1SuperType = str(self.getBISuperType(e1Type))
            e2SuperType = str(self.getBISuperType(e2Type))
            features[self.featureSet.getId("BI_e1_"+e1Type)] = 1
            features[self.featureSet.getId("BI_e2_"+e2Type)] = 1
            features[self.featureSet.getId("BI_e1sup_"+e1SuperType)] = 1
            features[self.featureSet.getId("BI_e2sup_"+e2SuperType)] = 1
            features[self.featureSet.getId("BI_e1e2_"+e1Type+"_"+e2Type)] = 1
            features[self.featureSet.getId("BI_e1e2sup_"+e1SuperType+"_"+e2SuperType)] = 1
        if self.styles["sdb_features"]:
            e1Type = entity1.get("type")
            e2Type = entity2.get("type")
            features[self.featureSet.getId("SDB_e1_"+e1Type)] = 1
            features[self.featureSet.getId("SDB_e2_"+e2Type)] = 1
            features[self.featureSet.getId("SDB_e1e2_"+e1Type+"_"+e2Type)] = 1
            if e1Type == e2Type:
                features[self.featureSet.getId("SDB_e1e2_equal")] = 1
                features[self.featureSet.getId("SDB_e1e2_equal_" + e1Type)] = 1
            e1SuperTypes = str(self.getSeeDevSuperTypes(e1Type))
            e2SuperTypes = str(self.getSeeDevSuperTypes(e2Type))
            for e1SuperType in e1SuperTypes:
                for e2SuperType in e2SuperTypes:
                    features[self.featureSet.getId("SDB_e1sup_"+e1SuperType)] = 1
                    features[self.featureSet.getId("SDB_e2sup_"+e2SuperType)] = 1
                    features[self.featureSet.getId("SDB_e1e2sup_"+e1SuperType+"_"+e2SuperType)] = 1
                    if e1SuperType == e2SuperType:
                        features[self.featureSet.getId("SDB_e1e2sup_equal")] = 1
                        features[self.featureSet.getId("SDB_e1e2sup_equal_" + e1SuperType)] = 1
        if self.styles["ontobiotope_features"]:
            self.ontobiotopeFeatureBuilder.setFeatureVector(features)
            self.ontobiotopeFeatureBuilder.buildOBOFeaturesForEntityPair(entity1, entity2)
            self.ontobiotopeFeatureBuilder.setFeatureVector(None)
        if self.styles["full_entities"]:
            e1Text = entity1.get("text").lower()
            e2Text = entity2.get("text").lower()
            features[self.featureSet.getId("FULL_e1_"+e1Text)] = 1
            features[self.featureSet.getId("FULL_e2_"+e2Text)] = 1
            for ep1 in e1Text.split():
                for ep2 in e2Text.split():
                    features[self.featureSet.getId("FULL_e1_"+ep1)] = 1
                    features[self.featureSet.getId("FULL_e2_"+ep2)] = 1
                    features[self.featureSet.getId("FULL_e1e2_"+ep1+"_"+ep2)] = 1
        if self.styles["evex"]:
            self.evexFeatureBuilder.setFeatureVector(features, entity1, entity2)
            self.evexFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph)
            self.evexFeatureBuilder.setFeatureVector(None)
        if self.styles["wordnet"]:
            self.wordNetFeatureBuilder.setFeatureVector(features, entity1, entity2)
            self.wordNetFeatureBuilder.buildFeaturesForEntityPair(token1, token2)
            self.wordNetFeatureBuilder.buildLinearFeatures(token1, sentenceGraph.tokens, tag="t1_")
            self.wordNetFeatureBuilder.buildLinearFeatures(token2, sentenceGraph.tokens, tag="t2_")
            self.wordNetFeatureBuilder.buildPathFeatures(path)
            self.wordNetFeatureBuilder.setFeatureVector(None)
        if self.styles["wordvector"]:
            self.wordVectorFeatureBuilder.setFeatureVector(features, entity1, entity2)
            self.wordVectorFeatureBuilder.buildFeatures(token1, "t1_")
            self.wordVectorFeatureBuilder.buildFeatures(token2, "t2_")
            self.wordVectorFeatureBuilder.buildLinearFeatures(token1, sentenceGraph.tokens, tag="t1_")
            self.wordVectorFeatureBuilder.buildLinearFeatures(token2, sentenceGraph.tokens, tag="t2_")
            self.wordVectorFeatureBuilder.buildPathFeatures(path)
            self.wordVectorFeatureBuilder.buildFBAFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token1), sentenceGraph.tokens.index(token2))
            self.wordVectorFeatureBuilder.setFeatureVector(None)
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder.setFeatureVector(features, entity1, entity2)
            self.giulianoFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph)
            self.giulianoFeatureBuilder.setFeatureVector(None)
        
        return features
示例#4
0
class EntityExampleBuilder(ExampleBuilder):
    def __init__(self,
                 style=None,
                 classSet=None,
                 featureSet=None,
                 gazetteerFileName=None,
                 skiplist=None):
        if classSet == None:
            classSet = IdSet(1)
        if featureSet == None:
            featureSet = IdSet()

        ExampleBuilder.__init__(self, classSet, featureSet)
        assert (classSet.getId("neg") == 1)
        #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
        if gazetteerFileName != None:
            self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
            print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer = None
        self._setDefaultParameters([
            "rel_features", "wordnet", "bb_features", "giuliano",
            "epi_merge_negated", "limit_merged_types", "genia_task1", "names",
            "build_for_nameless", "skip_for_nameless", "pos_only",
            "all_tokens", "pos_pairs", "linear_ngrams", "phospho",
            "drugbank_features", "ddi13_features", "metamap", "only_types",
            "ontobiotope_features", "bb_spans", "wordvector", "no_context"
        ])
        self.styles = self.getParameters(style)
        #        if "selftrain_group" in self.styles:
        #            self.selfTrainGroups = set()
        #            if "selftrain_group-1" in self.styles:
        #                self.selfTrainGroups.add("-1")
        #            if "selftrain_group0" in self.styles:
        #                self.selfTrainGroups.add("0")
        #            if "selftrain_group1" in self.styles:
        #                self.selfTrainGroups.add("1")
        #            if "selftrain_group2" in self.styles:
        #                self.selfTrainGroups.add("2")
        #            if "selftrain_group3" in self.styles:
        #                self.selfTrainGroups.add("3")
        #            print >> sys.stderr, "Self-train-groups:", self.selfTrainGroups

        self.skiplist = set()
        if skiplist != None:
            f = open(skiplist, "rt")
            for line in f.readlines():
                self.skiplist.add(line.strip())
            f.close()

        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(featureSet)
        if self.styles["wordnet"]:
            self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
        if self.styles["bb_features"]:
            self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens(
            )
            #self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens(PhraseTriggerExampleBuilder.getBacteriaNames())
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
        if self.styles["drugbank_features"]:
            self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
        if self.styles["ontobiotope_features"]:
            self.ontobiotopeFeatureBuilder = OntoBiotopeFeatureBuilder(
                self.featureSet)
        if self.styles["wordvector"]:
            self.wordVectorFeatureBuilder = WordVectorFeatureBuilder(
                featureSet, self.styles)

    def getMergedEntityType(self, entities):
        """
        If a single token belongs to multiple entities of different types,
        a new, composite type is defined. This type is the alphabetically
        ordered types of these entities joined with '---'.
        """
        types = set()
        entityIds = set()
        limitTypes = self.styles.get("only_types")
        for entity in entities:
            eType = entity.get("type")
            if entity.get("given") == "True" and self.styles["all_tokens"]:
                continue
            if eType == "Entity" and self.styles["genia_task1"]:
                continue
            if limitTypes and eType not in limitTypes:
                continue
            if self.styles["epi_merge_negated"]:
                types.add(
                    Utils.InteractionXML.ResolveEPITriggerTypes.getEPIBaseType(
                        eType))
                entityIds.add(entity.get("id"))
            else:
                types.add(eType)
                entityIds.add(entity.get("id"))
        types = list(types)
        types.sort()
        typeString = ""
        for type in types:
            #if type == "Protein" and "all_tokens" in self.styles:
            #    continue
            if typeString != "":
                typeString += "---"
            typeString += type

        if typeString == "":
            return "neg", None

        idString = "/".join(sorted(list(entityIds)))

        if self.styles["limit_merged_types"]:
            if typeString.find("---") != -1:
                if typeString == "Gene_expression---Positive_regulation":
                    return typeString, idString
                else:
                    return typeString.split(
                        "---")[0], idString  # ids partially incorrect
            else:
                return typeString, idString
        return typeString, idString

    def getMetaMapFeatures(self, token, sentenceGraph, features):
        analyses = sentenceGraph.sentenceElement.find("analyses")
        if analyses == None:
            return
        metamap = analyses.find("metamap")
        if metamap == None:
            return
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        skipAttr = set(["charOffset", "text"])
        for phrase in metamap.findall("phrase"):
            phraseOffset = Range.charOffsetToSingleTuple(
                phrase.get("charOffset"))
            if Range.overlap(tokenOffset, phraseOffset):
                attr = phrase.attrib
                attrNames = sorted(attr.keys())
                for attrName in attrNames:
                    if attrName in skipAttr:
                        continue
                    elif attrName == "score":
                        features["_metamap_score"] = 0.001 * abs(
                            int(attr[attrName]))
                    else:
                        attrValues = attr[attrName].split(",")
                        for attrValue in attrValues:
                            features["_metamap_" + attrName + "_" +
                                     attrValue.replace(" ", "-")] = 1

    def getTokenFeatures(self, token, sentenceGraph):
        """
        Returns a list of features based on the attributes of a token.
        These can be used to define more complex features.
        """
        # These features are cached when this method is first called
        # for a token.
        if self.tokenFeatures.has_key(token):
            return self.tokenFeatures[token], self.tokenFeatureWeights[token]
        tokTxt = sentenceGraph.getTokenText(token)
        features = {}
        features["_txt_" + tokTxt] = 1
        features["_POS_" + token.get("POS")] = 1
        if sentenceGraph.tokenIsName[token] and not self.styles["names"]:
            features["_given"] = 1
            for entity in sentenceGraph.tokenIsEntityHead[token]:
                if entity.get("given") == "True":
                    features["_annType_" + entity.get("type")] = 1
        if self.styles["metamap"]:
            self.getMetaMapFeatures(token, sentenceGraph, features)
#        # Filip's gazetteer based features (can be used separately from exclude_gazetteer)
#        if "gazetteer_features" in self.styles:
#            tokTxtLower = tokTxt.lower()
#            if "stem_gazetteer" in self.styles:
#                tokTxtLower = PorterStemmer.stem(tokTxtLower)
#            if self.gazetteer and tokTxtLower in self.gazetteer:
#                for label,weight in self.gazetteer[tokTxtLower].items():
#                    features["_knownLabel_"+label]=weight # 1 performs slightly worse
## BANNER features
#if sentenceGraph.entityHintsByToken.has_key(token):
#    features["BANNER-entity"] = 1
# Wordnet features
#if "wordnet" in self.styles:
#    for wordNetFeature in self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, token.get("POS")):
#        features["_WN_"+wordNetFeature] = 1
        self.tokenFeatures[token] = sorted(features.keys())
        self.tokenFeatureWeights[token] = features
        return self.tokenFeatures[token], self.tokenFeatureWeights[token]

    def buildLinearOrderFeatures(self, sentenceGraph, index, tag, features):
        """
        Linear features are built by marking token features with a tag
        that defines their relative position in the linear order.
        """
        tag = "linear_" + tag
        tokenFeatures, tokenFeatureWeights = self.getTokenFeatures(
            sentenceGraph.tokens[index], sentenceGraph)
        for tokenFeature in tokenFeatures:
            features[self.featureSet.getId(
                tag + tokenFeature)] = tokenFeatureWeights[tokenFeature]

    def buildLinearNGram(self, i, j, sentenceGraph, features):
        ngram = "ngram"
        for index in range(i, j + 1):
            ngram += "_" + sentenceGraph.getTokenText(
                sentenceGraph.tokens[index]).lower()
        features[self.featureSet.getId(ngram)] = 1

    def buildExamplesFromGraph(self,
                               sentenceGraph,
                               outfile,
                               goldGraph=None,
                               structureAnalyzer=None):
        """
        Build one example for each token of the sentence
        """
        if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
            print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get(
                "origId")
            return 0  #[]

        #examples = []
        exampleIndex = 0

        self.tokenFeatures = {}
        self.tokenFeatureWeights = {}

        # determine (manually or automatically) the setting for whether sentences with no given entities should be skipped
        buildForNameless = False
        if structureAnalyzer and not structureAnalyzer.hasGroupClass(
                "GIVEN", "ENTITY"
        ):  # no given entities points to no separate NER program being used
            buildForNameless = True
        if self.styles["build_for_nameless"]:  # manually force the setting
            buildForNameless = True
        if self.styles["skip_for_nameless"]:  # manually force the setting
            buildForNameless = False

        # determine whether sentences with no given entities should be skipped
        namedEntityHeadTokens = []
        if not self.styles["names"]:
            namedEntityCount = 0
            for entity in sentenceGraph.entities:
                if entity.get(
                        "given"
                ) == "True":  # known data which can be used for features
                    namedEntityCount += 1
            namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
            # NOTE!!! This will change the number of examples and omit
            # all triggers (positive and negative) from sentences which
            # have no NE:s, possibly giving a too-optimistic performance
            # value. Such sentences can still have triggers from intersentence
            # interactions, but as such events cannot be recovered anyway,
            # looking for these triggers would be pointless.
            if namedEntityCount == 0 and not buildForNameless:  # no names, no need for triggers
                return 0  #[]

            if self.styles["pos_pairs"]:
                namedEntityHeadTokens = self.getNamedEntityHeadTokens(
                    sentenceGraph)
        else:
            for key in sentenceGraph.tokenIsName.keys():
                sentenceGraph.tokenIsName[key] = False

        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
        bowFeatures = {}
        for k in sorted(bagOfWords.keys()):
            bowFeatures[self.featureSet.getId(k)] = bagOfWords[k]

        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            #inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
            #fixedInEdges = []
            #for edge in inEdges:
            #    fixedInEdges.append( (edge[0], edge[1], edge[2]["element"]) )
            #inEdges = fixedInEdges
            inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
            #inEdges.sort(compareDependencyEdgesById)
            self.inEdgesByToken[token] = inEdges
            #outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True)
            #fixedOutEdges = []
            #for edge in outEdges:
            #    fixedOutEdges.append( (edge[0], edge[1], edge[2]["element"]) )
            #outEdges = fixedOutEdges
            outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
            #outEdges.sort(compareDependencyEdgesById)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)

        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]

            # CLASS
            if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
                categoryName, entityIds = self.getMergedEntityType(
                    sentenceGraph.tokenIsEntityHead[token])
            else:
                categoryName, entityIds = "neg", None
            self.exampleStats.beginExample(categoryName)

            # Recognize only non-named entities (i.e. interaction words)
            if sentenceGraph.tokenIsName[token] and not self.styles[
                    "names"] and not self.styles["all_tokens"]:
                self.exampleStats.filter("name")
                self.exampleStats.endExample()
                continue
#            if "selftrain_limits" in self.styles:
#                # any predicted entity not part of the self-training set causes example to be rejected
#                filtered = False
#                for entity in sentenceGraph.tokenIsEntityHead[token]:
#                    if entity.get("selftrain") == "False":
#                        self.exampleStats.filter("selftrain_limits")
#                        self.exampleStats.endExample()
#                        filtered = True
#                        break
#                if filtered:
#                    continue
#            if "selftrain_group" in self.styles:
#                # any predicted entity not part of the self-training set causes example to be rejected
#                filtered = False
#                for entity in sentenceGraph.tokenIsEntityHead[token]:
#                    if entity.get("selftraingroup") not in self.selfTrainGroups:
#                        self.exampleStats.filter("selftrain_group")
#                        self.exampleStats.endExample()
#                        filtered = True
#                        break
#                if filtered:
#                    continue
            if self.styles["pos_only"] and categoryName == "neg":
                self.exampleStats.filter("pos_only")
                self.exampleStats.endExample()
                continue

            category = self.classSet.getId(categoryName)
            if category == None:
                self.exampleStats.filter("undefined_class")
                self.exampleStats.endExample()
                continue

            tokenText = token.get("text").lower()
            #            if "stem_gazetteer" in self.styles:
            #                tokenText = PorterStemmer.stem(tokenText)
            #            if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer:
            #                features = {}
            #                features[self.featureSet.getId("exclude_gazetteer")] = 1
            #                extra = {"xtype":"token","t":token.get("id"),"excluded":"True"}
            #                if entityIds != None:
            #                    extra["goldIds"] = entityIds
            #                #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
            #                ExampleUtils.appendExamples([(sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)], outfile)
            #                exampleIndex += 1
            #                continue

            # FEATURES
            features = {}

            if not self.styles["names"]:
                features[self.featureSet.getId(namedEntityCountFeature)] = 1
            #for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # pre-calculate bow _features_
            features.update(bowFeatures)

            #            for j in range(len(sentenceGraph.tokens)):
            #                text = "bow_" + sentenceGraph.tokens[j].get("text")
            #                if j < i:
            #                    features[self.featureSet.getId("bf_" + text)] = 1
            #                elif j > i:
            #                    features[self.featureSet.getId("af_" + text)] = 1

            # Main features
            text = token.get("text")
            features[self.featureSet.getId("txt_" + text)] = 1
            features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_" + stem)] = 1
            features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1

            # Normalized versions of the string (if same as non-normalized, overlap without effect)
            normalizedText = text.replace("-", "").replace("/", "").replace(
                ",", "").replace("\\", "").replace(" ", "").lower()
            if normalizedText == "bound":  # should be for all irregular verbs
                normalizedText = "bind"
            features[self.featureSet.getId("txt_" + normalizedText)] = 1
            norStem = PorterStemmer.stem(normalizedText)
            features[self.featureSet.getId("stem_" + norStem)] = 1
            features[self.featureSet.getId("nonstem_" +
                                           normalizedText[len(norStem):])] = 1

            ## Subspan features
            #textLower = text.lower()
            #for i in range(1, len(textLower)):
            #    features[self.featureSet.getId("subspanbegin"+str(i)+"_"+textLower[0:i])] = 1
            #    features[self.featureSet.getId("subspanend"+str(i)+"_"+textLower[-i:])] = 1

            # Substring features
            for string in text.split("-"):
                stringLower = string.lower()
                features[self.featureSet.getId("substring_" + stringLower)] = 1
                features[self.featureSet.getId(
                    "substringstem_" + PorterStemmer.stem(stringLower))] = 1

            if not self.styles["no_context"]:
                # Linear order features
                for index in [-3, -2, -1, 1, 2, 3]:
                    if i + index > 0 and i + index < len(sentenceGraph.tokens):
                        self.buildLinearOrderFeatures(sentenceGraph, i + index,
                                                      str(index), features)

                # Linear n-grams
                if self.styles["linear_ngrams"]:
                    self.buildLinearNGram(max(0, i - 1), i, sentenceGraph,
                                          features)
                    self.buildLinearNGram(max(0, i - 2), i, sentenceGraph,
                                          features)

            if self.styles["phospho"]:
                if text.find("hospho") != -1:
                    features[self.featureSet.getId("phospho_found")] = 1
                features[self.featureSet.getId("begin_" +
                                               text[0:2].lower())] = 1
                features[self.featureSet.getId("begin_" +
                                               text[0:3].lower())] = 1

            if self.styles["bb_features"]:
                if text.lower() in self.bacteriaTokens:
                    features[self.featureSet.getId("lpsnBacToken")] = 1

            # Content
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j - 1] == "-":
                        features[self.featureSet.getId(
                            "has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_" +
                                                   text[j - 1:j +
                                                        1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_" +
                                                   text[j - 2:j +
                                                        1].lower())] = 1
                # quadruplets (don't work, slight decrease (0.5 pp) on f-score
                #if j > 2:
                #    features[self.featureSet.getId("qt_"+text[j-3:j+1].lower())] = 1

            # Attached edges (Hanging in and out edges)
            if not self.styles["no_context"]:
                t1InEdges = self.inEdgesByToken[token]
                for edge in t1InEdges:
                    edgeType = edge[2].get("type")
                    features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
                    features[self.featureSet.getId("t1HIn_" +
                                                   edge[0].get("POS"))] = 1
                    features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                                   edge[0].get("POS"))] = 1
                    tokenText = sentenceGraph.getTokenText(edge[0])
                    features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
                    features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                                   tokenText)] = 1
                    tokenStem = PorterStemmer.stem(tokenText)
                    features[self.featureSet.getId("t1HIn_" + tokenStem)] = 1
                    features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                                   tokenStem)] = 1
                    features[self.featureSet.getId("t1HIn_" + norStem + "_" +
                                                   edgeType + "_" +
                                                   tokenStem)] = 1
                t1OutEdges = self.outEdgesByToken[token]
                for edge in t1OutEdges:
                    edgeType = edge[2].get("type")
                    features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
                    features[self.featureSet.getId("t1HOut_" +
                                                   edge[1].get("POS"))] = 1
                    features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                                   edge[1].get("POS"))] = 1
                    tokenText = sentenceGraph.getTokenText(edge[1])
                    features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
                    features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                                   tokenText)] = 1
                    tokenStem = PorterStemmer.stem(tokenText)
                    features[self.featureSet.getId("t1HOut_" + tokenStem)] = 1
                    features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                                   tokenStem)] = 1
                    features[self.featureSet.getId("t1HOut_" + norStem + "_" +
                                                   edgeType + "_" +
                                                   tokenStem)] = 1

            # REL features
            if self.styles["rel_features"]:
                self.relFeatureBuilder.setFeatureVector(features)
                self.relFeatureBuilder.buildAllFeatures(
                    sentenceGraph.tokens, i)
                self.relFeatureBuilder.setFeatureVector(None)

            # DDI13 features
            if self.styles["ddi13_features"]:
                for index in range(len(normalizedText)):
                    features[self.featureSet.getId("ddi13_fromstart" +
                                                   str(index) + "_" +
                                                   normalizedText[:index +
                                                                  1])] = 1
                    features[self.featureSet.getId("ddi13_fromend" +
                                                   str(index) + "_" +
                                                   normalizedText[index:])] = 1
            if self.styles["drugbank_features"]:
                self.drugFeatureBuilder.setFeatureVector(features)
                self.drugFeatureBuilder.tag = "ddi_"
                self.drugFeatureBuilder.buildDrugFeatures(token)
                self.drugFeatureBuilder.setFeatureVector(None)

            #self.wordNetFeatureBuilder.getTokenFeatures("show", "VBP")
            #tokTxt = token.get("text")
            #tokPOS = token.get("POS")
            #wordNetFeatures = []
            #wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            #self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            if self.styles["wordnet"]:
                tokTxt = token.get("text")
                tokPOS = token.get("POS")
                wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(
                    tokTxt, tokPOS)
                for wordNetFeature in wordNetFeatures:
                    #print wordNetFeature,
                    features[self.featureSet.getId("WN_" + wordNetFeature)] = 1
                #print

            if self.styles["giuliano"]:
                self.giulianoFeatureBuilder.setFeatureVector(features)
                self.giulianoFeatureBuilder.buildTriggerFeatures(
                    token, sentenceGraph)
                self.giulianoFeatureBuilder.setFeatureVector(None)

            if self.styles["ontobiotope_features"]:
                self.ontobiotopeFeatureBuilder.setFeatureVector(features)
                self.ontobiotopeFeatureBuilder.buildOBOFeaturesForToken(token)
                self.ontobiotopeFeatureBuilder.setFeatureVector(None)

            extra = {"xtype": "token", "t": token.get("id")}
            if self.styles["bb_features"]:
                extra[
                    "trigex"] = "bb"  # Request trigger extension in ExampleWriter
            if self.styles["epi_merge_negated"]:
                extra["unmergeneg"] = "epi"  # Request trigger type unmerging
            if entityIds != None:
                extra[
                    "goldIds"] = entityIds  # The entities to which this example corresponds
            #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )

            if self.styles["bb_spans"]:
                for span in sentenceGraph.sentenceElement.iter("span"):
                    if span.get("headOffset") != token.get("charOffset"):
                        continue
                    #if span.get("source") != "spec":
                    #    continue
                    #print span.get("headOffset"), token.get("charOffset"), span.get("source"), token.get("id")
                    features[self.featureSet.getId("span_found")] = 1
                    features[self.featureSet.getId(
                        "span_count")] = 1 + features.get(
                            self.featureSet.getId("span_count"), 0)
                    features[self.featureSet.getId("span_identifier" +
                                                   span.get("identifier"))] = 1
                    features[self.featureSet.getId("span_type" +
                                                   span.get("type"))] = 1
                    features[self.featureSet.getId("span_category" +
                                                   span.get("category"))] = 1
                    features[self.featureSet.getId("span_source" +
                                                   span.get("source"))] = 1

                    if "define_offset" in extra:
                        prevOffset = [
                            int(x) for x in extra["define_offset"].split("-")
                        ]
                        assert len(prevOffset) == 2
                        newOffset = [
                            int(x) for x in span.get("charOffset").split("-")
                        ]
                        assert len(newOffset) == 2
                        prevOffsetRange = abs(prevOffset[0] - prevOffset[1])
                        newOffsetRange = abs(newOffset[0] - newOffset[1])
                        if newOffsetRange > prevOffsetRange:
                            extra["define_offset"] = span.get("charOffset")
                    else:
                        extra["define_offset"] = span.get("charOffset")
                features[self.featureSet.getId("span_count_" + str(
                    features.get(self.featureSet.getId("span_count"), 0)))] = 1

            # chains
            if not self.styles["no_context"]:
                self.buildChains(token, sentenceGraph, features)

            if self.styles["pos_pairs"]:
                self.buildPOSPairs(token, namedEntityHeadTokens, features)

            if self.styles["wordvector"]:
                self.wordVectorFeatureBuilder.setFeatureVector(features)
                self.wordVectorFeatureBuilder.buildFeatures(token)
                self.wordVectorFeatureBuilder.setFeatureVector(None)

            example = (sentenceGraph.getSentenceId() + ".x" +
                       str(exampleIndex), category, features, extra)
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1
            self.exampleStats.endExample()
        #return examples
        return exampleIndex

    def buildChains(self,
                    token,
                    sentenceGraph,
                    features,
                    depthLeft=3,
                    chain="",
                    visited=None):
        if depthLeft == 0:
            return
        strDepthLeft = "dist_" + str(depthLeft)

        if visited == None:
            visited = set()

        inEdges = self.inEdgesByToken[token]
        outEdges = self.outEdgesByToken[token]
        edgeSet = visited.union(self.edgeSetByToken[token])
        for edge in inEdges:
            if not edge in visited:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("dep_" + strDepthLeft +
                                               edgeType)] = 1

                nextToken = edge[0]
                tokenFeatures, tokenWeights = self.getTokenFeatures(
                    nextToken, sentenceGraph)
                for tokenFeature in tokenFeatures:
                    features[self.featureSet.getId(
                        strDepthLeft +
                        tokenFeature)] = tokenWeights[tokenFeature]
#                for entity in sentenceGraph.tokenIsEntityHead[nextToken]:
#                    if entity.get("given") == "True":
#                        features[self.featureSet.getId("name_dist_"+strDepthLeft)] = 1
#                        features[self.featureSet.getId("name_dist_"+strDepthLeft+entity.get("type"))] = 1
#                features[self.featureSet.getId("POS_dist_"+strDepthLeft+nextToken.get("POS"))] = 1
#                tokenText = sentenceGraph.getTokenText(nextToken)
#                features[self.featureSet.getId("text_dist_"+strDepthLeft+tokenText)] = 1

                if sentenceGraph.tokenIsName[
                        nextToken] and not self.styles["names"]:
                    features[self.featureSet.getId("name_chain_dist_" +
                                                   strDepthLeft + chain +
                                                   "-frw_" + edgeType)] = 1
                features[self.featureSet.getId("chain_dist_" + strDepthLeft +
                                               chain + "-frw_" + edgeType)] = 1
                self.buildChains(nextToken, sentenceGraph, features,
                                 depthLeft - 1, chain + "-frw_" + edgeType,
                                 edgeSet)

        for edge in outEdges:
            if not edge in visited:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("dep_dist_" + strDepthLeft +
                                               edgeType)] = 1

                nextToken = edge[1]
                tokenFeatures, tokenWeights = self.getTokenFeatures(
                    nextToken, sentenceGraph)
                for tokenFeature in tokenFeatures:
                    features[self.featureSet.getId(
                        strDepthLeft +
                        tokenFeature)] = tokenWeights[tokenFeature]
#                for entity in sentenceGraph.tokenIsEntityHead[nextToken]:
#                    if entity.get("given") == "True":
#                        features[self.featureSet.getId("name_dist_"+strDepthLeft)] = 1
#                        features[self.featureSet.getId("name_dist_"+strDepthLeft+entity.get("type"))] = 1
#                features[self.featureSet.getId("POS_dist_"+strDepthLeft+nextToken.get("POS"))] = 1
#                tokenText = sentenceGraph.getTokenText(nextToken)
#                features[self.featureSet.getId("text_dist_"+strDepthLeft+tokenText)] = 1
                if sentenceGraph.tokenIsName[
                        nextToken] and not self.styles["names"]:
                    features[self.featureSet.getId("name_chain_dist_" +
                                                   strDepthLeft + chain +
                                                   "-rev_" + edgeType)] = 1

                features[self.featureSet.getId("chain_dist_" + strDepthLeft +
                                               chain + "-rev_" + edgeType)] = 1
                self.buildChains(nextToken, sentenceGraph, features,
                                 depthLeft - 1, chain + "-rev_" + edgeType,
                                 edgeSet)

    def getNamedEntityHeadTokens(self, sentenceGraph):
        headTokens = []
        for entity in sentenceGraph.entities:
            if entity.get(
                    "given"
            ) == "True":  # known data which can be used for features
                headTokens.append(
                    sentenceGraph.entityHeadTokenByEntity[entity])
        return headTokens

    def buildPOSPairs(self, token, namedEntityHeadTokens, features):
        tokenPOS = token.get("POS")
        assert tokenPOS != None
        for headToken in namedEntityHeadTokens:
            headPOS = headToken.get("POS")
            features[self.featureSet.getId("POS_pair_NE_" + tokenPOS + "-" +
                                           headPOS)] = 1
class MultiEdgeExampleBuilder(ExampleBuilder):
    """
    This example builder makes edge examples, i.e. examples describing
    the event arguments.
    """
    def __init__(self,
                 style=None,
                 length=None,
                 types=[],
                 featureSet=None,
                 classSet=None):
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        else:
            classSet = classSet
        assert (classSet.getId("neg") == 1
                or (len(classSet.Ids) == 2 and classSet.getId("neg") == -1))

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)

        self.styles = self.getParameters(style, [
            "typed", "directed", "headsOnly", "graph_kernel", "noAnnType",
            "noMasking", "maxFeatures", "genia_limits", "epi_limits",
            "id_limits", "rel_limits", "bb_limits", "bi_limits", "co_limits",
            "genia_task1", "ontology", "nodalida", "bacteria_renaming",
            "trigger_features", "rel_features", "ddi_features", "evex",
            "giuliano", "random", "themeOnly", "causeOnly", "no_path",
            "entities", "skip_extra_triggers", "headsOnly", "graph_kernel",
            "trigger_features", "no_task", "no_dependency",
            "disable_entity_features", "disable_terminus_features",
            "disable_single_element_features", "disable_ngram_features",
            "disable_path_edge_features", "no_linear", "subset", "binary",
            "pos_only", "entity_type"
        ])
        if style == None:  # no parameters given
            style["typed"] = style["directed"] = style["headsOnly"] = True
#        self.styles = style
#        if "selftrain_group" in self.styles:
#            self.selfTrainGroups = set()
#            if "selftrain_group-1" in self.styles:
#                self.selfTrainGroups.add("-1")
#            if "selftrain_group0" in self.styles:
#                self.selfTrainGroups.add("0")
#            if "selftrain_group1" in self.styles:
#                self.selfTrainGroups.add("1")
#            if "selftrain_group2" in self.styles:
#                self.selfTrainGroups.add("2")
#            if "selftrain_group3" in self.styles:
#                self.selfTrainGroups.add("3")
#            print >> sys.stderr, "Self-train-groups:", self.selfTrainGroups

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        # NOTE Temporarily re-enabling predicted range
        #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None)
        if self.styles["graph_kernel"]:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(
                self.featureSet)
        if self.styles["noAnnType"]:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if self.styles["noMasking"]:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if self.styles["maxFeatures"]:
            self.multiEdgeFeatureBuilder.maximum = True
        if self.styles["genia_task1"]:
            self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity")
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        if self.styles["ontology"]:
            self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(
                self.featureSet)
        if self.styles["nodalida"]:
            self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(
                self.featureSet)
        if self.styles["bacteria_renaming"]:
            self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(
                self.featureSet)
        if self.styles["trigger_features"]:
            self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
            self.triggerFeatureBuilder.useNonNameEntities = True
            if self.styles["genia_task1"]:
                self.triggerFeatureBuilder.filterAnnTypes.add("Entity")
            #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(featureSet)
        if self.styles["ddi_features"]:
            self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
        if self.styles["evex"]:
            self.evexFeatureBuilder = EVEXFeatureBuilder(featureSet)
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
        self.pathLengths = length
        assert (self.pathLengths == None)
        self.types = types
        if self.styles["random"]:
            from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
            self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)

    def definePredictedValueRange(self, sentences, elementName):
        self.multiEdgeFeatureBuilder.definePredictedValueRange(
            sentences, elementName)

    def getPredictedValueRange(self):
        return self.multiEdgeFeatureBuilder.predictedRange

    def filterEdgesByType(self, edges, typesToInclude):
        if len(typesToInclude) == 0:
            return edges
        edgesToKeep = []
        for edge in edges:
            if edge.get("type") in typesToInclude:
                edgesToKeep.append(edge)
        return edgesToKeep

    def getCategoryNameFromTokens(self, sentenceGraph, t1, t2, directed=True):
        """
        Example class. Multiple overlapping edges create a merged type.
        """
        types = set()
        #        if sentenceGraph.interactionGraph.has_edge(t1, t2):
        #            intEdges = sentenceGraph.interactionGraph.get_edge_data(t1, t2, default={})
        #            # NOTE: Only works if keys are ordered integers
        #            for i in range(len(intEdges)):
        #                types.add(intEdges[i]["element"].get("type"))
        #        if (not directed) and sentenceGraph.interactionGraph.has_edge(t2, t1):
        #            intEdges = sentenceGraph.interactionGraph.get_edge(t2, t1, default={})
        #            # NOTE: Only works if keys are ordered integers
        #            for i in range(len(intEdges)):
        #                types.add(intEdges[i]["element"].get("type"))
        intEdges = sentenceGraph.interactionGraph.getEdges(t1, t2)
        if (not directed):
            intEdges = intEdges + sentenceGraph.interactionGraph.getEdges(
                t2, t1)
        for intEdge in intEdges:
            types.add(intEdge[2].get("type"))
        types = list(types)
        types.sort()
        categoryName = ""
        for name in types:
            if categoryName != "":
                categoryName += "---"
            categoryName += name
        if categoryName != "":
            return categoryName
        else:
            return "neg"

    def getCategoryName(self,
                        sentenceGraph,
                        e1,
                        e2,
                        directed=True,
                        duplicateEntities=None):
        """
        Example class. Multiple overlapping edges create a merged type.
        """
        #        interactions = []
        #        e1s = [e1]
        #        if duplicateEntities != None and e1 in duplicateEntities:
        #            e1s += duplicateEntities[e1]
        #        e2s = [e2]
        #        if duplicateEntities != None and e2 in duplicateEntities:
        #            e2s += duplicateEntities[e2]
        #        for entity1 in e1s:
        #            for entity2 in e2s:
        #                interactions = interactions + sentenceGraph.getInteractions(entity1, entity2)
        #                if not directed:
        #                    interactions = interactions + sentenceGraph.getInteractions(entity2, entity1)
        interactions = sentenceGraph.getInteractions(e1, e2, True)
        #print interactions

        types = set()
        for interaction in interactions:
            types.add(interaction[2].get("type"))
        types = list(types)
        types.sort()
        categoryName = ""
        for name in types:
            if self.styles["causeOnly"] and name != "Cause":
                continue
            if self.styles["themeOnly"] and name != "Theme":
                continue
            if categoryName != "":
                categoryName += "---"
            categoryName += name
        if categoryName != "":
            return categoryName
        else:
            return "neg"

    def isPotentialRELInteraction(self, e1, e2):
        if e1.get("type") == "Protein" and e2.get("type") == "Entity":
            return True
        else:
            return False

    def isPotentialBBInteraction(self, e1, e2, sentenceGraph):
        #if e1.get("type") == "Bacterium" and e2.get("type") in ["Host", "HostPart", "Geographical", "Environmental", "Food", "Medical", "Soil", "Water"]:
        # Note: "Environment" type is misspelled as "Environmental" in the BB-task documentation
        if e1.get("type") == "Bacterium" and e2.get("type") in [
                "Host", "HostPart", "Geographical", "Environment", "Food",
                "Medical", "Soil", "Water"
        ]:
            return True
        elif e1.get("type") == "Host" and e2.get("type") == "HostPart":
            return True
        else:
            return False

    def getBISuperType(self, eType):
        if eType in [
                "GeneProduct", "Protein", "ProteinFamily", "PolymeraseComplex"
        ]:
            return "ProteinEntity"
        elif eType in [
                "Gene", "GeneFamily", "GeneComplex", "Regulon", "Site",
                "Promoter"
        ]:
            return "GeneEntity"
        else:
            return None

    def isPotentialBIInteraction(self, e1, e2, sentenceGraph, stats):
        e1Type = e1.get("type")
        e1SuperType = self.getBISuperType(e1Type)
        e2Type = e2.get("type")
        e2SuperType = self.getBISuperType(e2Type)

        tag = "(" + e1Type + "/" + e2Type + ")"
        if e1Type == "Regulon":
            if e2SuperType in ["GeneEntity", "ProteinEntity"]:
                return True
        if e1SuperType == "ProteinEntity":
            if e2Type in ["Site", "Promoter", "Gene", "GeneComplex"]:
                return True
        if e1Type in ["Action", "Transcription", "Expression"]:
            return True
        if e1Type == "Site":
            if e2SuperType == "GeneEntity":
                return True
        if e1Type == "Promoter":
            if e2SuperType in ["GeneEntity", "ProteinEntity"]:
                return True
        if e1SuperType in ["GeneEntity", "ProteinEntity"]:
            if e2SuperType in ["GeneEntity", "ProteinEntity"]:
                return True
        stats.filter("bi_limits")  #+tag)
        return False

    def isPotentialEPIInteraction(self, e1, e2, sentenceGraph):
        if e1.get("type") != "Catalysis":
            if e1.get("type") in ["Protein", "Entity"]:
                return False
            elif e2.get("type") in ["Protein", "Entity"]:
                return True
            else:
                return False
        else:  # Catalysis
            if e2.get("type") != "Entity":
                return True
            else:
                return False
        assert False, (e1.get("type"), e2.get("type"))

    def isPotentialIDInteraction(self, e1, e2, sentenceGraph):
        e1Type = e1.get("type")
        e2Type = e2.get("type")
        e1IsCore = e1Type in [
            "Protein", "Regulon-operon", "Two-component-system", "Chemical",
            "Organism"
        ]
        e2IsCore = e2Type in [
            "Protein", "Regulon-operon", "Two-component-system", "Chemical",
            "Organism"
        ]
        if e1IsCore:
            return False
        elif e1Type in ["Gene_expression", "Transcription"]:
            if e2Type in ["Protein", "Regulon-operon"]:
                return True
            else:
                return False
        elif e1Type in ["Protein_catabolism", "Phosphorylation"]:
            if e2Type == "Protein":
                return True
            else:
                return False
        elif e1Type == "Localization":
            if e2IsCore or e2Type == "Entity":
                return True
            else:
                return False
        elif e1Type in ["Binding", "Process"]:
            if e2IsCore:
                return True
            else:
                return False
        elif "egulation" in e1Type:
            if e2Type != "Entity":
                return True
            else:
                return False
        elif e1Type == "Entity":
            if e2IsCore:
                return True
            else:
                return False
        assert False, (e1Type, e2Type)

    def isPotentialCOInteraction(self, e1, e2, sentenceGraph):
        if e1.get("type") == "Exp" and e2.get("type") == "Exp":
            anaphoraTok = sentenceGraph.entityHeadTokenByEntity[e1]
            antecedentTok = sentenceGraph.entityHeadTokenByEntity[e2]
            antecedentTokenFound = False
            for token in sentenceGraph.tokens:
                if token == antecedentTok:
                    antecedentTokenFound = True
                if token == anaphoraTok:  # if, not elif, to take into accoutn cases where e1Tok == e2Tok
                    if antecedentTokenFound:
                        return True
                    else:
                        return False
            assert False
        elif e1.get("type") == "Exp" and e2.get("type") == "Protein":
            return True
        else:
            return False

    def isPotentialGeniaInteraction(self, e1, e2):
        e1Type = e1.get("type")
        e2Type = e2.get("type")
        if e1Type == "Protein":
            return False
        elif e1Type in [
                "Entity", "Gene_expression", "Transcription",
                "Protein_catabolism", "Phosphorylation", "Binding"
        ]:
            if e2Type == "Protein":
                return True
            else:
                return False
        elif e1Type == "Localization":
            if e2Type in ["Protein", "Entity"]:
                return True
            else:
                return False
        elif "egulation" in e1Type:
            if e2Type != "Entity":
                return True
            else:
                return False
        assert False, (e1Type, e2Type)

    def getGoldCategoryName(self,
                            goldGraph,
                            entityToGold,
                            e1,
                            e2,
                            directed=True):
        if len(entityToGold[e1]) > 0 and len(entityToGold[e2]) > 0:
            return self.getCategoryName(goldGraph,
                                        entityToGold[e1][0],
                                        entityToGold[e2][0],
                                        directed=directed)
        else:
            return "neg"

    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
        """
        Build examples for a single sentence. Returns a list of examples.
        See Core/ExampleUtils for example format.
        """
        #examples = []
        exampleIndex = 0

        if self.styles["trigger_features"]:
            self.triggerFeatureBuilder.initSentence(sentenceGraph)
        if self.styles["evex"]:
            self.evexFeatureBuilder.initSentence(sentenceGraph)

        # Filter entities, if needed
        #mergedIds = None
        #duplicateEntities = None
        #entities = sentenceGraph.entities
        #entities, mergedIds, duplicateEntities = self.mergeEntities(sentenceGraph, False) # "no_duplicates" in self.styles)
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        entityToDuplicates = sentenceGraph.mergedEntityToDuplicates
        self.exampleStats.addValue("Duplicate entities skipped",
                                   len(sentenceGraph.entities) - len(entities))

        # Connect to optional gold graph
        if goldGraph != None:
            entityToGold = EvaluateInteractionXML.mapEntities(
                entities, goldGraph.entities)

        paths = None
        if not self.styles["no_path"]:
            ##undirected = sentenceGraph.getUndirectedDependencyGraph()
            #undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
            ###undirected = sentenceGraph.dependencyGraph.to_undirected()
            ####undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work
            undirected = sentenceGraph.dependencyGraph.toUndirected()
            #paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
            paths = undirected

        #for edge in sentenceGraph.dependencyGraph.edges:
        #    assert edge[2] != None
        #for edge in undirected.edges:
        #    assert edge[2] != None
        #if sentenceGraph.sentenceElement.get("id") == "GENIA.d70.s5":
        #    print [(x[0].get("id"), x[1].get("id"), x[2].get("id")) for x in sentenceGraph.dependencyGraph.edges]

        # Generate examples based on interactions between entities or interactions between tokens
        if self.styles["entities"]:
            loopRange = len(entities)
        else:
            loopRange = len(sentenceGraph.tokens)
        for i in range(loopRange - 1):
            for j in range(i + 1, loopRange):
                eI = None
                eJ = None
                if self.styles["entities"]:
                    eI = entities[i]
                    eJ = entities[j]
                    tI = sentenceGraph.entityHeadTokenByEntity[eI]
                    tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                    #if "no_ne_interactions" in self.styles and eI.get("isName") == "True" and eJ.get("isName") == "True":
                    #    continue
                    if eI.get("type") == "neg" or eJ.get("type") == "neg":
                        continue
                    if self.styles["skip_extra_triggers"]:
                        if eI.get("source") != None or eJ.get(
                                "source") != None:
                            continue
                else:
                    tI = sentenceGraph.tokens[i]
                    tJ = sentenceGraph.tokens[j]
                # only consider paths between entities (NOTE! entities, not only named entities)
                if self.styles["headsOnly"]:
                    if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(
                            sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                        continue

                if self.styles["directed"]:
                    # define forward
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(
                            sentenceGraph, eI, eJ, True)
                        if goldGraph != None:
                            categoryName = self.getGoldCategoryName(
                                goldGraph, entityToGold, eI, eJ, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(
                            sentenceGraph, tI, tJ, True)
                    # make forward
                    self.exampleStats.beginExample(categoryName)
                    makeExample = True
                    if self.styles[
                            "genia_limits"] and not self.isPotentialGeniaInteraction(
                                eI, eJ):
                        makeExample = False
                        self.exampleStats.filter("genia_limits")
                    if self.styles["genia_task1"] and (
                            eI.get("type") == "Entity"
                            or eJ.get("type") == "Entity"):
                        makeExample = False
                        self.exampleStats.filter("genia_task1")
                    if self.styles[
                            "rel_limits"] and not self.isPotentialRELInteraction(
                                eI, eJ):
                        makeExample = False
                        self.exampleStats.filter("rel_limits")
                    if self.styles[
                            "co_limits"] and not self.isPotentialCOInteraction(
                                eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("co_limits")
                    if self.styles[
                            "bb_limits"] and not self.isPotentialBBInteraction(
                                eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("bb_limits")
                        if categoryName != "neg":
                            self.exampleStats.filter("bb_limits(" +
                                                     categoryName + ":" +
                                                     eI.get("type") + "/" +
                                                     eJ.get("type") + ")")
                    if self.styles[
                            "bi_limits"] and not self.isPotentialBIInteraction(
                                eI, eJ, sentenceGraph, self.exampleStats):
                        makeExample = False
                        #self.exampleStats.filter("bi_limits")
                    if self.styles[
                            "epi_limits"] and not self.isPotentialEPIInteraction(
                                eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("epi_limits")
                    if self.styles[
                            "id_limits"] and not self.isPotentialIDInteraction(
                                eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("id_limits")
#                    if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_limits")
#                    if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_group")
                    if self.styles["pos_only"] and categoryName == "neg":
                        makeExample = False
                        self.exampleStats.filter("pos_only")
                    if makeExample:
                        #examples.append( self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ) )
                        ExampleUtils.appendExamples([
                            self.buildExample(tI, tJ, paths, sentenceGraph,
                                              categoryName, exampleIndex, eI,
                                              eJ)
                        ], outfile)
                        exampleIndex += 1
                    self.exampleStats.endExample()

                    # define reverse
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(
                            sentenceGraph, eJ, eI, True)
                        if goldGraph != None:
                            categoryName = self.getGoldCategoryName(
                                goldGraph, entityToGold, eJ, eI, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(
                            sentenceGraph, tJ, tI, True)
                    # make reverse
                    self.exampleStats.beginExample(categoryName)
                    makeExample = True
                    if self.styles[
                            "genia_limits"] and not self.isPotentialGeniaInteraction(
                                eJ, eI):
                        makeExample = False
                        self.exampleStats.filter("genia_limits")
                    if self.styles["genia_task1"] and (
                            eI.get("type") == "Entity"
                            or eJ.get("type") == "Entity"):
                        makeExample = False
                        self.exampleStats.filter("genia_task1")
                    if self.styles[
                            "rel_limits"] and not self.isPotentialRELInteraction(
                                eJ, eI):
                        makeExample = False
                        self.exampleStats.filter("rel_limits")
                    if self.styles[
                            "co_limits"] and not self.isPotentialCOInteraction(
                                eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("co_limits")
                    if self.styles[
                            "bb_limits"] and not self.isPotentialBBInteraction(
                                eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("bb_limits")
                        if categoryName != "neg":
                            self.exampleStats.filter("bb_limits(" +
                                                     categoryName + ":" +
                                                     eJ.get("type") + "/" +
                                                     eI.get("type") + ")")
                    if self.styles[
                            "bi_limits"] and not self.isPotentialBIInteraction(
                                eJ, eI, sentenceGraph, self.exampleStats):
                        makeExample = False
                        #self.exampleStats.filter("bi_limits")
                    if self.styles[
                            "epi_limits"] and not self.isPotentialEPIInteraction(
                                eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("epi_limits")
                    if self.styles[
                            "id_limits"] and not self.isPotentialIDInteraction(
                                eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("id_limits")
#                    if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_limits")
#                    if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_group")
                    if self.styles["pos_only"] and categoryName == "neg":
                        makeExample = False
                        self.exampleStats.filter("pos_only")
                    if makeExample:
                        #examples.append( self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI) )
                        ExampleUtils.appendExamples([
                            self.buildExample(tJ, tI, paths, sentenceGraph,
                                              categoryName, exampleIndex, eJ,
                                              eI)
                        ], outfile)
                        exampleIndex += 1
                    self.exampleStats.endExample()
                else:
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(
                            sentenceGraph, eI, eJ, False)
                    else:
                        categoryName = self.getCategoryNameFromTokens(
                            sentenceGraph, tI, tJ, False)
                    self.exampleStats.beginExample(categoryName)
                    forwardExample = self.buildExample(tI, tJ, paths,
                                                       sentenceGraph,
                                                       categoryName,
                                                       exampleIndex, eI, eJ)
                    if not self.styles["graph_kernel"]:
                        reverseExample = self.buildExample(
                            tJ, tI, paths, sentenceGraph, categoryName,
                            exampleIndex, eJ, eI)
                        forwardExample[2].update(reverseExample[2])
                    #examples.append(forwardExample)
                    ExampleUtils.appendExamples([forwardExample], outfile)
                    exampleIndex += 1
                    self.exampleStats.endExample()

        #return examples
        return exampleIndex

    def buildExample(self,
                     token1,
                     token2,
                     paths,
                     sentenceGraph,
                     categoryName,
                     exampleIndex,
                     entity1=None,
                     entity2=None):
        """
        Build a single directed example for the potential edge between token1 and token2
        """
        # dummy return for speed testing
        #return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),1,{},{})

        # define features
        features = {}
        if True:  #token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2):
            #if token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2):
            #    path = paths[token1][token2]
            #else:
            #    path = [token1, token2]
            if not self.styles["no_path"]:
                # directedPath reduces performance by 0.01 pp
                #directedPath = sentenceGraph.dependencyGraph.getPaths(token1, token2)
                #if len(directedPath) == 0:
                #    directedPath = sentenceGraph.dependencyGraph.getPaths(token2, token1)
                #    for dp in directedPath:
                #        dp.reverse()
                #if len(directedPath) == 0:
                #    path = paths.getPaths(token1, token2)
                #else:
                #    path = directedPath

                path = paths.getPaths(token1, token2)
                if len(path) > 0:
                    #if len(path) > 1:
                    #    print len(path)
                    path = path[0]
                    pathExists = True
                else:
                    path = [token1, token2]
                    pathExists = False
            else:
                path = [token1, token2]
                pathExists = False
            #print token1.get("id"), token2.get("id")
            assert (self.pathLengths == None)
            if self.pathLengths == None or len(path) - 1 in self.pathLengths:
                #                if not "no_ontology" in self.styles:
                #                    self.ontologyFeatureBuilder.setFeatureVector(features)
                #                    self.ontologyFeatureBuilder.buildOntologyFeaturesForPath(sentenceGraph, path)
                #                    self.ontologyFeatureBuilder.setFeatureVector(None)
                if self.styles["trigger_features"]:  # F 85.52 -> 85.55
                    self.triggerFeatureBuilder.setFeatureVector(features)
                    self.triggerFeatureBuilder.tag = "trg1_"
                    self.triggerFeatureBuilder.buildFeatures(token1)
                    self.triggerFeatureBuilder.tag = "trg2_"
                    self.triggerFeatureBuilder.buildFeatures(token2)
                    self.triggerFeatureBuilder.setFeatureVector(None)
                # REL features
                if self.styles["rel_features"] and not self.styles["no_task"]:
                    self.relFeatureBuilder.setFeatureVector(features)
                    self.relFeatureBuilder.tag = "rel1_"
                    self.relFeatureBuilder.buildAllFeatures(
                        sentenceGraph.tokens,
                        sentenceGraph.tokens.index(token1))
                    self.relFeatureBuilder.tag = "rel2_"
                    self.relFeatureBuilder.buildAllFeatures(
                        sentenceGraph.tokens,
                        sentenceGraph.tokens.index(token2))
                    self.relFeatureBuilder.setFeatureVector(None)
                if self.styles[
                        "bacteria_renaming"] and not self.styles["no_task"]:
                    self.bacteriaRenamingFeatureBuilder.setFeatureVector(
                        features)
                    self.bacteriaRenamingFeatureBuilder.buildPairFeatures(
                        entity1, entity2)
                    #self.bacteriaRenamingFeatureBuilder.buildSubstringFeatures(entity1, entity2) # decreases perf. 74.76 -> 72.41
                    self.bacteriaRenamingFeatureBuilder.setFeatureVector(None)
                if self.styles["co_limits"] and not self.styles["no_task"]:
                    e1Offset = Range.charOffsetToSingleTuple(
                        entity1.get("charOffset"))
                    e2Offset = Range.charOffsetToSingleTuple(
                        entity2.get("charOffset"))
                    if Range.contains(e1Offset, e2Offset):
                        features[self.featureSet.getId("e1_contains_e2")] = 1
                        if entity2.get("isName") == "True":
                            features[self.featureSet.getId(
                                "e1_contains_e2name")] = 1
                    if Range.contains(e2Offset, e1Offset):
                        features[self.featureSet.getId("e2_contains_e1")] = 1
                        if entity1.get("isName") == "True":
                            features[self.featureSet.getId(
                                "e2_contains_e1name")] = 1
                if self.styles["ddi_features"]:
                    self.drugFeatureBuilder.setFeatureVector(features)
                    self.drugFeatureBuilder.tag = "ddi_"
                    self.drugFeatureBuilder.buildPairFeatures(entity1, entity2)
                    if self.styles["ddi_mtmx"]:
                        self.drugFeatureBuilder.buildMTMXFeatures(
                            entity1, entity2)
                    self.drugFeatureBuilder.setFeatureVector(None)
                #if "graph_kernel" in self.styles or not "no_dependency" in self.styles:
                #    #print "Getting edges"
                #    if token1 != token2 and pathExists:
                #        #print "g1"
                #        edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path)
                #        #print "g2"
                #    else:
                #        edges = None
                if self.styles["graph_kernel"]:
                    self.graphKernelFeatureBuilder.setFeatureVector(
                        features, entity1, entity2)
                    self.graphKernelFeatureBuilder.buildGraphKernelFeatures(
                        sentenceGraph, path)
                    self.graphKernelFeatureBuilder.setFeatureVector(None)
                if self.styles["entity_type"]:
                    features[self.featureSet.getId("e1_" +
                                                   entity1.get("type"))] = 1
                    features[self.featureSet.getId("e2_" +
                                                   entity2.get("type"))] = 1
                    features[self.featureSet.getId("distance_" +
                                                   str(len(path)))] = 1
                if not self.styles["no_dependency"]:
                    #print "Dep features"
                    self.multiEdgeFeatureBuilder.setFeatureVector(
                        features, entity1, entity2)
                    #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast
                    if not self.styles["disable_entity_features"]:
                        self.multiEdgeFeatureBuilder.buildEntityFeatures(
                            sentenceGraph)
                    self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
                    if not self.styles["disable_terminus_features"]:
                        self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(
                            path, sentenceGraph)  # remove for fast
                    if not self.styles["disable_single_element_features"]:
                        self.multiEdgeFeatureBuilder.buildSingleElementFeatures(
                            path, sentenceGraph)
                    if not self.styles["disable_ngram_features"]:
                        #print "NGrams"
                        self.multiEdgeFeatureBuilder.buildPathGrams(
                            2, path, sentenceGraph)  # remove for fast
                        self.multiEdgeFeatureBuilder.buildPathGrams(
                            3, path, sentenceGraph)  # remove for fast
                        self.multiEdgeFeatureBuilder.buildPathGrams(
                            4, path, sentenceGraph)  # remove for fast
                    #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast
                    #if edges != None:
                    #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast
                    #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast
                    if not self.styles["disable_path_edge_features"]:
                        self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(
                            path, sentenceGraph)
                    self.multiEdgeFeatureBuilder.buildSentenceFeatures(
                        sentenceGraph)
                    self.multiEdgeFeatureBuilder.setFeatureVector(None)
                if self.styles["nodalida"]:
                    self.nodalidaFeatureBuilder.setFeatureVector(
                        features, entity1, entity2)
                    shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(
                        sentenceGraph.dependencyGraph, path)
                    print shortestPaths
                    if len(shortestPaths) > 0:
                        self.nodalidaFeatureBuilder.buildNGrams(
                            shortestPaths, sentenceGraph)
                    self.nodalidaFeatureBuilder.setFeatureVector(None)
                if not self.styles["no_linear"]:
                    self.tokenFeatureBuilder.setFeatureVector(features)
                    for i in range(len(sentenceGraph.tokens)):
                        if sentenceGraph.tokens[i] == token1:
                            token1Index = i
                        if sentenceGraph.tokens[i] == token2:
                            token2Index = i
                    linearPreTag = "linfw_"
                    if token1Index > token2Index:
                        token1Index, token2Index = token2Index, token1Index
                        linearPreTag = "linrv_"
                    self.tokenFeatureBuilder.buildLinearOrderFeatures(
                        token1Index, sentenceGraph, 2, 2, preTag="linTok1")
                    self.tokenFeatureBuilder.buildLinearOrderFeatures(
                        token2Index, sentenceGraph, 2, 2, preTag="linTok2")
                    # Before, middle, after
                    #                self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf")
                    #                self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw")
                    #                self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af")
                    # before-middle, middle, middle-after
                    #                    self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2)
                    #                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2)
                    #                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2)
                    self.tokenFeatureBuilder.setFeatureVector(None)
                if self.styles["random"]:
                    self.randomFeatureBuilder.setFeatureVector(features)
                    self.randomFeatureBuilder.buildRandomFeatures(100, 0.01)
                    self.randomFeatureBuilder.setFeatureVector(None)
                if self.styles["genia_limits"] and not self.styles["no_task"]:
                    e1Type = entity1.get("type")
                    e2Type = entity2.get("type")
                    assert (entity1.get("isName") == "False")
                    if entity2.get("isName") == "True":
                        features[self.featureSet.getId(
                            "GENIA_target_protein")] = 1
                    else:
                        features[self.featureSet.getId(
                            "GENIA_nested_event")] = 1
                    if e1Type.find(
                            "egulation"
                    ) != -1:  # leave r out to avoid problems with capitalization
                        if entity2.get("isName") == "True":
                            features[self.featureSet.getId(
                                "GENIA_regulation_of_protein")] = 1
                        else:
                            features[self.featureSet.getId(
                                "GENIA_regulation_of_event")] = 1
                if self.styles["bi_limits"]:
                    # Make features based on entity types
                    e1Type = entity1.get("type")
                    e2Type = entity2.get("type")
                    e1SuperType = str(self.getBISuperType(e1Type))
                    e2SuperType = str(self.getBISuperType(e2Type))
                    features[self.featureSet.getId("BI_e1_" + e1Type)] = 1
                    features[self.featureSet.getId("BI_e2_" + e2Type)] = 1
                    features[self.featureSet.getId("BI_e1sup_" +
                                                   e1SuperType)] = 1
                    features[self.featureSet.getId("BI_e2sup_" +
                                                   e2SuperType)] = 1
                    features[self.featureSet.getId("BI_e1e2_" + e1Type + "_" +
                                                   e2Type)] = 1
                    features[self.featureSet.getId("BI_e1e2sup_" +
                                                   e1SuperType + "_" +
                                                   e2SuperType)] = 1
                if self.styles["evex"]:
                    self.evexFeatureBuilder.setFeatureVector(
                        features, entity1, entity2)
                    self.evexFeatureBuilder.buildEdgeFeatures(
                        entity1, entity2, token1, token2, path, sentenceGraph)
                    self.evexFeatureBuilder.setFeatureVector(None)
                if self.styles["giuliano"]:
                    self.giulianoFeatureBuilder.setFeatureVector(
                        features, entity1, entity2)
                    self.giulianoFeatureBuilder.buildEdgeFeatures(
                        entity1, entity2, token1, token2, path, sentenceGraph)
                    self.giulianoFeatureBuilder.setFeatureVector(None)
            else:
                features[self.featureSet.getId("always_negative")] = 1
                if self.styles["subset"]:
                    features[self.featureSet.getId("out_of_scope")] = 1
        else:
            features[self.featureSet.getId("always_negative")] = 1
            if self.styles["subset"]:
                features[self.featureSet.getId("out_of_scope")] = 1
            path = [token1, token2]
        # define extra attributes
        #if int(path[0].get("id").split("_")[-1]) < int(path[-1].get("id").split("_")[-1]):
        if int(path[0].get("charOffset").split("-")[0]) < int(
                path[-1].get("charOffset").split("-")[0]):
            #extra = {"xtype":"edge","type":"i","t1":path[0],"t2":path[-1]}
            extra = {
                "xtype": "edge",
                "type": "i",
                "t1": path[0].get("id"),
                "t2": path[-1].get("id")
            }
            extra["deprev"] = False
        else:
            #extra = {"xtype":"edge","type":"i","t1":path[-1],"t2":path[0]}
            extra = {
                "xtype": "edge",
                "type": "i",
                "t1": path[-1].get("id"),
                "t2": path[0].get("id")
            }
            extra["deprev"] = True
        if entity1 != None:
            #extra["e1"] = entity1
            extra["e1"] = entity1.get("id")
            if sentenceGraph.mergedEntityToDuplicates != None:
                #extra["e1GoldIds"] = mergedEntityIds[entity1]
                extra["e1DuplicateIds"] = ",".join([
                    x.get("id")
                    for x in sentenceGraph.mergedEntityToDuplicates[entity1]
                ])
        if entity2 != None:
            #extra["e2"] = entity2
            extra["e2"] = entity2.get("id")
            if sentenceGraph.mergedEntityToDuplicates != None:
                extra["e2DuplicateIds"] = ",".join([
                    x.get("id")
                    for x in sentenceGraph.mergedEntityToDuplicates[entity2]
                ])
                #extra["e2GoldIds"] = mergedEntityIds[entity2]
        extra["categoryName"] = categoryName
        if self.styles["bacteria_renaming"]:
            if entity1.get("text") != None and entity1.get("text") != "":
                extra["e1t"] = entity1.get("text").replace(" ", "---").replace(
                    ":", "-COL-")
            if entity2.get("text") != None and entity2.get("text") != "":
                extra["e2t"] = entity2.get("text").replace(" ", "---").replace(
                    ":", "-COL-")
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId != None:
            extra["SOID"] = sentenceOrigId
        # make example
        if self.styles["binary"]:
            if categoryName != "neg":
                category = 1
            else:
                category = -1
            categoryName = "i"
        else:
            category = self.classSet.getId(categoryName)

        # NOTE: temporarily disable for replicating 110310 experiment
        #features[self.featureSet.getId("extra_constant")] = 1
        return (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex),
                category, features, extra)