Exemplo n.º 1
0
    def _markNamedEntities(self):
        """
        Record, per token, whether it belongs to an entity, whether it belongs
        to a _named_ entity, and which entities have their head on it.

        Named entities are sometimes masked when testing learning of
        interactions, so that the system cannot make a trivial decision based
        on commonly interacting names. An entity counts as named here when
        its "given" attribute equals "True".

        Results are stored in self.tokenIsEntity, self.tokenIsName and
        self.tokenIsEntityHead, all keyed by token.
        """
        # Start every token off as "not part of any entity"
        self.tokenIsName = dict.fromkeys(self.tokens, False)
        self.tokenIsEntity = dict.fromkeys(self.tokens, False)
        self.tokenIsEntityHead = dict((tok, []) for tok in self.tokens)
        for entity in self.entities:
            spans = Range.charOffsetToTuples(entity.get("charOffset"))
            headSpan = Range.charOffsetToSingleTuple(entity.get("headOffset"))
            for tok in self.tokens:
                tokSpan = Range.charOffsetToSingleTuple(tok.get("charOffset"))
                for span in spans:
                    if not Range.overlap(span, tokSpan):
                        continue
                    self.tokenIsEntity[tok] = True
                    if entity.get("given") == "True":
                        self.tokenIsName[tok] = True
                # The head is tracked separately from the full entity span
                if Range.overlap(headSpan, tokSpan):
                    self.tokenIsEntityHead[tok].append(entity)
Exemplo n.º 2
0
 def _markNamedEntities(self):
     """
     Record, per token, whether it belongs to an entity, whether it belongs
     to a _named_ entity, and which entities have their head on it.

     Named entities are sometimes masked when testing learning of
     interactions, so that the system cannot make a trivial decision based
     on commonly interacting names. An entity without an "isName" attribute
     is treated as named, and the attribute is written back as "True".
     """
     # Start every token off as "not part of any entity"
     self.tokenIsName = dict.fromkeys(self.tokens, False)
     self.tokenIsEntity = dict.fromkeys(self.tokens, False)
     self.tokenIsEntityHead = dict((tok, []) for tok in self.tokens)
     for entity in self.entities:
         spans = Range.charOffsetToTuples(entity.get("charOffset"))
         headSpan = Range.charOffsetToSingleTuple(entity.get("headOffset"))
         for tok in self.tokens:
             tokSpan = Range.charOffsetToSingleTuple(tok.get("charOffset"))
             for span in spans:
                 if not Range.overlap(span, tokSpan):
                     continue
                 self.tokenIsEntity[tok] = True
                 nameFlag = entity.get("isName")
                 if nameFlag is None:
                     # Missing attribute: default the entity to being a name
                     entity.set("isName", "True")
                     self.tokenIsName[tok] = True
                 elif nameFlag == "True":
                     self.tokenIsName[tok] = True
             # The head is tracked separately from the full entity span
             if Range.overlap(headSpan, tokSpan):
                 self.tokenIsEntityHead[tok].append(entity)
Exemplo n.º 3
0
 def getRelativePosition(self, entity1Range, entity2Range, token):
     """
     Classify where a token lies relative to a pair of entity ranges.

     Returns "Entity1" or "Entity2" when the token overlaps that entity
     (entity 1 is checked first). Otherwise returns "Fore", "After" or
     "Between", decided by comparing the token's end offset with the
     combined extent of both entities.
     """
     tokenSpan = Range.charOffsetToSingleTuple(token.get("charOffset"))
     labelled = (("Entity1", entity1Range), ("Entity2", entity2Range))
     for label, entityRange in labelled:
         if Range.overlap(entityRange, tokenSpan):
             return label
     firstStart = min(entity1Range[0], entity2Range[0])
     lastEnd = max(entity1Range[1], entity2Range[1])
     tokenEnd = tokenSpan[1]
     if tokenEnd < firstStart:
         return "Fore"
     if tokenEnd > lastEnd:
         return "After"
     return "Between"
Exemplo n.º 4
0
 def getRelativePosition(self, entity1Range, entity2Range, token):
     """
     Tell whether a token is on one of two entities, or before, after or
     between them.

     An overlap with entity 1 wins over an overlap with entity 2. For
     tokens on neither entity, the token's end offset is compared with the
     smallest start and largest end of the two entity ranges.
     """
     span = Range.charOffsetToSingleTuple(token.get("charOffset"))
     if Range.overlap(entity1Range, span):
         return "Entity1"
     if Range.overlap(entity2Range, span):
         return "Entity2"
     combinedStart = min(entity1Range[0], entity2Range[0])
     combinedEnd = max(entity1Range[1], entity2Range[1])
     if span[1] < combinedStart:
         position = "Fore"
     elif span[1] > combinedEnd:
         position = "After"
     else:
         position = "Between"
     return position
Exemplo n.º 5
0
 def getMetaMapFeatures(self, token, sentenceGraph, features):
     """
     Add features from the sentence's MetaMap analysis to the features dict.

     Looks up analyses/metamap under sentenceGraph.sentenceElement and
     returns without touching features when either is missing. For every
     "phrase" overlapping the token, the "score" attribute becomes the
     numeric feature "_metamap_score", and each comma-separated value of
     every other attribute (except "charOffset" and "text") becomes a
     binary feature "_metamap_<attribute>_<value>".
     """
     analyses = sentenceGraph.sentenceElement.find("analyses")
     metamap = None if analyses is None else analyses.find("metamap")
     if metamap is None:
         return
     tokenSpan = Range.charOffsetToSingleTuple(token.get("charOffset"))
     ignored = ("charOffset", "text")
     for phrase in metamap.findall("phrase"):
         phraseSpan = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
         if not Range.overlap(tokenSpan, phraseSpan):
             continue
         attributes = phrase.attrib
         # Sorted so features are generated in a stable order
         for name in sorted(attributes.keys()):
             if name in ignored:
                 continue
             if name == "score":
                 features["_metamap_score"] = 0.001 * abs(int(attributes[name]))
                 continue
             prefix = "_metamap_" + name + "_"
             for value in attributes[name].split(","):
                 features[prefix + value.replace(" ", "-")] = 1
Exemplo n.º 6
0
def insertElements(corpus, specAnn):
    """
    Attach annotation spans to the sentences that fully contain them.

    For every "document" element under corpus, the spans in
    specAnn[origId] are compared with each sentence. A span lying
    completely inside a sentence is removed from specAnn, gets a
    sentence-relative "charOffset" attribute in place of its "offset"
    attribute, and is appended to the sentence's analyses/entities
    element. Both container elements are created when missing. Spans that
    only partially overlap a sentence are left in specAnn untouched.

    @param corpus: element whose descendants include the "document" elements
    @param specAnn: dict mapping a document's "origId" to a list of span
                    elements; each span carries an "offset" (begin, end) pair
                    and a "text" attribute. The lists are modified in place.
    @raise AssertionError: if a document has no entry in specAnn, or a
                           span's text differs from the sentence text at its
                           offset
    """
    for document in corpus.iter('document'):
        docId = document.get("origId")
        assert docId in specAnn, docId
        for sentence in document.iter('sentence'):
            sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
            # Compare with None explicitly: an Element without children is
            # falsy, so "if not element" would treat an existing but empty
            # container as missing and create a duplicate.
            analyses = sentence.find("analyses")
            if analyses is None:
                analyses = ET.SubElement(sentence, "analyses")
            container = analyses.find("entities")
            if container is None:
                container = ET.SubElement(analyses, "entities")
            # Map the spans (iterate over a copy, as matches are removed)
            for span in specAnn[docId][:]:
                offset = span.get("offset")
                if Range.overlap(offset, sentOffset):
                    # Skip spans that extend beyond the sentence boundaries
                    if sentOffset[0] > offset[0] or sentOffset[1] < offset[1]:
                        continue
                    specAnn[docId].remove(span)
                    charOffset = (offset[0] - sentOffset[0], offset[1] - sentOffset[0])
                    matchingText = sentence.get("text")[charOffset[0]:charOffset[1]]
                    spanText = span.get("text")
                    assert matchingText == spanText, (matchingText, spanText, charOffset)
                    span.set("charOffset", "-".join([str(x) for x in charOffset]))
                    # A "--" would mean that a negative offset was produced
                    assert "--" not in span.get("charOffset"), [str(x) for x in charOffset]
                    del span.attrib["offset"]
                    container.append(span)
Exemplo n.º 7
0
def moveElements(document):
    """
    Move a document's entities and interactions into its sentence elements.

    Each direct "entity" child of document is moved to the first sentence
    that overlaps any of its character offsets. The entity id is rewritten
    to be sentence-scoped, the old charOffset is kept as "origOffset", and
    charOffset is made relative to the sentence start. Each direct
    "interaction" child is then moved to the sentence holding its e1
    entity, renumbered, and its e1/e2 references are mapped to the new ids.

    @raise KeyError: if an interaction refers to an entity that was not
                     moved into any sentence
    """
    newIdByOldId = {}
    sentenceByEntityId = {}
    sentenceIndexByEntityId = {}
    for sentenceIndex, sentence in enumerate(document.findall("sentence")):
        sentenceSpan = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        # Move entities
        movedCount = 0
        for entity in document.findall("entity"):
            spans = Range.charOffsetToTuples(entity.get("charOffset"))
            if not any(Range.overlap(sentenceSpan, span) for span in spans):
                continue
            document.remove(entity)
            sentence.append(entity)
            oldId = entity.get("id")
            suffix = oldId.rsplit(".", 1)[-1]
            if suffix.startswith("e"):
                # Keep the existing "eN" suffix under the sentence id
                newId = sentence.get("id") + "." + suffix
            else:
                # Foreign id scheme: remember it and number the entity afresh
                entity.set("docId", oldId)
                newId = sentence.get("id") + ".e" + str(movedCount)
            entity.set("id", newId)
            newIdByOldId[oldId] = newId
            sentenceByEntityId[oldId] = sentence
            sentenceIndexByEntityId[oldId] = sentenceIndex
            shifted = [(span[0] - sentenceSpan[0], span[1] - sentenceSpan[0])
                       for span in spans]
            entity.set("origOffset", entity.get("charOffset"))
            entity.set("charOffset", Range.tuplesToCharOffset(shifted))
            movedCount += 1
    # Move interactions
    for counter, interaction in enumerate(document.findall("interaction")):
        # Interactions go to a sentence always by e1, as this is the event they are an argument of.
        # If an intersentence interaction is a relation, this shouldn't matter.
        sourceId = interaction.get("e1")
        target = sentenceByEntityId[sourceId]
        document.remove(interaction)
        target.append(interaction)
        interaction.set("id", target.get("id") + ".i" + str(counter))
        interaction.set("e1", newIdByOldId[sourceId])
        interaction.set("e2", newIdByOldId[interaction.get("e2")])
Exemplo n.º 8
0
 def getTokens(self, entity, tokenTuples):
     """
     Return the texts of the consecutive tokens that overlap an entity.

     tokenTuples is a sequence of (offset, tokenElement) pairs. Scanning
     stops at the first non-overlapping token after a match was found.
     """
     rawOffset = entity.get("charOffset")
     assert rawOffset is not None
     span = Range.charOffsetToSingleTuple(rawOffset)
     texts = []
     for tokenTuple in tokenTuples:
         if not Range.overlap(span, tokenTuple[0]):
             if texts:  # passed end
                 break
             continue
         texts.append(tokenTuple[1].get("text"))
     return texts
Exemplo n.º 9
0
 def getTokens(self, entity, tokenTuples):
     """
     Collect the text of each token overlapping the entity's charOffset.

     tokenTuples holds (offset, tokenElement) pairs. Once at least one
     token has matched, the first non-matching token ends the scan.
     """
     charOffset = entity.get("charOffset")
     assert charOffset is not None
     entitySpan = Range.charOffsetToSingleTuple(charOffset)
     matched = []
     for pair in tokenTuples:
         overlapping = Range.overlap(entitySpan, pair[0])
         if overlapping:
             matched.append(pair[1].get("text"))
         elif matched:  # passed end
             break
     return matched
Exemplo n.º 10
0
 def markNamedEntities(self, entityElements):
     """ Marks tokens belonging to named entities

     For each entity, every token in self.tokensById overlapping one of the
     entity's comma-separated "begin-end" offsets gets the entity id
     appended to its entities list. Returns the ids of the marked tokens,
     one entry per overlapping (token, offset) pair.
     """
     markedTokenIds = []
     for element in entityElements:
         spans = []
         for part in element.attrib["charOffset"].split(","):
             begin, end = part.split("-")
             spans.append((int(begin), int(end)))
         for _, token in self.tokensById.iteritems():
             for span in spans:
                 if not Range.overlap(span, token.charOffset):
                     continue
                 token.entities.append(element.attrib["id"])
                 markedTokenIds.append(token.id)
     return markedTokenIds
Exemplo n.º 11
0
 def markNamedEntities(self, entityElements):
     """ Marks tokens belonging to named entities

     The "charOffset" attribute of each entity is parsed as comma-separated
     "begin-end" ranges. A token from self.tokensById is marked (entity id
     appended to token.entities, token id appended to the result) once for
     every range it overlaps.
     """
     tokenIds = []
     for entityElement in entityElements:
         ranges = []
         for rangeString in entityElement.attrib["charOffset"].split(","):
             first, last = rangeString.split("-")
             ranges.append((int(first), int(last)))
         for _, tok in self.tokensById.iteritems():
             for entityRange in ranges:
                 if Range.overlap(entityRange, tok.charOffset):
                     tok.entities.append(entityElement.attrib["id"])
                     tokenIds.append(tok.id)
     return tokenIds
Exemplo n.º 12
0
def moveElements(document):
    """
    Move a document's entities and interactions into its sentence elements.

    Each direct "entity" child of document is moved to the first sentence
    overlapping its character offset. Its id is rewritten to be
    sentence-scoped, the old charOffset is kept as "origOffset", and
    charOffset is made relative to the sentence start. Each direct
    "interaction" child then goes to the sentence of e1 when e1's sentence
    precedes e2's, otherwise to the sentence of e2. It is renumbered and
    its e1/e2 references are mapped to the new entity ids.

    @raise KeyError: if an interaction refers to an entity that was not
                     moved into any sentence
    """
    newIdByOldId = {}
    sentenceByEntityId = {}
    sentenceIndexByEntityId = {}
    for sentenceIndex, sentence in enumerate(document.findall("sentence")):
        sentenceSpan = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        # Move entities
        movedCount = 0
        for entity in document.findall("entity"):
            entitySpan = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            if not Range.overlap(sentenceSpan, entitySpan):
                continue
            document.remove(entity)
            sentence.append(entity)
            oldId = entity.get("id")
            suffix = oldId.rsplit(".", 1)[-1]
            if suffix.startswith("e"):
                # Keep the existing "eN" suffix under the sentence id
                newId = sentence.get("id") + "." + suffix
            else:
                # Foreign id scheme: remember it and number the entity afresh
                entity.set("docId", oldId)
                newId = sentence.get("id") + ".e" + str(movedCount)
            entity.set("id", newId)
            newIdByOldId[oldId] = newId
            sentenceByEntityId[oldId] = sentence
            sentenceIndexByEntityId[oldId] = sentenceIndex
            relativeBegin = entitySpan[0] - sentenceSpan[0]
            relativeEnd = entitySpan[1] - sentenceSpan[0]
            entity.set("origOffset", entity.get("charOffset"))
            entity.set("charOffset", str(relativeBegin) + "-" + str(relativeEnd))
            movedCount += 1
    # Move interactions
    for counter, interaction in enumerate(document.findall("interaction")):
        firstId = interaction.get("e1")
        secondId = interaction.get("e2")
        # The interaction lives in whichever sentence comes first
        if sentenceIndexByEntityId[firstId] < sentenceIndexByEntityId[secondId]:
            target = sentenceByEntityId[firstId]
        else:
            target = sentenceByEntityId[secondId]
        document.remove(interaction)
        target.append(interaction)
        interaction.set("id", target.get("id") + ".i" + str(counter))
        interaction.set("e1", newIdByOldId[firstId])
        interaction.set("e2", newIdByOldId[secondId])
Exemplo n.º 13
0
def insertElements(corpus, specAnn):
    """
    Attach annotation spans to the sentences that fully contain them.

    For every "document" element under corpus, the spans in
    specAnn[origId] are compared with each sentence. A span lying
    completely inside a sentence is removed from specAnn, gets a
    sentence-relative "charOffset" attribute in place of its "offset"
    attribute, and is appended to the sentence's analyses/entities
    element. Both container elements are created when missing. Spans that
    only partially overlap a sentence are left in specAnn untouched.

    @param corpus: element whose descendants include the "document" elements
    @param specAnn: dict mapping a document's "origId" to a list of span
                    elements; each span carries an "offset" (begin, end) pair
                    and a "text" attribute. The lists are modified in place.
    @raise AssertionError: if a document has no entry in specAnn, or a
                           span's text differs from the sentence text at its
                           offset
    """
    for document in corpus.iter('document'):
        docId = document.get("origId")
        assert docId in specAnn, docId
        for sentence in document.iter('sentence'):
            sentOffset = Range.charOffsetToSingleTuple(
                sentence.get("charOffset"))
            # Compare with None explicitly: an Element without children is
            # falsy, so "if not element" would treat an existing but empty
            # container as missing and create a duplicate.
            analyses = sentence.find("analyses")
            if analyses is None:
                analyses = ET.SubElement(sentence, "analyses")
            container = analyses.find("entities")
            if container is None:
                container = ET.SubElement(analyses, "entities")
            # Map the spans (iterate over a copy, as matches are removed)
            for span in specAnn[docId][:]:
                offset = span.get("offset")
                if Range.overlap(offset, sentOffset):
                    # Skip spans that extend beyond the sentence boundaries
                    if sentOffset[0] > offset[0] or sentOffset[1] < offset[1]:
                        continue
                    specAnn[docId].remove(span)
                    charOffset = (offset[0] - sentOffset[0],
                                  offset[1] - sentOffset[0])
                    matchingText = sentence.get(
                        "text")[charOffset[0]:charOffset[1]]
                    spanText = span.get("text")
                    assert matchingText == spanText, (matchingText, spanText,
                                                      charOffset)
                    span.set("charOffset",
                             "-".join([str(x) for x in charOffset]))
                    # A "--" would mean that a negative offset was produced
                    assert "--" not in span.get("charOffset"), [
                        str(x) for x in charOffset
                    ]
                    del span.attrib["offset"]
                    container.append(span)
Exemplo n.º 14
0
def getEntityHeadToken(entity, tokens, tokenHeadScores):
    """
    Choose the token that acts as the syntactic head of an entity.

    The candidate tokens are those overlapping the entity's "headOffset"
    when that attribute is present, otherwise its "charOffset" (an empty
    "charOffset" yields no candidates). A single candidate is returned
    directly; in every other case the choice is delegated to
    findHeadToken.

    @param entity: entity element with "headOffset" and/or "charOffset"
    @param tokens: token elements, each with a "charOffset" attribute
    @param tokenHeadScores: passed through to findHeadToken
    @return: the selected head token
    @raise AssertionError: if no head token could be selected; the message
                           is the entity's id
    """
    if entity.get("headOffset") is not None:
        charOffsets = Range.charOffsetToTuples(entity.get("headOffset"))
    elif entity.get("charOffset") != "":
        charOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
    else:
        charOffsets = []
    # Each entity can consist of multiple syntactic tokens, covered by its
    # charOffset-range. One of these must be chosen as the head token.
    headTokens = [] # potential head tokens
    for token in tokens:
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        for offset in charOffsets:
            if Range.overlap(offset, tokenOffset):
                headTokens.append(token)
    if len(headTokens) == 1: # An unambiguous head token was found
        selectedHeadToken = headTokens[0]
    else: # One head token must be chosen from the candidates
        selectedHeadToken = findHeadToken(headTokens, tokenHeadScores)
    # Report the id of the entity being processed. The message used to
    # reference an undefined name, which turned a failed assertion into a
    # NameError.
    assert selectedHeadToken is not None, entity.get("id")
    return selectedHeadToken
Exemplo n.º 15
0
def getEntityHeadToken(entity, tokens, tokenHeadScores):
    """
    Choose the token that acts as the syntactic head of an entity.

    The candidate tokens are those overlapping the entity's "headOffset"
    when that attribute is present, otherwise its "charOffset" (an empty
    "charOffset" yields no candidates). A single candidate is returned
    directly; in every other case the choice is delegated to
    findHeadToken.

    @param entity: entity element with "headOffset" and/or "charOffset"
    @param tokens: token elements, each with a "charOffset" attribute
    @param tokenHeadScores: passed through to findHeadToken
    @return: the selected head token
    @raise AssertionError: if no head token could be selected; the message
                           is the entity's id
    """
    if entity.get("headOffset") is not None:
        charOffsets = Range.charOffsetToTuples(entity.get("headOffset"))
    elif entity.get("charOffset") != "":
        charOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
    else:
        charOffsets = []
    # Each entity can consist of multiple syntactic tokens, covered by its
    # charOffset-range. One of these must be chosen as the head token.
    headTokens = []  # potential head tokens
    for token in tokens:
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        for offset in charOffsets:
            if Range.overlap(offset, tokenOffset):
                headTokens.append(token)
    if len(headTokens) == 1:  # An unambiguous head token was found
        selectedHeadToken = headTokens[0]
    else:  # One head token must be chosen from the candidates
        selectedHeadToken = findHeadToken(headTokens, tokenHeadScores)
    # Report the id of the entity being processed. The message used to
    # reference an undefined name, which turned a failed assertion into a
    # NameError.
    assert selectedHeadToken is not None, entity.get("id")
    return selectedHeadToken
Exemplo n.º 16
0
 def getMetaMapFeatures(self, token, sentenceGraph, features):
     """
     Fill the features dict with MetaMap information for one token.

     Nothing is added when the sentence element has no "analyses" child or
     that child has no "metamap" child. Otherwise, for each "phrase"
     overlapping the token: "score" is stored as "_metamap_score"
     (0.001 times its absolute integer value), "charOffset" and "text" are
     skipped, and each comma-separated value of any other attribute sets
     "_metamap_<attribute>_<value>" to 1, with spaces in the value replaced
     by "-".
     """
     analysesElement = sentenceGraph.sentenceElement.find("analyses")
     if analysesElement is None:
         return
     metamapElement = analysesElement.find("metamap")
     if metamapElement is None:
         return
     tokenRange = Range.charOffsetToSingleTuple(token.get("charOffset"))
     excluded = ("charOffset", "text")
     for phrase in metamapElement.findall("phrase"):
         phraseRange = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
         if not Range.overlap(tokenRange, phraseRange):
             continue
         phraseAttributes = phrase.attrib
         # Sorted so features are generated in a stable order
         for key in sorted(phraseAttributes.keys()):
             if key in excluded:
                 continue
             rawValue = phraseAttributes[key]
             if key == "score":
                 features["_metamap_score"] = 0.001 * abs(int(rawValue))
             else:
                 for item in rawValue.split(","):
                     features["_metamap_" + key + "_" + item.replace(" ", "-")] = 1
Exemplo n.º 17
0
def moveElements(document):
    """
    Move a document's entities and interactions into its sentence elements.

    Each direct "entity" child of document is moved to the first sentence
    that overlaps any of its character offsets. Its id is rewritten to be
    sentence-scoped, the old charOffset is preserved as "origOffset", and
    charOffset is rewritten relative to the sentence start. Each direct
    "interaction" child is then moved to the sentence of its e1 entity,
    renumbered, and its e1/e2/siteOf references are mapped to the new ids.

    @raise Exception: if some entity overlaps no sentence
    @raise AssertionError: if an entity ends up with an empty or inverted
                           sentence-relative offset
    @raise KeyError: if an interaction refers to an unknown entity, or its
                     "siteOf" refers to an unknown interaction id
    """
    entMap = {}  # original entity id -> new sentence-scoped id
    entSentence = {}  # original entity id -> sentence element it moved to
    entSentenceIndex = {}  # original entity id -> index of that sentence
    sentences = document.findall("sentence")
    sentenceCount = 0
    for sentence in sentences:
        sentenceOffset = Range.charOffsetToSingleTuple(
            sentence.get("charOffset"))
        # Move entities
        entCount = 0
        # Only entities still directly under the document are considered, so
        # an entity already moved to an earlier sentence is not seen again
        for entity in document.findall("entity"):
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            overlaps = False
            for entityOffset in entityOffsets:
                if Range.overlap(sentenceOffset, entityOffset):
                    overlaps = True
                    break
            if overlaps:
                document.remove(entity)
                sentence.append(entity)
                entityId = entity.get("id")
                entityIdLastPart = entityId.rsplit(".", 1)[-1]
                if entityIdLastPart.startswith("e"):
                    # Keep the existing "eN" suffix under the sentence id
                    entity.set("id",
                               sentence.get("id") + "." + entityIdLastPart)
                    entMap[entityId] = sentence.get(
                        "id") + "." + entityIdLastPart
                else:
                    # Foreign id scheme: keep the old id in "docId" and
                    # number the entity by its position in this sentence
                    entity.set("docId", entityId)
                    entity.set("id", sentence.get("id") + ".e" + str(entCount))
                    entMap[entityId] = sentence.get("id") + ".e" + str(
                        entCount)
                entSentence[entityId] = sentence
                entSentenceIndex[entityId] = sentenceCount
                newEntityOffsets = []
                for entityOffset in entityOffsets:
                    newOffset = (entityOffset[0] - sentenceOffset[0],
                                 entityOffset[1] - sentenceOffset[0])
                    # Clamp to the sentence start; a part lying entirely
                    # before the sentence collapses to (0, 0) and is dropped
                    newOffset = (max(0, newOffset[0]), max(0, newOffset[1]))
                    if newOffset != (0, 0):
                        assert newOffset[1] > newOffset[0], (entity.attrib,
                                                             entityOffsets,
                                                             sentenceOffset)
                        # NOTE(review): the stored tuple is the unclamped
                        # offset, so a part starting before the sentence
                        # keeps a negative begin - confirm this is intended
                        newEntityOffsets.append(
                            (entityOffset[0] - sentenceOffset[0],
                             entityOffset[1] - sentenceOffset[0]))
                assert len(newEntityOffsets) > 0, (entity.attrib,
                                                   entityOffsets,
                                                   sentenceOffset)
                entity.set("origOffset", entity.get("charOffset"))
                entity.set("charOffset",
                           Range.tuplesToCharOffset(newEntityOffsets))
                entCount += 1
        sentenceCount += 1
    # Any entity still directly under the document overlapped no sentence
    if len([x for x in document.findall("entity")]) != 0:
        raise Exception(
            "Sentence splitting does not cover the entire document")
    # Move interactions
    intCount = 0
    interactions = []
    interactionOldToNewId = {}
    for interaction in document.findall("interaction"):
        interactions.append(interaction)

        # Interactions go to a sentence always by e1, as this is the event they are an argument of.
        # If an intersentence interaction is a relation, this shouldn't matter.
        targetSentence = entSentence[interaction.get("e1")]
        document.remove(interaction)
        targetSentence.append(interaction)
        newId = targetSentence.get("id") + ".i" + str(intCount)
        # The old id must be read before it is overwritten below
        interactionOldToNewId[interaction.get("id")] = newId
        interaction.set("id", newId)
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
        intCount += 1
    # Second pass: "siteOf" may point at an interaction that is renamed later
    # in the loop above, so it is remapped once all new ids are known
    for interaction in interactions:
        if interaction.get("siteOf") != None:
            interaction.set("siteOf",
                            interactionOldToNewId[interaction.get("siteOf")])
Exemplo n.º 18
0
    def mapInteractions(self, entityElements, interactionElements, verbose=False):
        """
        Maps the semantic interactions to the syntactic graph.
        
        Syntactic dependencies are defined between tokens. Semantic edges (interactions)
        are defined between annotated entities. To utilize the correlation of the dependency
        parse with the semantic interactions, the graphs must be aligned by mapping the
        interaction graph's nodes (entities) to the syntactic graph's nodes (tokens). This
        is done by determining the head tokens of the entities.
        
        Note that entityElements is stored as self.entities without copying, so
        entities removed here (empty charOffset, or no head token found) are
        also removed from the caller's list.
        
        @param entityElements: the semantic nodes (triggers and named entities)
        @type entityElements: list of cElementTree.Element objects
        @param interactionElements: the semantic edges (e.g. Cause and Theme for GENIA)
        @type interactionElements: list of cElementTree.Element objects
        @param verbose: Print selected head tokens on screen
        @type verbose: boolean
        @raise Exception: if an entity has no head token and its charOffset does
                          not overlap the sentence text span
        """
        self.interactions = interactionElements
        self.entities = entityElements
        # Entities that have no text binding can not be mapped and are therefore removed
        # (iterating over a copy, because the list is modified in the loop)
        for entity in self.entities[:]:
            if entity.get("charOffset") == "":
                self.entities.remove(entity)
        # The interaction graph has the same nodes (tokens) as the syntactic graph
        self.interactionGraph = Graph()
        self.interactionGraph.addNodes(self.tokens)
        
        # NOTE(review): entitiesByToken is only initialised here; presumably
        # mapEntity fills it - confirm against mapEntity
        self.entitiesByToken = {} # a mapping for fast access
        self.entitiesById = {}
        self.entityHeadTokenByEntity = {}
        sentenceSpan = (0, len(self.sentenceElement.get("text"))) # for validating the entity offsets
        for entity in self.entities[:]:
            headToken = self.mapEntity(entity, verbose)
            if headToken != None:
                self.entityHeadTokenByEntity[entity] = headToken
                self.entitiesById[entity.get("id")] = entity
            else:
                # Check that the entity is within the sentence
                if not Range.overlap(Range.charOffsetToSingleTuple(entity.get("charOffset")), sentenceSpan):
                    raise Exception("Entity " + entity.get("id") + ", charOffset " + entity.get("charOffset") + ", does not overlap with sentence " + self.sentenceElement.get("id") + ", length " + str(sentenceSpan[1]) )
                # Assume there simply is no token corresponding to the entity
                self.entities.remove(entity)
        self._markNamedEntities()
        
        # Add one edge per interaction between the head tokens of its entities
        for interaction in self.interactions:
            if not self.entitiesById.has_key(interaction.get("e1")):
                continue # e1 is outside of this sentence
            if not self.entitiesById.has_key(interaction.get("e2")):
                continue # e2 is outside of this sentence
            token1 = self.entityHeadTokenByEntity[self.entitiesById[interaction.get("e1")]]
            token2 = self.entityHeadTokenByEntity[self.entitiesById[interaction.get("e2")]]
            
            # An interaction is a duplicate if an edge of the same type already
            # connects the same pair of head tokens; edge[2] is the interaction
            # element stored with the edge
            found = False
            edges = self.interactionGraph.getEdges(token1, token2)
            for edge in edges:
                if edge[2].get("type") == interaction.get("type"):
                    found = True
                    break
            if not found:
                self.interactionGraph.addEdge(token1, token2, interaction)
            else:
                # TODO: "skipped" would be better than "removed"
                self.duplicateInteractionEdgesRemoved += 1
Exemplo n.º 19
0
def moveElements(document):
    """
    Move the "entity" and "interaction" children of a document under its sentences.

    Every entity is re-parented to the first sentence whose charOffset overlaps
    one of the entity's charOffset ranges. The entity gets a sentence-based id,
    its charOffset is made relative to the beginning of the sentence and the
    original value is stored in the "origOffset" attribute. Every interaction
    is then moved to the sentence of its "e1" entity and renumbered, and its
    "e1", "e2" and "siteOf" references are rewritten to the new ids.

    @param document: the document element, modified in place
    @type document: ElementTree element with "sentence", "entity" and "interaction" children
    @raise Exception: if some entity does not overlap any sentence
    """
    entMap = {} # original entity id -> new, sentence-based entity id
    entSentence = {} # original entity id -> sentence element the entity was moved to
    entSentenceIndex = {} # original entity id -> index of that sentence in the document
    for sentenceCount, sentence in enumerate(document.findall("sentence")):
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        sentenceBegin = sentenceOffset[0]
        # Move entities
        entCount = 0
        # findall returns a list, so entities can be removed from the document while looping
        for entity in document.findall("entity"):
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            if not any(Range.overlap(sentenceOffset, entityOffset) for entityOffset in entityOffsets):
                continue
            document.remove(entity)
            sentence.append(entity)
            entityId = entity.get("id")
            entityIdLastPart = entityId.rsplit(".", 1)[-1]
            if entityIdLastPart.startswith("e"):
                # The last id part is reused under the sentence id
                newEntityId = sentence.get("id") + "." + entityIdLastPart
            else:
                # A new id is generated, so the original one is kept in "docId"
                entity.set("docId", entityId)
                newEntityId = sentence.get("id") + ".e" + str(entCount)
            entity.set("id", newEntityId)
            entMap[entityId] = newEntityId
            entSentence[entityId] = sentence
            entSentenceIndex[entityId] = sentenceCount
            # Convert the document-level offsets to sentence-level offsets
            newEntityOffsets = []
            for entityOffset in entityOffsets:
                # Clamp at the sentence beginning: a range that starts before the
                # sentence must not produce a negative offset
                newOffset = (max(0, entityOffset[0] - sentenceBegin), max(0, entityOffset[1] - sentenceBegin))
                if newOffset != (0, 0): # (0, 0) means the range lies entirely before the sentence
                    assert newOffset[1] > newOffset[0], (entity.attrib, entityOffsets, sentenceOffset)
                    # Store the clamped offset that was just validated
                    # NOTE(review): a negative begin presumably also breaks the "begin-end" string format - confirm against Range
                    newEntityOffsets.append(newOffset)
            assert len(newEntityOffsets) > 0, (entity.attrib, entityOffsets, sentenceOffset)
            entity.set("origOffset", entity.get("charOffset"))
            entity.set("charOffset", Range.tuplesToCharOffset(newEntityOffsets))
            entCount += 1
    if document.findall("entity"):
        raise Exception("Sentence splitting does not cover the entire document")
    # Move interactions
    interactions = document.findall("interaction")
    interactionOldToNewId = {}
    for intCount, interaction in enumerate(interactions):
        # Interactions go to a sentence always by e1, as this is the event they are an argument of.
        # If an intersentence interaction is a relation, this shouldn't matter.
        targetSentence = entSentence[interaction.get("e1")]
        document.remove(interaction)
        targetSentence.append(interaction)
        newId = targetSentence.get("id") + ".i" + str(intCount)
        interactionOldToNewId[interaction.get("id")] = newId
        interaction.set("id", newId)
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
    # The siteOf references can be updated only after all interactions have their new ids
    for interaction in interactions:
        siteOf = interaction.get("siteOf")
        if siteOf is not None:
            interaction.set("siteOf", interactionOldToNewId[siteOf])
Exemplo n.º 20
0
 def mapEntity(self, entityElement, verbose=False):
     """
     Determine the head token for a named entity or trigger. The head token is the token closest
     to the root for the subtree of the dependency parse spanned by the text of the element.

     As a side effect, the "headOffset" attribute of the element is set to the charOffset of
     the selected token, and the element is added to self.entitiesByToken for that token.

     @param entityElement: a semantic node (trigger or named entity)
     @type entityElement: cElementTree.Element
     @param verbose: Print selected head tokens on screen
     @type verbose: boolean
     @return: the selected head token, or None if no head token could be determined
     """
     headOffset = None
     if entityElement.get("headOffset") is not None:
         headOffset = Range.charOffsetToSingleTuple(entityElement.get("headOffset"))
     if entityElement.get("charOffset") != "":
         charOffsets = Range.charOffsetToTuples(entityElement.get("charOffset"))
     else:
         charOffsets = []
     isBinding = entityElement.get("type") == "Binding"
     # Each entity can consist of multiple syntactic tokens, covered by its
     # charOffset-range. One of these must be chosen as the head token.
     headTokens = [] # potential head tokens
     for token in self.tokens:
         tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
         if headOffset is not None and not isBinding:
             # A head token can already be defined in the headOffset-attribute.
             # However, depending on the tokenization, even this range may
             # contain multiple tokens. Still, it can always be assumed that
             # if headOffset is defined, the correct head token is in this range.
             if Range.overlap(headOffset, tokenOffset):
                 headTokens.append(token)
         else:
             # A token is added once for each charOffset range it overlaps
             for offset in charOffsets:
                 if Range.overlap(offset, tokenOffset):
                     headTokens.append(token)
     if len(headTokens) == 1: # An unambiguous head token was found
         token = headTokens[0]
     else: # One head token must be chosen from the candidates
         selHead = None
         if isBinding:
             # For Binding, the first candidate whose text contains a binding keyword is preferred
             for t in headTokens:
                 compText = t.get("text").lower()
                 if "bind" in compText or "complex" in compText:
                     selHead = t
                     entityElement.set("headOffset", selHead.get("charOffset"))
                     break
         if selHead is None:
             token = self.findHeadToken(headTokens)
         else:
             token = selHead
         # No token may have been found, which is reported by the warning below
         if verbose and token is not None:
             print >> sys.stderr, "Selected head:", token.get("id"), token.get("text")
     if token is not None:
         # The ElementTree entity-element is modified by setting the headOffset attribute
         if entityElement.get("headOffset") is None or entityElement.get("headOffset") != token.get("charOffset"):
             entityElement.set("headOffset", token.get("charOffset"))
         self.entitiesByToken.setdefault(token, []).append(entityElement)
     else:
         print >> sys.stderr, "Warning, no tokens for entity", entityElement.get("id")
     return token
Exemplo n.º 21
0
def extend(input, output=None, entityTypes=["Bacterium"], verbose=False):
    if not (ET.iselement(input) and input.tag == "sentence"):
        print >> sys.stderr, "Loading corpus file", input
        corpusTree = ETUtils.ETFromObj(input)
        corpusRoot = corpusTree.getroot()
    
    bacteriaTokens = ExampleBuilders.PhraseTriggerExampleBuilder.getBacteriaTokens()
    
    if not (ET.iselement(input) and input.tag == "sentence"):
        sentences = corpusRoot.getiterator("sentence")
    else:
        sentences = [input]
    counts = defaultdict(int)
    for sentence in sentences:
        incorrectCount = 0
        sentenceText = sentence.get("text")
        tokens = tokenize(sentenceText)
        for entity in sentence.findall("entity"):
            counts["all-entities"] += 1
            if entity.get("type") not in entityTypes:
                continue
            headOffset = entity.get("headOffset")
            if headOffset == None:
                if verbose: print "WARNING, no head offset for entity", entity.get("id")
                headOffset = entity.get("charOffset")
            headOffset = Range.charOffsetToTuples(headOffset)[0]
            charOffset = entity.get("charOffset")
            assert charOffset != None, "WARNING, no head offset for entity " + str(entity.get("id"))
            charOffset = Range.charOffsetToTuples(charOffset)[0]
            tokPos = [0,0]
            tokIndex = None
            # find main token
            for i in range(len(tokens)):
                token = tokens[i]
                tokPos[1] = tokPos[0] + len(token) # - 1
                if Range.overlap(headOffset, tokPos):
                    tokIndex = i
                    break
                tokPos[0] += len(token)
            assert tokIndex != None, (entity.get("id"), entity.get("text"), tokens)
            skip = False
            if tokPos[0] < headOffset[0]:
                tokPos = headOffset
                skip = True
            if not skip:
                # Extend before
                beginIndex = tokIndex
                for i in range(tokIndex-1, -1, -1):
                    token = tokens[i]
                    if token.isspace():
                        continue
                    if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
                        beginIndex = i + 1
                        break
                    if i == 0:
                        beginIndex = i
                while tokens[beginIndex].isspace() or isExtraWord(tokens[beginIndex], toLower=False):
                    beginIndex += 1
                    if beginIndex >= tokIndex:
                        beginIndex = tokIndex
                        break
                # Extend after
                endIndex = tokIndex
                if tokens[tokIndex][-1] != ",":
                    endIndex = tokIndex
                    for i in range(tokIndex+1, len(tokens)):
                        token = tokens[i]
                        if token.isspace():
                            continue
                        if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
                            endIndex = i - 1
                            break
                        if i == len(tokens) - 1:
                            endIndex = i
                    while tokens[endIndex].isspace():
                        endIndex -= 1
                # Modify range
                if tokIndex > beginIndex:
                    for token in reversed(tokens[beginIndex:tokIndex]):
                        tokPos[0] -= len(token)
                if tokIndex < endIndex:
                    for token in tokens[tokIndex+1:endIndex+1]:
                        tokPos[1] += len(token)
                # Attempt to remove trailing periods and commas
                while not sentenceText[tokPos[1] - 1].isalnum():
                    tokPos[1] -= 1
                    if tokPos[1] < tokPos[0] + 1:
                        tokPos[1] = tokPos[0] + 1
                        break
                while not sentenceText[tokPos[0]].isalnum():
                    tokPos[0] += 1
                    if tokPos[0] >= tokPos[1]:
                        tokPos[0] = tokPos[1] - 1
                        break
                # Split merged names
                #newPos = [tokPos[0], tokPos[1]]
                #for split in sentenceText[tokPos[0]:tokPos[1]+1].split("/"):
                #    newPos[0] += len(split)
                #    if                 
            # Insert changed charOffset
            counts["entities"] += 1
            newOffset = tuple(tokPos)
            newOffsetString = Range.tuplesToCharOffset([newOffset])
            if verbose:
                print "Entity", entity.get("id"), 
                #print [entity.get("text"), sentenceText[headOffset[0]:headOffset[1]+1], sentenceText[newOffset[0]:newOffset[1]+1]],
                print [entity.get("text"), sentenceText[headOffset[0]:headOffset[1]], sentenceText[newOffset[0]:newOffset[1]]], 
                print [entity.get("charOffset"), entity.get("headOffset"), newOffsetString], "Sent:", len(sentence.get("text")),
            if newOffset != headOffset:
                counts["extended"] += 1
                if verbose: print "EXTENDED",
            if newOffset == charOffset:
                counts["correct"] += 1
                if verbose: print "CORRECT"
            else:
                counts["incorrect"] += 1
                incorrectCount += 1
                if verbose: print "INCORRECT"
            entity.set("charOffset", newOffsetString)
            #entity.set("text", sentenceText[newOffset[0]:newOffset[1]+1])
            entity.set("text", sentenceText[newOffset[0]:newOffset[1]])
        if incorrectCount > 0 and verbose:
            print "TOKENS:", "|".join(tokens)
            print "--------------------------------"
    if verbose:
        print counts
    
    if not (ET.iselement(input) and input.tag == "sentence"):
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusRoot, output)
        return corpusTree                    
Exemplo n.º 22
0
def extend(input, output=None, entityTypes=["Bacterium"], verbose=False):
    if not (ET.iselement(input) and input.tag == "sentence"):
        print >> sys.stderr, "Loading corpus file", input
        corpusTree = ETUtils.ETFromObj(input)
        corpusRoot = corpusTree.getroot()

    bacteriaTokens = ExampleBuilders.PhraseTriggerExampleBuilder.getBacteriaTokens(
    )

    if not (ET.iselement(input) and input.tag == "sentence"):
        sentences = corpusRoot.getiterator("sentence")
    else:
        sentences = [input]
    counts = defaultdict(int)
    for sentence in sentences:
        incorrectCount = 0
        sentenceText = sentence.get("text")
        tokens = tokenize(sentenceText)
        for entity in sentence.findall("entity"):
            counts["all-entities"] += 1
            if entity.get("type") not in entityTypes:
                continue
            headOffset = entity.get("headOffset")
            if headOffset == None:
                if verbose:
                    print "WARNING, no head offset for entity", entity.get(
                        "id")
                headOffset = entity.get("charOffset")
            headOffset = Range.charOffsetToTuples(headOffset)[0]
            charOffset = entity.get("charOffset")
            assert charOffset != None, "WARNING, no head offset for entity " + str(
                entity.get("id"))
            charOffset = Range.charOffsetToTuples(charOffset)[0]
            tokPos = [0, 0]
            tokIndex = None
            # find main token
            for i in range(len(tokens)):
                token = tokens[i]
                tokPos[1] = tokPos[0] + len(token)  # - 1
                if Range.overlap(headOffset, tokPos):
                    tokIndex = i
                    break
                tokPos[0] += len(token)
            assert tokIndex != None, (entity.get("id"), entity.get("text"),
                                      tokens)
            skip = False
            if tokPos[0] < headOffset[0]:
                tokPos = headOffset
                skip = True
            if not skip:
                # Extend before
                beginIndex = tokIndex
                for i in range(tokIndex - 1, -1, -1):
                    token = tokens[i]
                    if token.isspace():
                        continue
                    if not isBacteriaToken(token, bacteriaTokens,
                                           i - tokIndex):
                        beginIndex = i + 1
                        break
                    if i == 0:
                        beginIndex = i
                while tokens[beginIndex].isspace() or isExtraWord(
                        tokens[beginIndex], toLower=False):
                    beginIndex += 1
                    if beginIndex >= tokIndex:
                        beginIndex = tokIndex
                        break
                # Extend after
                endIndex = tokIndex
                if tokens[tokIndex][-1] != ",":
                    endIndex = tokIndex
                    for i in range(tokIndex + 1, len(tokens)):
                        token = tokens[i]
                        if token.isspace():
                            continue
                        if not isBacteriaToken(token, bacteriaTokens,
                                               i - tokIndex):
                            endIndex = i - 1
                            break
                        if i == len(tokens) - 1:
                            endIndex = i
                    while tokens[endIndex].isspace():
                        endIndex -= 1
                # Modify range
                if tokIndex > beginIndex:
                    for token in reversed(tokens[beginIndex:tokIndex]):
                        tokPos[0] -= len(token)
                if tokIndex < endIndex:
                    for token in tokens[tokIndex + 1:endIndex + 1]:
                        tokPos[1] += len(token)
                # Attempt to remove trailing periods and commas
                while not sentenceText[tokPos[1] - 1].isalnum():
                    tokPos[1] -= 1
                    if tokPos[1] < tokPos[0] + 1:
                        tokPos[1] = tokPos[0] + 1
                        break
                while not sentenceText[tokPos[0]].isalnum():
                    tokPos[0] += 1
                    if tokPos[0] >= tokPos[1]:
                        tokPos[0] = tokPos[1] - 1
                        break
                # Split merged names
                #newPos = [tokPos[0], tokPos[1]]
                #for split in sentenceText[tokPos[0]:tokPos[1]+1].split("/"):
                #    newPos[0] += len(split)
                #    if
            # Insert changed charOffset
            counts["entities"] += 1
            newOffset = tuple(tokPos)
            newOffsetString = Range.tuplesToCharOffset([newOffset])
            if verbose:
                print "Entity", entity.get("id"),
                #print [entity.get("text"), sentenceText[headOffset[0]:headOffset[1]+1], sentenceText[newOffset[0]:newOffset[1]+1]],
                print[
                    entity.get("text"),
                    sentenceText[headOffset[0]:headOffset[1]],
                    sentenceText[newOffset[0]:newOffset[1]]
                ],
                print[
                    entity.get("charOffset"),
                    entity.get("headOffset"), newOffsetString
                ], "Sent:", len(sentence.get("text")),
            if newOffset != headOffset:
                counts["extended"] += 1
                if verbose: print "EXTENDED",
            if newOffset == charOffset:
                counts["correct"] += 1
                if verbose: print "CORRECT"
            else:
                counts["incorrect"] += 1
                incorrectCount += 1
                if verbose: print "INCORRECT"
            entity.set("charOffset", newOffsetString)
            #entity.set("text", sentenceText[newOffset[0]:newOffset[1]+1])
            entity.set("text", sentenceText[newOffset[0]:newOffset[1]])
        if incorrectCount > 0 and verbose:
            print "TOKENS:", "|".join(tokens)
            print "--------------------------------"
    if verbose:
        print counts

    if not (ET.iselement(input) and input.tag == "sentence"):
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusRoot, output)
        return corpusTree
Exemplo n.º 23
0
    def mapInteractions(self, entityElements, interactionElements, verbose=False):
        """
        Maps the semantic interactions to the syntactic graph.

        Syntactic dependencies are defined between tokens. Semantic edges (interactions)
        are defined between annotated entities. To utilize the correlation of the dependency
        parse with the semantic interactions, the graphs must be aligned by mapping the
        interaction graph's nodes (entities) to the syntactic graph's nodes (tokens). This
        is done by determining the head tokens of the entities.

        Duplicated interactions are skipped in this function. For all gold interactions between two tokens,
        it only keeps one interaction for each interaction type. Interactions with an endpoint
        that is not a mapped entity of this sentence are skipped as well.

        The entityElements list is stored as self.entities and modified in place: elements that
        have an empty charOffset, are not "entity" elements or have no head token are removed from it.

        @param entityElements: the semantic nodes (triggers and named entities)
        @type entityElements: list of cElementTree.Element objects
        @param interactionElements: the semantic edges (e.g. Cause and Theme for GENIA)
        @type interactionElements: list of cElementTree.Element objects
        @param verbose: Print selected head tokens on screen
        @type verbose: boolean
        @raise Exception: if an entity without a head token does not overlap the sentence text
        """
        self.interactions = interactionElements
        self.entities = entityElements
        # Entities that have no text binding can not be mapped and are therefore removed
        for entity in self.entities[:]:
            if entity.get("charOffset") == "":
                self.entities.remove(entity)
        self.interactionGraph = Graph()
        self.interactionGraph.addNodes(self.tokens)

        self.entitiesByToken = {}  # a mapping for fast access
        self.entitiesById = {}
        self.entityHeadTokenByEntity = {}
        sentenceSpan = (0, len(self.sentenceElement.get("text")))  # for validating the entity offsets
        # A copy of the list is iterated, because entities are removed inside the loop
        for entity in self.entities[:]:
            headToken = self.mapEntity(entity, verbose)
            if entity.tag != "entity":
                self.entities.remove(entity)
            elif headToken is not None:
                self.entityHeadTokenByEntity[entity] = headToken
                self.entitiesById[entity.get("id")] = entity
            else:
                # Check that the entity is within the sentence. The charOffset of an entity
                # may list several ranges, so all of them are parsed and checked.
                # NOTE(review): charOffsetToSingleTuple presumably rejects multi-range offsets - confirm against Range
                entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
                if not any(Range.overlap(offset, sentenceSpan) for offset in entityOffsets):
                    raise Exception("Entity " + entity.get("id") +
                                    ", charOffset " +
                                    entity.get("charOffset") +
                                    ", does not overlap with sentence " +
                                    self.sentenceElement.get("id") +
                                    ", length " + str(sentenceSpan[1]))
                # Assume there simply is no token corresponding to the entity
                self.entities.remove(entity)
        self._markNamedEntities()

        for interaction in self.interactions:
            e1Id = interaction.get("e1")
            if e1Id not in self.entitiesById:
                continue  # e1 is outside of this sentence
            e2Id = interaction.get("e2")
            if e2Id not in self.entitiesById:
                continue  # e2 is outside of this sentence
            token1 = self.entityHeadTokenByEntity[self.entitiesById[e1Id]]
            token2 = self.entityHeadTokenByEntity[self.entitiesById[e2Id]]

            # Only one edge of each interaction type is kept between a pair of tokens
            found = False
            for edge in self.interactionGraph.getEdges(token1, token2):
                # edge[2] is the interaction element that was given to addEdge
                if edge[2].get("type") == interaction.get("type"):
                    found = True
                    break
            if not found:
                self.interactionGraph.addEdge(token1, token2, interaction)
            else:
                # TODO: "skipped" would be better than "removed"
                self.duplicateInteractionEdgesRemoved += 1
Exemplo n.º 24
0
    def mapEntity(self, entityElement, verbose=False):
        """
        Determine the head token for a named entity or trigger. The head token is the token closest
        to the root for the subtree of the dependency parse spanned by the text of the element.

        As a side effect, the "headOffset" attribute of the element is set to the charOffset of
        the selected token, and the element is added to self.entitiesByToken for that token.

        @param entityElement: a semantic node (trigger or named entity)
        @type entityElement: cElementTree.Element
        @param verbose: Print selected head tokens on screen
        @type verbose: boolean
        @return: the selected head token, or None if no head token could be determined
        """
        headOffset = None
        if entityElement.get("headOffset") is not None:
            headOffset = Range.charOffsetToSingleTuple(
                entityElement.get("headOffset"))
        if entityElement.get("charOffset") != "":
            charOffsets = Range.charOffsetToTuples(
                entityElement.get("charOffset"))
        else:
            charOffsets = []
        isBinding = entityElement.get("type") == "Binding"
        # Each entity can consist of multiple syntactic tokens, covered by its
        # charOffset-range. One of these must be chosen as the head token.
        headTokens = []  # potential head tokens
        for token in self.tokens:
            tokenOffset = Range.charOffsetToSingleTuple(
                token.get("charOffset"))
            if headOffset is not None and not isBinding:
                # A head token can already be defined in the headOffset-attribute.
                # However, depending on the tokenization, even this range may
                # contain multiple tokens. Still, it can always be assumed that
                # if headOffset is defined, the correct head token is in this range.
                if Range.overlap(headOffset, tokenOffset):
                    headTokens.append(token)
            else:
                # A token is added once for each charOffset range it overlaps
                for offset in charOffsets:
                    if Range.overlap(offset, tokenOffset):
                        headTokens.append(token)
        if len(headTokens) == 1:  # An unambiguous head token was found
            token = headTokens[0]
        else:  # One head token must be chosen from the candidates
            selHead = None
            if isBinding:
                # For Binding, the first candidate whose text contains a binding keyword is preferred
                bindWords = ("bind", "complex", "homo", "hetero", "dimer")
                for t in headTokens:
                    compText = t.get("text").lower()
                    if any(bindWord in compText for bindWord in bindWords):
                        selHead = t
                        break
            if selHead is None:
                token = self.findHeadToken(headTokens)
            else:
                token = selHead
            # No token may have been found, which is reported by the warning below
            if verbose and token is not None:
                print >> sys.stderr, "Selected head:", token.get(
                    "id"), token.get("text")
        if token is not None:
            # The ElementTree entity-element is modified by setting the headOffset attribute
            if entityElement.get("headOffset") is None or entityElement.get(
                    "headOffset") != token.get("charOffset"):
                entityElement.set("headOffset", token.get("charOffset"))
            self.entitiesByToken.setdefault(token, []).append(entityElement)
        else:
            print >> sys.stderr, "Warning, no tokens for entity", entityElement.get(
                "id")
        return token