def _markNamedEntities(self):
    """
    Record, for every token, whether it belongs to an entity, whether it
    belongs to a _named_ entity, and which entities have it as their head.

    Named entities are sometimes masked when testing learning of
    interactions, to prevent the system making a trivial decision based
    on commonly interacting names. This function assumes that all given
    entities (attribute given="True") are named entities.

    Results are stored in self.tokenIsName and self.tokenIsEntity
    (token -> bool) and self.tokenIsEntityHead (token -> list of entities).
    """
    self.tokenIsName = {}
    self.tokenIsEntity = {}
    self.tokenIsEntityHead = {}
    # Every token starts out unmarked and heads no entity
    for tok in self.tokens:
        self.tokenIsName[tok] = False
        self.tokenIsEntity[tok] = False
        self.tokenIsEntityHead[tok] = []
    for ent in self.entities:
        spans = Range.charOffsetToTuples(ent.get("charOffset"))
        headSpan = Range.charOffsetToSingleTuple(ent.get("headOffset"))
        isGiven = ent.get("given") == "True"
        for tok in self.tokens:
            tokSpan = Range.charOffsetToSingleTuple(tok.get("charOffset"))
            for span in spans:
                if not Range.overlap(span, tokSpan):
                    continue
                self.tokenIsEntity[tok] = True
                if isGiven:
                    self.tokenIsName[tok] = True
            # The head is matched once per token, independently of the spans
            if Range.overlap(headSpan, tokSpan):
                self.tokenIsEntityHead[tok].append(ent)
def _markNamedEntities(self):
    """
    Record, for every token, whether it belongs to an entity, whether it
    belongs to a _named_ entity, and which entities have it as their head.

    Named entities are sometimes masked when testing learning of
    interactions, to prevent the system making a trivial decision based
    on commonly interacting names.

    An entity whose "isName" attribute is missing is treated as a name and
    the attribute is written back onto the entity element as "True".

    Results are stored in self.tokenIsName and self.tokenIsEntity
    (token -> bool) and self.tokenIsEntityHead (token -> list of entities).
    """
    self.tokenIsName = {}
    self.tokenIsEntity = {}
    self.tokenIsEntityHead = {}
    # Every token starts out unmarked and heads no entity
    for tok in self.tokens:
        self.tokenIsName[tok] = False
        self.tokenIsEntity[tok] = False
        self.tokenIsEntityHead[tok] = []
    for ent in self.entities:
        spans = Range.charOffsetToTuples(ent.get("charOffset"))
        headSpan = Range.charOffsetToSingleTuple(ent.get("headOffset"))
        for tok in self.tokens:
            tokSpan = Range.charOffsetToSingleTuple(tok.get("charOffset"))
            for span in spans:
                if not Range.overlap(span, tokSpan):
                    continue
                self.tokenIsEntity[tok] = True
                nameFlag = ent.get("isName")
                if nameFlag is None:
                    # Unannotated entities default to being names
                    ent.set("isName", "True")
                    self.tokenIsName[tok] = True
                elif nameFlag == "True":
                    self.tokenIsName[tok] = True
            # The head is matched once per token, independently of the spans
            if Range.overlap(headSpan, tokSpan):
                self.tokenIsEntityHead[tok].append(ent)
def getRelativePosition(self, entity1Range, entity2Range, token):
    """
    Classify where a token lies relative to two entity character ranges.

    Returns "Entity1" or "Entity2" when the token overlaps that entity
    (entity 1 is checked first). Otherwise the token's end offset is
    compared with the span covering both entities and "Fore", "After" or
    "Between" is returned.
    """
    tokenSpan = Range.charOffsetToSingleTuple(token.get("charOffset"))
    for label, entityRange in (("Entity1", entity1Range),
                               ("Entity2", entity2Range)):
        if Range.overlap(entityRange, tokenSpan):
            return label
    combinedStart = min(entity1Range[0], entity2Range[0])
    combinedEnd = max(entity1Range[1], entity2Range[1])
    tokenEnd = tokenSpan[1]
    if tokenEnd < combinedStart:
        return "Fore"
    if tokenEnd > combinedEnd:
        return "After"
    return "Between"
def getRelativePosition(self, entity1Range, entity2Range, token):
    """
    Classify where a token lies relative to two entity character ranges.

    Returns "Entity1" or "Entity2" when the token overlaps that entity
    (entity 1 is checked first). Otherwise the token's end offset is
    compared with the span covering both entities and "Fore", "After" or
    "Between" is returned.
    """
    tokenSpan = Range.charOffsetToSingleTuple(token.get("charOffset"))
    for label, entityRange in (("Entity1", entity1Range),
                               ("Entity2", entity2Range)):
        if Range.overlap(entityRange, tokenSpan):
            return label
    combinedStart = min(entity1Range[0], entity2Range[0])
    combinedEnd = max(entity1Range[1], entity2Range[1])
    tokenEnd = tokenSpan[1]
    if tokenEnd < combinedStart:
        return "Fore"
    if tokenEnd > combinedEnd:
        return "After"
    return "Between"
def getMetaMapFeatures(self, token, sentenceGraph, features):
    """
    Add features from the sentence's MetaMap analysis to `features`.

    The "analyses/metamap" element is looked up under
    sentenceGraph.sentenceElement; if either level is missing nothing is
    added. For every "phrase" whose charOffset overlaps the token, each
    attribute other than "charOffset" and "text" becomes a feature:
    "score" is stored as "_metamap_score" = 0.001 * abs(int(score)), any
    other attribute is split on commas and each value sets
    "_metamap_<attribute>_<value>" to 1 (spaces replaced by hyphens).
    """
    analyses = sentenceGraph.sentenceElement.find("analyses")
    if analyses is None:
        return
    metamap = analyses.find("metamap")
    if metamap is None:
        return
    tokenSpan = Range.charOffsetToSingleTuple(token.get("charOffset"))
    ignored = set(["charOffset", "text"])
    for phrase in metamap.findall("phrase"):
        phraseSpan = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        if not Range.overlap(tokenSpan, phraseSpan):
            continue
        attributes = phrase.attrib
        # Sorted so that features are generated in a stable order
        for name in sorted(attributes.keys()):
            if name in ignored:
                continue
            value = attributes[name]
            if name == "score":
                features["_metamap_score"] = 0.001 * abs(int(value))
                continue
            for part in value.split(","):
                features["_metamap_" + name + "_" + part.replace(" ", "-")] = 1
def insertElements(corpus, specAnn):
    """
    Move annotation spans into the sentences that fully contain them.

    For every "document" in corpus, the spans listed in
    specAnn[document origId] are matched against each sentence. A span
    whose "offset" lies completely inside the sentence is removed from
    specAnn, given a sentence-relative "charOffset" attribute, stripped of
    its "offset" attribute and appended to the sentence's
    "analyses/entities" element (both are created when missing). Spans that
    only partially overlap a sentence are left in specAnn.

    @param corpus: element containing the "document" elements
    @param specAnn: dict mapping document origId to a list of span
        elements; the lists are modified in place
    Raises AssertionError if a document has no entry in specAnn or if the
    span text differs from the sentence text at the computed offset.
    """
    for document in corpus.iter('document'):
        docId = document.get("origId")
        assert docId in specAnn, docId
        for sentence in document.iter('sentence'):
            sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
            # An Element without children is falsy, so existing-but-empty
            # containers must be detected with "is None"; a truth test
            # would create duplicate elements.
            analyses = sentence.find("analyses")
            if analyses is None:
                analyses = ET.SubElement(sentence, "analyses")
            container = analyses.find("entities")
            if container is None:
                container = ET.SubElement(analyses, "entities")
            # Map the spans (iterate over a copy, the list shrinks as we go)
            for span in specAnn[docId][:]:
                offset = span.get("offset")
                if not Range.overlap(offset, sentOffset):
                    continue
                if sentOffset[0] > offset[0] or sentOffset[1] < offset[1]:
                    continue  # span crosses the sentence boundary
                specAnn[docId].remove(span)
                charOffset = (offset[0] - sentOffset[0], offset[1] - sentOffset[0])
                matchingText = sentence.get("text")[charOffset[0]:charOffset[1]]
                spanText = span.get("text")
                assert matchingText == spanText, (matchingText, spanText, charOffset)
                span.set("charOffset", "-".join([str(x) for x in charOffset]))
                # A "--" would mean a negative offset was written
                assert not "--" in span.get("charOffset"), [str(x) for x in charOffset]
                del span.attrib["offset"]
                container.append(span)
def moveElements(document):
    """
    Move document-level entities and interactions into sentence elements.

    Each "entity" child of document is moved into the first sentence that
    any of its character spans overlaps. Its id becomes
    "<sentence id>.<last id part>" when the last dot-separated part of the
    old id starts with "e"; otherwise the old id is kept in "docId" and a
    running "<sentence id>.e<N>" id is assigned. The document-level offsets
    are saved in "origOffset" and "charOffset" is rewritten relative to the
    sentence start.

    Each "interaction" is then moved into the sentence of its e1 entity,
    renumbered "<sentence id>.i<N>", and its e1/e2 references are updated.
    """
    entMap = {}            # old entity id -> new entity id
    entSentence = {}       # old entity id -> sentence element
    entSentenceIndex = {}  # old entity id -> index of that sentence
    for sentenceIndex, sentence in enumerate(document.findall("sentence")):
        sentenceSpan = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        sentenceStart = sentenceSpan[0]
        # Move entities
        movedCount = 0
        for entity in document.findall("entity"):
            spans = Range.charOffsetToTuples(entity.get("charOffset"))
            if not any(Range.overlap(sentenceSpan, span) for span in spans):
                continue
            document.remove(entity)
            sentence.append(entity)
            oldId = entity.get("id")
            lastPart = oldId.rsplit(".", 1)[-1]
            if lastPart.startswith("e"):
                newId = sentence.get("id") + "." + lastPart
            else:
                entity.set("docId", oldId)
                newId = sentence.get("id") + ".e" + str(movedCount)
            entity.set("id", newId)
            entMap[oldId] = newId
            entSentence[oldId] = sentence
            entSentenceIndex[oldId] = sentenceIndex
            localSpans = [(span[0] - sentenceStart, span[1] - sentenceStart)
                          for span in spans]
            entity.set("origOffset", entity.get("charOffset"))
            entity.set("charOffset", Range.tuplesToCharOffset(localSpans))
            movedCount += 1
    # Move interactions
    # Interactions go to a sentence always by e1, as this is the event they
    # are an argument of. If an intersentence interaction is a relation,
    # this shouldn't matter.
    for index, interaction in enumerate(document.findall("interaction")):
        target = entSentence[interaction.get("e1")]
        document.remove(interaction)
        target.append(interaction)
        interaction.set("id", target.get("id") + ".i" + str(index))
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
def getTokens(self, entity, tokenTuples):
    """
    Return the texts of the consecutive tokens that overlap an entity.

    @param entity: element with a single-span "charOffset" attribute
    @param tokenTuples: sequence whose items hold the token offset at
        index 0 and the token element at index 1
    @return: list of token "text" values; scanning stops at the first
        non-overlapping token after a match
    """
    rawOffset = entity.get("charOffset")
    assert rawOffset is not None
    entitySpan = Range.charOffsetToSingleTuple(rawOffset)
    texts = []
    for pair in tokenTuples:
        if Range.overlap(entitySpan, pair[0]):
            texts.append(pair[1].get("text"))
            continue
        if texts:
            # passed the end of the entity
            break
    return texts
def markNamedEntities(self, entityElements):
    """
    Marks tokens belonging to named entities.

    The comma-separated "begin-end" spans in each entity's "charOffset"
    attribute are compared with the charOffset of every token in
    self.tokensById. For each overlapping (token, span) pair the entity id
    is appended to token.entities.

    @param entityElements: elements with "charOffset" and "id" attributes
    @return: list of token ids, one entry per overlapping (token, span) pair
    """
    namedEntityTokens = []
    for entityElement in entityElements:
        offsets = []
        for offsetString in entityElement.attrib["charOffset"].split(","):
            charFrom, charTo = offsetString.split("-")
            offsets.append((int(charFrom), int(charTo)))
        # Only the token objects are needed; values() also works on
        # Python 3, unlike iteritems()
        for token in self.tokensById.values():
            for offset in offsets:
                if Range.overlap(offset, token.charOffset):
                    token.entities.append(entityElement.attrib["id"])
                    namedEntityTokens.append(token.id)
    return namedEntityTokens
def markNamedEntities(self, entityElements):
    """
    Marks tokens belonging to named entities.

    The comma-separated "begin-end" spans in each entity's "charOffset"
    attribute are compared with the charOffset of every token in
    self.tokensById. For each overlapping (token, span) pair the entity id
    is appended to token.entities.

    @param entityElements: elements with "charOffset" and "id" attributes
    @return: list of token ids, one entry per overlapping (token, span) pair
    """
    namedEntityTokens = []
    for entityElement in entityElements:
        offsets = []
        for offsetString in entityElement.attrib["charOffset"].split(","):
            charFrom, charTo = offsetString.split("-")
            offsets.append((int(charFrom), int(charTo)))
        # Only the token objects are needed; values() also works on
        # Python 3, unlike iteritems()
        for token in self.tokensById.values():
            for offset in offsets:
                if Range.overlap(offset, token.charOffset):
                    token.entities.append(entityElement.attrib["id"])
                    namedEntityTokens.append(token.id)
    return namedEntityTokens
def moveElements(document):
    """
    Move document-level entities and interactions into sentence elements.

    Each "entity" child of document is moved into the first sentence its
    (single-span) charOffset overlaps. Its id becomes
    "<sentence id>.<last id part>" when the last dot-separated part of the
    old id starts with "e"; otherwise the old id is kept in "docId" and a
    running "<sentence id>.e<N>" id is assigned. The document-level offset
    is saved in "origOffset" and "charOffset" is rewritten relative to the
    sentence start.

    Each "interaction" is then moved into whichever of its e1/e2 sentences
    comes first (e2's sentence on a tie), renumbered "<sentence id>.i<N>",
    and its e1/e2 references are updated.
    """
    entMap = {}            # old entity id -> new entity id
    entSentence = {}       # old entity id -> sentence element
    entSentenceIndex = {}  # old entity id -> index of that sentence
    for sentenceIndex, sentence in enumerate(document.findall("sentence")):
        sentenceSpan = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        sentenceStart = sentenceSpan[0]
        # Move entities
        movedCount = 0
        for entity in document.findall("entity"):
            entitySpan = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            if not Range.overlap(sentenceSpan, entitySpan):
                continue
            document.remove(entity)
            sentence.append(entity)
            oldId = entity.get("id")
            lastPart = oldId.rsplit(".", 1)[-1]
            if lastPart.startswith("e"):
                newId = sentence.get("id") + "." + lastPart
            else:
                entity.set("docId", oldId)
                newId = sentence.get("id") + ".e" + str(movedCount)
            entity.set("id", newId)
            entMap[oldId] = newId
            entSentence[oldId] = sentence
            entSentenceIndex[oldId] = sentenceIndex
            localBegin = entitySpan[0] - sentenceStart
            localEnd = entitySpan[1] - sentenceStart
            entity.set("origOffset", entity.get("charOffset"))
            entity.set("charOffset", str(localBegin) + "-" + str(localEnd))
            movedCount += 1
    # Move interactions
    for index, interaction in enumerate(document.findall("interaction")):
        e1Index = entSentenceIndex[interaction.get("e1")]
        e2Index = entSentenceIndex[interaction.get("e2")]
        owner = "e1" if e1Index < e2Index else "e2"
        target = entSentence[interaction.get(owner)]
        document.remove(interaction)
        target.append(interaction)
        interaction.set("id", target.get("id") + ".i" + str(index))
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
def insertElements(corpus, specAnn):
    """
    Move annotation spans into the sentences that fully contain them.

    For every "document" in corpus, the spans listed in
    specAnn[document origId] are matched against each sentence. A span
    whose "offset" lies completely inside the sentence is removed from
    specAnn, given a sentence-relative "charOffset" attribute, stripped of
    its "offset" attribute and appended to the sentence's
    "analyses/entities" element (both are created when missing). Spans that
    only partially overlap a sentence are left in specAnn.

    @param corpus: element containing the "document" elements
    @param specAnn: dict mapping document origId to a list of span
        elements; the lists are modified in place
    Raises AssertionError if a document has no entry in specAnn or if the
    span text differs from the sentence text at the computed offset.
    """
    for document in corpus.iter('document'):
        docId = document.get("origId")
        assert docId in specAnn, docId
        for sentence in document.iter('sentence'):
            sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
            # An Element without children is falsy, so existing-but-empty
            # containers must be detected with "is None"; a truth test
            # would create duplicate elements.
            analyses = sentence.find("analyses")
            if analyses is None:
                analyses = ET.SubElement(sentence, "analyses")
            container = analyses.find("entities")
            if container is None:
                container = ET.SubElement(analyses, "entities")
            # Map the spans (iterate over a copy, the list shrinks as we go)
            for span in specAnn[docId][:]:
                offset = span.get("offset")
                if not Range.overlap(offset, sentOffset):
                    continue
                if sentOffset[0] > offset[0] or sentOffset[1] < offset[1]:
                    continue  # span crosses the sentence boundary
                specAnn[docId].remove(span)
                charOffset = (offset[0] - sentOffset[0], offset[1] - sentOffset[0])
                matchingText = sentence.get("text")[charOffset[0]:charOffset[1]]
                spanText = span.get("text")
                assert matchingText == spanText, (matchingText, spanText, charOffset)
                span.set("charOffset", "-".join([str(x) for x in charOffset]))
                # A "--" would mean a negative offset was written
                assert not "--" in span.get("charOffset"), [str(x) for x in charOffset]
                del span.attrib["offset"]
                container.append(span)
def getEntityHeadToken(entity, tokens, tokenHeadScores):
    """
    Select the head token of an entity.

    Candidate tokens are those overlapping the entity's "headOffset" spans
    or, when that attribute is missing, its "charOffset" spans (an empty
    charOffset gives no candidates). A single candidate is returned
    directly; otherwise findHeadToken chooses among the candidates using
    tokenHeadScores.

    @param entity: element with "headOffset" and/or "charOffset" attributes
    @param tokens: token elements with "charOffset" attributes
    @param tokenHeadScores: passed through to findHeadToken
    @return: the selected token element
    Raises AssertionError (carrying the entity id) if no head token is found.
    """
    if entity.get("headOffset") is not None:
        charOffsets = Range.charOffsetToTuples(entity.get("headOffset"))
    elif entity.get("charOffset") != "":
        charOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
    else:
        charOffsets = []
    # Each entity can consist of multiple syntactic tokens, covered by its
    # charOffset-range. One of these must be chosen as the head token.
    headTokens = []  # potential head tokens
    for token in tokens:
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        for offset in charOffsets:
            if Range.overlap(offset, tokenOffset):
                headTokens.append(token)
    if len(headTokens) == 1:
        # An unambiguous head token was found
        selectedHeadToken = headTokens[0]
    else:
        # One head token must be chosen from the candidates
        selectedHeadToken = findHeadToken(headTokens, tokenHeadScores)
    # The message uses the "entity" parameter; the undefined name used
    # previously turned a failed assertion into a NameError.
    assert selectedHeadToken is not None, entity.get("id")
    return selectedHeadToken
def getEntityHeadToken(entity, tokens, tokenHeadScores):
    """
    Select the head token of an entity.

    Candidate tokens are those overlapping the entity's "headOffset" spans
    or, when that attribute is missing, its "charOffset" spans (an empty
    charOffset gives no candidates). A single candidate is returned
    directly; otherwise findHeadToken chooses among the candidates using
    tokenHeadScores.

    @param entity: element with "headOffset" and/or "charOffset" attributes
    @param tokens: token elements with "charOffset" attributes
    @param tokenHeadScores: passed through to findHeadToken
    @return: the selected token element
    Raises AssertionError (carrying the entity id) if no head token is found.
    """
    if entity.get("headOffset") is not None:
        charOffsets = Range.charOffsetToTuples(entity.get("headOffset"))
    elif entity.get("charOffset") != "":
        charOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
    else:
        charOffsets = []
    # Each entity can consist of multiple syntactic tokens, covered by its
    # charOffset-range. One of these must be chosen as the head token.
    headTokens = []  # potential head tokens
    for token in tokens:
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        for offset in charOffsets:
            if Range.overlap(offset, tokenOffset):
                headTokens.append(token)
    if len(headTokens) == 1:
        # An unambiguous head token was found
        selectedHeadToken = headTokens[0]
    else:
        # One head token must be chosen from the candidates
        selectedHeadToken = findHeadToken(headTokens, tokenHeadScores)
    # The message uses the "entity" parameter; the undefined name used
    # previously turned a failed assertion into a NameError.
    assert selectedHeadToken is not None, entity.get("id")
    return selectedHeadToken
def getMetaMapFeatures(self, token, sentenceGraph, features):
    """
    Add features from the sentence's MetaMap analysis to `features`.

    The "analyses/metamap" element is looked up under
    sentenceGraph.sentenceElement; if either level is missing nothing is
    added. For every "phrase" whose charOffset overlaps the token, each
    attribute other than "charOffset" and "text" becomes a feature:
    "score" is stored as "_metamap_score" = 0.001 * abs(int(score)), any
    other attribute is split on commas and each value sets
    "_metamap_<attribute>_<value>" to 1 (spaces replaced by hyphens).
    """
    analyses = sentenceGraph.sentenceElement.find("analyses")
    if analyses is None:
        return
    metamap = analyses.find("metamap")
    if metamap is None:
        return
    tokenSpan = Range.charOffsetToSingleTuple(token.get("charOffset"))
    ignored = set(["charOffset", "text"])
    for phrase in metamap.findall("phrase"):
        phraseSpan = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        if not Range.overlap(tokenSpan, phraseSpan):
            continue
        attributes = phrase.attrib
        # Sorted so that features are generated in a stable order
        for name in sorted(attributes.keys()):
            if name in ignored:
                continue
            value = attributes[name]
            if name == "score":
                features["_metamap_score"] = 0.001 * abs(int(value))
                continue
            for part in value.split(","):
                features["_metamap_" + name + "_" + part.replace(" ", "-")] = 1
def moveElements(document):
    """
    Move document-level entities and interactions into sentence elements.

    Each "entity" child of document is moved into the first sentence that
    any of its character spans overlaps. Its id becomes
    "<sentence id>.<last id part>" when the last dot-separated part of the
    old id starts with "e"; otherwise the old id is kept in "docId" and a
    running "<sentence id>.e<N>" id is assigned. The document-level offsets
    are saved in "origOffset" and "charOffset" is rewritten relative to the
    sentence start, clamped so that no offset is negative. Spans that clamp
    to (0, 0) are dropped.

    Each "interaction" is then moved into the sentence of its e1 entity,
    renumbered "<sentence id>.i<N>", its e1/e2 references are updated and,
    once all interactions have new ids, "siteOf" references are remapped.

    Raises Exception if an entity overlaps no sentence, AssertionError if a
    converted span is empty or an entity is left without spans, and KeyError
    if an interaction refers to an unknown entity or interaction id.
    """
    entMap = {}       # old entity id -> new entity id
    entSentence = {}  # old entity id -> sentence element it now lives in
    for sentence in document.findall("sentence"):
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        sentenceId = sentence.get("id")
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            if not any(Range.overlap(sentenceOffset, entityOffset)
                       for entityOffset in entityOffsets):
                continue
            document.remove(entity)
            sentence.append(entity)
            entityId = entity.get("id")
            entityIdLastPart = entityId.rsplit(".", 1)[-1]
            if entityIdLastPart.startswith("e"):
                newId = sentenceId + "." + entityIdLastPart
            else:
                entity.set("docId", entityId)
                newId = sentenceId + ".e" + str(entCount)
            entity.set("id", newId)
            entMap[entityId] = newId
            entSentence[entityId] = sentence
            newEntityOffsets = []
            for entityOffset in entityOffsets:
                newOffset = (max(0, entityOffset[0] - sentenceOffset[0]),
                             max(0, entityOffset[1] - sentenceOffset[0]))
                if newOffset == (0, 0):
                    # NOTE(review): presumably a span lying before this
                    # sentence; it is left out of the local offsets
                    continue
                assert newOffset[1] > newOffset[0], (entity.attrib, entityOffsets, sentenceOffset)
                # Store the clamped span that was just validated. Storing
                # the raw difference could write a negative start such as
                # "-5-10", which the "-"-separated format cannot express.
                newEntityOffsets.append(newOffset)
            assert len(newEntityOffsets) > 0, (entity.attrib, entityOffsets, sentenceOffset)
            entity.set("origOffset", entity.get("charOffset"))
            entity.set("charOffset", Range.tuplesToCharOffset(newEntityOffsets))
            entCount += 1
    # Any entity still attached to the document overlapped no sentence
    if document.find("entity") is not None:
        raise Exception("Sentence splitting does not cover the entire document")
    # Move interactions
    interactions = []
    interactionOldToNewId = {}
    for intCount, interaction in enumerate(document.findall("interaction")):
        interactions.append(interaction)
        # Interactions go to a sentence always by e1, as this is the event
        # they are an argument of. If an intersentence interaction is a
        # relation, this shouldn't matter.
        targetSentence = entSentence[interaction.get("e1")]
        document.remove(interaction)
        targetSentence.append(interaction)
        newId = targetSentence.get("id") + ".i" + str(intCount)
        interactionOldToNewId[interaction.get("id")] = newId
        interaction.set("id", newId)
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
    # siteOf refers to another interaction, so it can only be remapped
    # after every interaction has received its new id
    for interaction in interactions:
        if interaction.get("siteOf") is not None:
            interaction.set("siteOf", interactionOldToNewId[interaction.get("siteOf")])
def mapInteractions(self, entityElements, interactionElements, verbose=False):
    """
    Maps the semantic interactions to the syntactic graph.

    Syntactic dependencies are defined between tokens. Semantic edges
    (interactions) are defined between annotated entities. To utilize the
    correlation of the dependency parse with the semantic interactions,
    the graphs must be aligned by mapping the interaction graph's nodes
    (entities) to the syntactic graph's nodes (tokens). This is done by
    determining the head tokens of the entities.

    Entities with an empty charOffset, or for which no head token is
    found, are removed from entityElements in place. An interaction whose
    head-token pair already has an edge of the same type is not added
    again; self.duplicateInteractionEdgesRemoved is incremented instead.

    @param entityElements: the semantic nodes (triggers and named entities)
    @type entityElements: list of cElementTree.Element objects
    @param interactionElements: the semantic edges (e.g. Cause and Theme for GENIA)
    @type interactionElements: list of cElementTree.Element objects
    @param verbose: Print selected head tokens on screen
    @type verbose: boolean
    Raises Exception if an entity without a head token does not overlap
    the sentence text span.
    """
    self.interactions = interactionElements
    self.entities = entityElements
    # Entities that have no text binding can not be mapped and are therefore removed
    for entity in self.entities[:]:
        if entity.get("charOffset") == "":
            self.entities.remove(entity)
    self.interactionGraph = Graph()
    self.interactionGraph.addNodes(self.tokens)
    self.entitiesByToken = {}  # a mapping for fast access
    self.entitiesById = {}
    self.entityHeadTokenByEntity = {}
    sentenceSpan = (0, len(self.sentenceElement.get("text")))  # for validating the entity offsets
    for entity in self.entities[:]:
        headToken = self.mapEntity(entity, verbose)
        if headToken is not None:
            self.entityHeadTokenByEntity[entity] = headToken
            self.entitiesById[entity.get("id")] = entity
        else:
            # Check that the entity is within the sentence
            if not Range.overlap(Range.charOffsetToSingleTuple(entity.get("charOffset")), sentenceSpan):
                raise Exception("Entity " + entity.get("id") +
                                ", charOffset " + entity.get("charOffset") +
                                ", does not overlap with sentence " + self.sentenceElement.get("id") +
                                ", length " + str(sentenceSpan[1]))
            # Assume there simply is no token corresponding to the entity
            self.entities.remove(entity)
    self._markNamedEntities()

    for interaction in self.interactions:
        if interaction.get("e1") not in self.entitiesById:
            continue  # e1 is outside of this sentence
        if interaction.get("e2") not in self.entitiesById:
            continue  # e2 is outside of this sentence
        token1 = self.entityHeadTokenByEntity[self.entitiesById[interaction.get("e1")]]
        token2 = self.entityHeadTokenByEntity[self.entitiesById[interaction.get("e2")]]
        # Only one edge per interaction type is kept between two tokens
        found = False
        for edge in self.interactionGraph.getEdges(token1, token2):
            if edge[2].get("type") == interaction.get("type"):
                found = True
                break
        if not found:
            self.interactionGraph.addEdge(token1, token2, interaction)
        else:
            # TODO: "skipped" would be better than "removed"
            self.duplicateInteractionEdgesRemoved += 1
def moveElements(document):
    """
    Move document-level entities and interactions into sentence elements.

    Each "entity" child of document is moved into the first sentence that
    any of its character spans overlaps. Its id becomes
    "<sentence id>.<last id part>" when the last dot-separated part of the
    old id starts with "e"; otherwise the old id is kept in "docId" and a
    running "<sentence id>.e<N>" id is assigned. The document-level offsets
    are saved in "origOffset" and "charOffset" is rewritten relative to the
    sentence start, clamped so that no offset is negative. Spans that clamp
    to (0, 0) are dropped.

    Each "interaction" is then moved into the sentence of its e1 entity,
    renumbered "<sentence id>.i<N>", its e1/e2 references are updated and,
    once all interactions have new ids, "siteOf" references are remapped.

    Raises Exception if an entity overlaps no sentence, AssertionError if a
    converted span is empty or an entity is left without spans, and KeyError
    if an interaction refers to an unknown entity or interaction id.
    """
    entMap = {}       # old entity id -> new entity id
    entSentence = {}  # old entity id -> sentence element it now lives in
    for sentence in document.findall("sentence"):
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        sentenceId = sentence.get("id")
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            if not any(Range.overlap(sentenceOffset, entityOffset)
                       for entityOffset in entityOffsets):
                continue
            document.remove(entity)
            sentence.append(entity)
            entityId = entity.get("id")
            entityIdLastPart = entityId.rsplit(".", 1)[-1]
            if entityIdLastPart.startswith("e"):
                newId = sentenceId + "." + entityIdLastPart
            else:
                entity.set("docId", entityId)
                newId = sentenceId + ".e" + str(entCount)
            entity.set("id", newId)
            entMap[entityId] = newId
            entSentence[entityId] = sentence
            newEntityOffsets = []
            for entityOffset in entityOffsets:
                newOffset = (max(0, entityOffset[0] - sentenceOffset[0]),
                             max(0, entityOffset[1] - sentenceOffset[0]))
                if newOffset == (0, 0):
                    # NOTE(review): presumably a span lying before this
                    # sentence; it is left out of the local offsets
                    continue
                assert newOffset[1] > newOffset[0], (entity.attrib, entityOffsets, sentenceOffset)
                # Store the clamped span that was just validated. Storing
                # the raw difference could write a negative start such as
                # "-5-10", which the "-"-separated format cannot express.
                newEntityOffsets.append(newOffset)
            assert len(newEntityOffsets) > 0, (entity.attrib, entityOffsets, sentenceOffset)
            entity.set("origOffset", entity.get("charOffset"))
            entity.set("charOffset", Range.tuplesToCharOffset(newEntityOffsets))
            entCount += 1
    # Any entity still attached to the document overlapped no sentence
    if document.find("entity") is not None:
        raise Exception("Sentence splitting does not cover the entire document")
    # Move interactions
    interactions = []
    interactionOldToNewId = {}
    for intCount, interaction in enumerate(document.findall("interaction")):
        interactions.append(interaction)
        # Interactions go to a sentence always by e1, as this is the event
        # they are an argument of. If an intersentence interaction is a
        # relation, this shouldn't matter.
        targetSentence = entSentence[interaction.get("e1")]
        document.remove(interaction)
        targetSentence.append(interaction)
        newId = targetSentence.get("id") + ".i" + str(intCount)
        interactionOldToNewId[interaction.get("id")] = newId
        interaction.set("id", newId)
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
    # siteOf refers to another interaction, so it can only be remapped
    # after every interaction has received its new id
    for interaction in interactions:
        if interaction.get("siteOf") is not None:
            interaction.set("siteOf", interactionOldToNewId[interaction.get("siteOf")])
def mapEntity(self, entityElement, verbose=False):
    """
    Determine the head token for a named entity or trigger. The head token
    is the token closest to the root for the subtree of the dependency
    parse spanned by the text of the element.

    When a head token is found, the entity's "headOffset" attribute is set
    to that token's charOffset and the entity is registered in
    self.entitiesByToken. Otherwise a warning is printed to stderr.

    @param entityElement: a semantic node (trigger or named entity)
    @type entityElement: cElementTree.Element
    @param verbose: Print selected head tokens on screen
    @type verbose: boolean
    @return: the head token element, or None if no token matches the entity
    """
    headOffset = None
    if entityElement.get("headOffset") is not None:
        headOffset = Range.charOffsetToSingleTuple(entityElement.get("headOffset"))
    if entityElement.get("charOffset") != "":
        charOffsets = Range.charOffsetToTuples(entityElement.get("charOffset"))
    else:
        charOffsets = []
    isBinding = entityElement.get("type") == "Binding"
    # Each entity can consist of multiple syntactic tokens, covered by its
    # charOffset-range. One of these must be chosen as the head token.
    headTokens = []  # potential head tokens
    for token in self.tokens:
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        if headOffset is not None and not isBinding:
            # A head token can already be defined in the headOffset-attribute.
            # However, depending on the tokenization, even this range may
            # contain multiple tokens. Still, it can always be assumed that
            # if headOffset is defined, the correct head token is in this range.
            if Range.overlap(headOffset, tokenOffset):
                headTokens.append(token)
        else:
            for offset in charOffsets:
                if Range.overlap(offset, tokenOffset):
                    headTokens.append(token)
    if len(headTokens) == 1:
        # An unambiguous head token was found
        token = headTokens[0]
    else:
        # One head token must be chosen from the candidates
        selHead = None
        if isBinding:
            # For Binding entities, prefer the first candidate whose text
            # contains "bind" or "complex"
            for t in headTokens:
                compText = t.get("text").lower()
                if "bind" in compText or "complex" in compText:
                    selHead = t
                    entityElement.set("headOffset", selHead.get("charOffset"))
                    break
        if selHead is None:
            token = self.findHeadToken(headTokens)
        else:
            token = selHead
        # Guarded so that verbose mode does not fail on a missing token
        # before the warning below can be printed
        if verbose and token is not None:
            print >> sys.stderr, "Selected head:", token.get("id"), token.get("text")
    if token is not None:
        # The ElementTree entity-element is modified by setting the headOffset attribute
        if entityElement.get("headOffset") is None or entityElement.get("headOffset") != token.get("charOffset"):
            entityElement.set("headOffset", token.get("charOffset"))
        self.entitiesByToken.setdefault(token, []).append(entityElement)
    else:
        print >> sys.stderr, "Warning, no tokens for entity", entityElement.get("id")
    return token
def extend(input, output=None, entityTypes=["Bacterium"], verbose=False):
    """
    Extend the character offsets of entities from their head token to the
    neighbouring tokens accepted by isBacteriaToken.

    @param input: either a single "sentence" element, which is processed in
        place, or anything ETUtils.ETFromObj can load as a corpus
    @param output: if not None and input was a corpus, the corpus root is
        written here with ETUtils.write
    @param entityTypes: only entities whose "type" is in this list are
        processed (the default list is shared between calls and is only read)
    @param verbose: print per-entity diagnostics and the final counts
    @return: the corpus tree when input was a corpus; nothing when input was
        a single sentence element

    For each processed entity the "charOffset" and "text" attributes are
    overwritten with the extended span. The new span is compared with the
    entity's previous charOffset to fill the "correct"/"incorrect" counts.
    """
    if not (ET.iselement(input) and input.tag == "sentence"):
        print >> sys.stderr, "Loading corpus file", input
        corpusTree = ETUtils.ETFromObj(input)
        corpusRoot = corpusTree.getroot()

    bacteriaTokens = ExampleBuilders.PhraseTriggerExampleBuilder.getBacteriaTokens()

    if not (ET.iselement(input) and input.tag == "sentence"):
        sentences = corpusRoot.getiterator("sentence")
    else:
        sentences = [input]
    counts = defaultdict(int)
    for sentence in sentences:
        incorrectCount = 0
        sentenceText = sentence.get("text")
        tokens = tokenize(sentenceText)
        for entity in sentence.findall("entity"):
            counts["all-entities"] += 1
            if entity.get("type") not in entityTypes:
                continue
            # Fall back to the full charOffset when no head is annotated
            headOffset = entity.get("headOffset")
            if headOffset == None:
                if verbose: print "WARNING, no head offset for entity", entity.get("id")
                headOffset = entity.get("charOffset")
            headOffset = Range.charOffsetToTuples(headOffset)[0]
            charOffset = entity.get("charOffset")
            assert charOffset != None, "WARNING, no head offset for entity " + str(entity.get("id"))
            charOffset = Range.charOffsetToTuples(charOffset)[0]
            # tokPos tracks the [begin, end] character position of the
            # current token; the arithmetic assumes the token lengths add
            # up to positions in sentenceText (TODO confirm in tokenize)
            tokPos = [0,0]
            tokIndex = None
            # find main token
            for i in range(len(tokens)):
                token = tokens[i]
                tokPos[1] = tokPos[0] + len(token) # - 1
                if Range.overlap(headOffset, tokPos):
                    tokIndex = i
                    break
                tokPos[0] += len(token)
            assert tokIndex != None, (entity.get("id"), entity.get("text"), tokens)
            # If the head starts in the middle of the token, keep the head
            # offset as it is and do not extend
            skip = False
            if tokPos[0] < headOffset[0]:
                tokPos = headOffset
                skip = True
            if not skip:
                # Extend before
                beginIndex = tokIndex
                for i in range(tokIndex-1, -1, -1):
                    token = tokens[i]
                    if token.isspace():
                        continue
                    if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
                        beginIndex = i + 1
                        break
                    if i == 0:
                        beginIndex = i
                # Drop leading whitespace and extra words from the extension
                while tokens[beginIndex].isspace() or isExtraWord(tokens[beginIndex], toLower=False):
                    beginIndex += 1
                    if beginIndex >= tokIndex:
                        beginIndex = tokIndex
                        break
                # Extend after
                endIndex = tokIndex
                # A head token ending in a comma is never extended forwards
                if tokens[tokIndex][-1] != ",":
                    endIndex = tokIndex
                    for i in range(tokIndex+1, len(tokens)):
                        token = tokens[i]
                        if token.isspace():
                            continue
                        if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
                            endIndex = i - 1
                            break
                        if i == len(tokens) - 1:
                            endIndex = i
                    while tokens[endIndex].isspace():
                        endIndex -= 1
                # Modify range
                if tokIndex > beginIndex:
                    for token in reversed(tokens[beginIndex:tokIndex]):
                        tokPos[0] -= len(token)
                if tokIndex < endIndex:
                    for token in tokens[tokIndex+1:endIndex+1]:
                        tokPos[1] += len(token)
                # Attempt to remove trailing periods and commas
                while not sentenceText[tokPos[1] - 1].isalnum():
                    tokPos[1] -= 1
                    if tokPos[1] < tokPos[0] + 1:
                        tokPos[1] = tokPos[0] + 1
                        break
                # Likewise strip leading non-alphanumeric characters
                while not sentenceText[tokPos[0]].isalnum():
                    tokPos[0] += 1
                    if tokPos[0] >= tokPos[1]:
                        tokPos[0] = tokPos[1] - 1
                        break
                # Split merged names
                #newPos = [tokPos[0], tokPos[1]]
                #for split in sentenceText[tokPos[0]:tokPos[1]+1].split("/"):
                #    newPos[0] += len(split)
                #    if
            # Insert changed charOffset
            counts["entities"] += 1
            newOffset = tuple(tokPos)
            newOffsetString = Range.tuplesToCharOffset([newOffset])
            if verbose:
                print "Entity", entity.get("id"),
                #print [entity.get("text"), sentenceText[headOffset[0]:headOffset[1]+1], sentenceText[newOffset[0]:newOffset[1]+1]],
                print [entity.get("text"), sentenceText[headOffset[0]:headOffset[1]], sentenceText[newOffset[0]:newOffset[1]]],
                print [entity.get("charOffset"), entity.get("headOffset"), newOffsetString], "Sent:", len(sentence.get("text")),
            if newOffset != headOffset:
                counts["extended"] += 1
                if verbose: print "EXTENDED",
            # The entity's previous charOffset serves as the reference span
            if newOffset == charOffset:
                counts["correct"] += 1
                if verbose: print "CORRECT"
            else:
                counts["incorrect"] += 1
                incorrectCount += 1
                if verbose: print "INCORRECT"
            entity.set("charOffset", newOffsetString)
            #entity.set("text", sentenceText[newOffset[0]:newOffset[1]+1])
            entity.set("text", sentenceText[newOffset[0]:newOffset[1]])
        if incorrectCount > 0 and verbose:
            print "TOKENS:", "|".join(tokens)
            print "--------------------------------"
    if verbose:
        print counts

    # Only a loaded corpus is written out and returned; a single sentence
    # element has already been modified in place
    if not (ET.iselement(input) and input.tag == "sentence"):
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusRoot, output)
        return corpusTree
def extend(input, output=None, entityTypes=["Bacterium"], verbose=False): if not (ET.iselement(input) and input.tag == "sentence"): print >> sys.stderr, "Loading corpus file", input corpusTree = ETUtils.ETFromObj(input) corpusRoot = corpusTree.getroot() bacteriaTokens = ExampleBuilders.PhraseTriggerExampleBuilder.getBacteriaTokens( ) if not (ET.iselement(input) and input.tag == "sentence"): sentences = corpusRoot.getiterator("sentence") else: sentences = [input] counts = defaultdict(int) for sentence in sentences: incorrectCount = 0 sentenceText = sentence.get("text") tokens = tokenize(sentenceText) for entity in sentence.findall("entity"): counts["all-entities"] += 1 if entity.get("type") not in entityTypes: continue headOffset = entity.get("headOffset") if headOffset == None: if verbose: print "WARNING, no head offset for entity", entity.get( "id") headOffset = entity.get("charOffset") headOffset = Range.charOffsetToTuples(headOffset)[0] charOffset = entity.get("charOffset") assert charOffset != None, "WARNING, no head offset for entity " + str( entity.get("id")) charOffset = Range.charOffsetToTuples(charOffset)[0] tokPos = [0, 0] tokIndex = None # find main token for i in range(len(tokens)): token = tokens[i] tokPos[1] = tokPos[0] + len(token) # - 1 if Range.overlap(headOffset, tokPos): tokIndex = i break tokPos[0] += len(token) assert tokIndex != None, (entity.get("id"), entity.get("text"), tokens) skip = False if tokPos[0] < headOffset[0]: tokPos = headOffset skip = True if not skip: # Extend before beginIndex = tokIndex for i in range(tokIndex - 1, -1, -1): token = tokens[i] if token.isspace(): continue if not isBacteriaToken(token, bacteriaTokens, i - tokIndex): beginIndex = i + 1 break if i == 0: beginIndex = i while tokens[beginIndex].isspace() or isExtraWord( tokens[beginIndex], toLower=False): beginIndex += 1 if beginIndex >= tokIndex: beginIndex = tokIndex break # Extend after endIndex = tokIndex if tokens[tokIndex][-1] != ",": endIndex = tokIndex for i 
in range(tokIndex + 1, len(tokens)): token = tokens[i] if token.isspace(): continue if not isBacteriaToken(token, bacteriaTokens, i - tokIndex): endIndex = i - 1 break if i == len(tokens) - 1: endIndex = i while tokens[endIndex].isspace(): endIndex -= 1 # Modify range if tokIndex > beginIndex: for token in reversed(tokens[beginIndex:tokIndex]): tokPos[0] -= len(token) if tokIndex < endIndex: for token in tokens[tokIndex + 1:endIndex + 1]: tokPos[1] += len(token) # Attempt to remove trailing periods and commas while not sentenceText[tokPos[1] - 1].isalnum(): tokPos[1] -= 1 if tokPos[1] < tokPos[0] + 1: tokPos[1] = tokPos[0] + 1 break while not sentenceText[tokPos[0]].isalnum(): tokPos[0] += 1 if tokPos[0] >= tokPos[1]: tokPos[0] = tokPos[1] - 1 break # Split merged names #newPos = [tokPos[0], tokPos[1]] #for split in sentenceText[tokPos[0]:tokPos[1]+1].split("/"): # newPos[0] += len(split) # if # Insert changed charOffset counts["entities"] += 1 newOffset = tuple(tokPos) newOffsetString = Range.tuplesToCharOffset([newOffset]) if verbose: print "Entity", entity.get("id"), #print [entity.get("text"), sentenceText[headOffset[0]:headOffset[1]+1], sentenceText[newOffset[0]:newOffset[1]+1]], print[ entity.get("text"), sentenceText[headOffset[0]:headOffset[1]], sentenceText[newOffset[0]:newOffset[1]] ], print[ entity.get("charOffset"), entity.get("headOffset"), newOffsetString ], "Sent:", len(sentence.get("text")), if newOffset != headOffset: counts["extended"] += 1 if verbose: print "EXTENDED", if newOffset == charOffset: counts["correct"] += 1 if verbose: print "CORRECT" else: counts["incorrect"] += 1 incorrectCount += 1 if verbose: print "INCORRECT" entity.set("charOffset", newOffsetString) #entity.set("text", sentenceText[newOffset[0]:newOffset[1]+1]) entity.set("text", sentenceText[newOffset[0]:newOffset[1]]) if incorrectCount > 0 and verbose: print "TOKENS:", "|".join(tokens) print "--------------------------------" if verbose: print counts if not 
(ET.iselement(input) and input.tag == "sentence"): if output != None: print >> sys.stderr, "Writing output to", output ETUtils.write(corpusRoot, output) return corpusTree
def mapInteractions(self, entityElements, interactionElements, verbose=False):
    """
    Align the semantic interaction graph with the syntactic token graph.

    Dependencies connect tokens, whereas interactions connect annotated
    entities. To relate the two, each entity is mapped to its head token
    (see mapEntity) and every interaction becomes an edge between the head
    tokens of its two entities.

    The list given as entityElements is stored as self.entities and is
    modified in place: entities with an empty charOffset, elements whose tag
    is not "entity" and entities for which no head token is found are
    removed from it.

    Interactions whose e1 or e2 is not among the mapped entities are
    ignored. Duplicates are skipped as well: for each pair of tokens only
    one interaction of each type is kept, and every skipped duplicate
    increments self.duplicateInteractionEdgesRemoved.

    @param entityElements: the semantic nodes (triggers and named entities)
    @type entityElements: list of cElementTree.Element objects
    @param interactionElements: the semantic edges (e.g. Cause and Theme for GENIA)
    @type interactionElements: list of cElementTree.Element objects
    @param verbose: Print selected head tokens on screen
    @type verbose: boolean
    @raise Exception: if an entity has no head token and its charOffset does
                      not overlap the span of the sentence text
    """
    self.interactions = interactionElements
    self.entities = entityElements
    # An entity without a text binding can not be mapped, so it is dropped.
    for unbound in self.entities[:]:
        if unbound.get("charOffset") == "":
            self.entities.remove(unbound)

    self.interactionGraph = Graph()
    self.interactionGraph.addNodes(self.tokens)

    self.entitiesByToken = {}  # a mapping for fast access
    self.entitiesById = {}
    self.entityHeadTokenByEntity = {}
    # The span of the sentence text, used for validating the entity offsets
    sentenceSpan = (0, len(self.sentenceElement.get("text")))
    for entity in self.entities[:]:
        headToken = self.mapEntity(entity, verbose)
        if entity.tag != "entity":
            self.entities.remove(entity)
            continue
        if headToken is not None:
            self.entityHeadTokenByEntity[entity] = headToken
            self.entitiesById[entity.get("id")] = entity
            continue
        # No head token: the entity must at least lie within the sentence
        entitySpan = Range.charOffsetToSingleTuple(entity.get("charOffset"))
        if not Range.overlap(entitySpan, sentenceSpan):
            raise Exception("Entity " + entity.get("id") + ", charOffset " +
                            entity.get("charOffset") +
                            ", does not overlap with sentence " +
                            self.sentenceElement.get("id") + ", length " +
                            str(sentenceSpan[1]))
        # Assume there simply is no token corresponding to the entity
        self.entities.remove(entity)
    self._markNamedEntities()

    for interaction in self.interactions:
        e1Id = interaction.get("e1")
        e2Id = interaction.get("e2")
        # Both end points must be entities of this sentence
        if e1Id not in self.entitiesById or e2Id not in self.entitiesById:
            continue
        token1 = self.entityHeadTokenByEntity[self.entitiesById[e1Id]]
        token2 = self.entityHeadTokenByEntity[self.entitiesById[e2Id]]

        # Look for an existing edge of the same type between the two tokens
        interactionType = interaction.get("type")
        isDuplicate = False
        for existingEdge in self.interactionGraph.getEdges(token1, token2):
            if existingEdge[2].get("type") == interactionType:
                isDuplicate = True
                break
        if isDuplicate:
            # TODO: "skipped" would be better than "removed"
            self.duplicateInteractionEdgesRemoved += 1
        else:
            self.interactionGraph.addEdge(token1, token2, interaction)
def mapEntity(self, entityElement, verbose=False):
    """
    Determine the head token for a named entity or trigger.

    The candidate head tokens are the tokens overlapping the entity's
    headOffset, or, when no headOffset is defined or the entity type is
    "Binding", the tokens overlapping any of its charOffset ranges. A single
    candidate is used directly. Otherwise, for "Binding" entities the first
    candidate whose text contains a binding-related keyword is preferred,
    and in all remaining cases the choice is made by self.findHeadToken.

    When a head token is found, the headOffset attribute of the entity
    element is set to the charOffset of that token and the entity is added
    to self.entitiesByToken. When none is found, a warning is printed to
    stderr.

    @param entityElement: a semantic node (trigger or named entity)
    @type entityElement: cElementTree.Element
    @param verbose: Print selected head tokens on screen
    @type verbose: boolean
    @return: the head token, or None if no token was found for the entity
    """
    headOffset = None
    if entityElement.get("headOffset") is not None:
        headOffset = Range.charOffsetToSingleTuple(entityElement.get("headOffset"))
    if entityElement.get("charOffset") != "":
        charOffsets = Range.charOffsetToTuples(entityElement.get("charOffset"))
    else:
        charOffsets = []
    isBinding = entityElement.get("type") == "Binding"

    # Each entity can consist of multiple syntactic tokens, covered by its
    # charOffset-range. One of these must be chosen as the head token.
    headTokens = []  # potential head tokens
    for token in self.tokens:
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        if headOffset is not None and not isBinding:
            # A head token can already be defined in the headOffset-attribute.
            # However, depending on the tokenization, even this range may
            # contain multiple tokens. Still, it can always be assumed that
            # if headOffset is defined, the correct head token is in this range.
            if Range.overlap(headOffset, tokenOffset):
                headTokens.append(token)
        else:
            for offset in charOffsets:
                if Range.overlap(offset, tokenOffset):
                    headTokens.append(token)

    if len(headTokens) == 1:
        # An unambiguous head token was found
        token = headTokens[0]
    else:
        # One head token must be chosen from the candidates
        selHead = None
        if isBinding:
            # Prefer the first candidate that looks like a binding word
            bindWords = ("bind", "complex", "homo", "hetero", "dimer")
            for candidate in headTokens:
                compText = candidate.get("text").lower()
                if any(bindWord in compText for bindWord in bindWords):
                    selHead = candidate
                    break
        if selHead is None:
            token = self.findHeadToken(headTokens)
        else:
            token = selHead
        # The token may be None here (handled below), so it is only
        # reported when one was actually selected.
        if verbose and token is not None:
            print >> sys.stderr, "Selected head:", token.get("id"), token.get("text")

    if token is not None:
        # The ElementTree entity-element is modified by setting the headOffset attribute
        if entityElement.get("headOffset") is None or entityElement.get("headOffset") != token.get("charOffset"):
            entityElement.set("headOffset", token.get("charOffset"))
        self.entitiesByToken.setdefault(token, []).append(entityElement)
    else:
        print >> sys.stderr, "Warning, no tokens for entity", entityElement.get("id")
    return token