def makeINSubPhrases(phrases, tokens, phraseDict, filter=None):
    """For each phrase containing a preposition ("IN") token, register a
    truncated sub-phrase that ends just before that token.

    Mutates phraseDict (offset tuple -> [phrase]) in place and returns the
    list of newly created phrase elements.
    """
    newPhrases = []
    for phrase in phrases:
        # Optional whitelist of phrase types.
        if filter != None and phrase.get("type") not in filter:
            continue
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        prevToken = None
        for tokCount, token in enumerate(tokens[phraseBegin:phraseEnd + 1]):
            if token.get("POS") == "IN" and prevToken != None:
                # New span: phrase start .. end of the token before the "IN".
                prevTokenEnd = Range.charOffsetToSingleTuple(prevToken.get("charOffset"))[-1]
                newPhraseOffset = (phraseOffset[0], prevTokenEnd)
                newPhrase = makePhrase(phrase.get("type") + "-IN",
                                       newPhraseOffset,
                                       phraseBegin,
                                       phraseBegin + tokCount - 1)
                if not phraseDict.has_key(newPhraseOffset):
                    #print "NEW PHRASE:", ETUtils.toStr(newPhrase)
                    newPhrases.append(newPhrase)
                    phraseDict[newPhraseOffset] = [newPhrase]
            prevToken = token
    return newPhrases
def _markNamedEntities(self):
    """
    This method is used to define which tokens belong to _named_ entities.
    Named entities are sometimes masked when testing learning of interactions, to
    prevent the system making a trivial decision based on commonly interacting names.
    """
    # token element -> bool / list of entities; filled below.
    self.tokenIsName = {}
    self.tokenIsEntity = {}
    self.tokenIsEntityHead = {}
    # Initialize the dictionaries
    for token in self.tokens:
        self.tokenIsName[token] = False
        self.tokenIsEntity[token] = False
        self.tokenIsEntityHead[token] = []
    for entity in self.entities:
        entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
        entityHeadOffset = Range.charOffsetToSingleTuple(entity.get("headOffset"))
        for token in self.tokens:
            tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            for entityOffset in entityOffsets:
                if Range.overlap(entityOffset, tokenOffset):
                    self.tokenIsEntity[token] = True
                    if entity.get("isName") != None:
                        if entity.get("isName") == "True":
                            self.tokenIsName[token] = True
                    else:
                        # NOTE(review): entities lacking an "isName" attribute
                        # are mutated here and treated as names — side effect
                        # on the input XML; confirm this is intended.
                        entity.set("isName", "True")
                        self.tokenIsName[token] = True
            # Head overlap is tracked separately from full-span overlap.
            if Range.overlap(entityHeadOffset, tokenOffset):
                self.tokenIsEntityHead[token].append(entity)
def getMetaMapFeatures(self, token, sentenceGraph, features):
    """Add MetaMap-derived features for phrases overlapping *token*.

    Reads the sentence's <analyses>/<metamap> element (if present) and, for
    every metamap phrase overlapping the token's character span, turns the
    phrase attributes into features: "score" becomes a scaled numeric
    feature, every other attribute yields binary features per value.
    Mutates *features* in place.
    """
    analyses = sentenceGraph.sentenceElement.find("analyses")
    if analyses == None:
        return
    metamap = analyses.find("metamap")
    if metamap == None:
        return
    tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
    ignored = set(["charOffset", "text"])
    for phrase in metamap.findall("phrase"):
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        if not Range.overlap(tokenOffset, phraseOffset):
            continue
        for name in sorted(phrase.attrib.keys()):
            if name in ignored:
                continue
            value = phrase.attrib[name]
            if name == "score":
                # Scale the integer score into a small positive weight.
                features["_metamap_score"] = 0.001 * abs(int(value))
            else:
                # Comma-separated attribute values become binary features.
                for item in value.split(","):
                    features["_metamap_" + name + "_" + item.replace(" ", "-")] = 1
def _markNamedEntities(self):
    """
    This method is used to define which tokens belong to _named_ entities.
    Named entities are sometimes masked when testing learning of interactions, to
    prevent the system making a trivial decision based on commonly interacting names.
    This function assumes that all given entities are named entities.
    """
    # token element -> bool / list of entities; filled below.
    self.tokenIsName = {}
    self.tokenIsEntity = {}
    self.tokenIsEntityHead = {}
    # Initialize the dictionaries
    for token in self.tokens:
        self.tokenIsName[token] = False
        self.tokenIsEntity[token] = False
        self.tokenIsEntityHead[token] = []
    for entity in self.entities:
        entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
        entityHeadOffset = Range.charOffsetToSingleTuple(entity.get("headOffset"))
        for token in self.tokens:
            tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            for entityOffset in entityOffsets:
                if Range.overlap(entityOffset, tokenOffset):
                    self.tokenIsEntity[token] = True
                    # Only entities explicitly marked "given" count as names;
                    # the older mutating behavior is kept below for reference.
                    if entity.get("given") == "True":
                        self.tokenIsName[token] = True
                    # if entity.get("given") != None:
                    #     if entity.get("given") == "True":
                    #         self.tokenIsName[token] = True
                    # else:
                    #     entity.set("given", "True")
                    #     self.tokenIsName[token] = True
            # Head overlap is tracked separately from full-span overlap.
            if Range.overlap(entityHeadOffset, tokenOffset):
                self.tokenIsEntityHead[token].append(entity)
def getHeads(corpus):
    """Count head-word strings per entity type in *corpus*.

    Returns {entityType: {headText: count}}, with a "None" pseudo-type that
    counts tokens which are not the head of any entity.
    """
    corpus = ETUtils.ETFromObj(corpus)
    headDict = {}
    headDict["None"] = {}
    for sentence in corpus.getiterator("sentence"):
        headOffsetStrings = set()
        for entity in sentence.findall("entity"):
            eType = entity.get("type")
            if not headDict.has_key(eType):
                headDict[eType] = {}
            eText = entity.get("text")
            headOffset = entity.get("headOffset")
            headOffsetStrings.add(headOffset)
            headOffset = Range.charOffsetToSingleTuple(headOffset)
            charOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            if headOffset == charOffset:
                headText = eText
            else:
                # BUG FIX: original referenced an undefined 'sentenceText';
                # the slice is relative to the entity's own span, so it must
                # cut the head out of the entity's text.
                headText = eText[headOffset[0] - charOffset[0]:headOffset[1] - charOffset[0] + 1]
            if not headDict[eType].has_key(headText):
                headDict[eType][headText] = 0
            headDict[eType][headText] += 1
        # BUG FIX: original referenced an undefined 'tokens'; iterate the
        # sentence's token elements instead.
        # NOTE(review): assumes token elements live under the sentence —
        # confirm against the corpus tokenization layout.
        for token in sentence.getiterator("token"):
            if not token.get("charOffset") in headOffsetStrings:  # token is not the head of any entity
                headText = token.get("text")
                if not headDict["None"].has_key(headText):
                    headDict["None"][headText] = 0
                headDict["None"][headText] += 1
    return headDict
def selectBestMatch(entity, phrases):
    """Return the phrase whose character span best matches the entity's.

    The entity's altOffset is preferred over charOffset when present; a
    smaller Range.mismatch value wins, first phrase breaking ties. Returns
    None when *phrases* is empty.
    """
    offsetString = entity.get("altOffset")
    if offsetString == None:
        offsetString = entity.get("charOffset")
    entOffset = Range.charOffsetToSingleTuple(offsetString)
    bestValue = sys.maxint
    bestPhrase = None
    for candidate in phrases:
        candOffset = Range.charOffsetToSingleTuple(candidate.get("charOffset"))
        score = Range.mismatch(entOffset, candOffset)
        if score < bestValue:
            bestValue = score
            bestPhrase = candidate
    return bestPhrase
def exportChemProtPredictions(xml, outPath, fileTypes="predictions", setNames=None):
    """Export a ChemProt interaction XML corpus into tab-separated files.

    fileTypes: "all", a comma-separated string, or a list drawn from
    {"predictions", "abstracts", "entities", "relations"}. One output file
    per (set, fileType) is opened lazily via openOutFile; setNames optionally
    remaps document set names. Returns the parsed XML tree.
    """
    if fileTypes == "all":
        fileTypes = ["predictions", "abstracts", "entities", "relations"]
    elif isinstance(fileTypes, basestring):
        fileTypes = fileTypes.split(",")
    for fileType in fileTypes:
        if fileType not in ["predictions", "abstracts", "entities", "relations"]:
            raise Exception("Unknown ChemProt file type '" + str(fileType) + "'")
    xml = ETUtils.ETFromObj(xml)
    #with open(outPath, "wt") as f
    outFiles = {}
    openFiles = {}
    for document in xml.getiterator("document"):
        docId = document.get("origId")
        setName = document.get("set")
        if setNames != None:
            # Optional renaming of corpus subsets (e.g. "devel" -> "dev").
            setName = setNames.get(setName, setName)
        if setName not in outFiles:
            outFiles[setName] = {}
        outFile = openOutFile(setName, outPath, "abstracts", fileTypes, outFiles, openFiles)
        if outFile != None:
            docText = document.get("text")
            #assert docText.count("\t") == 1, (docText.count("\t"), document.attrib)
            #title, abstract = docText.split("\t")
            #titleLength = document.get("titleLength")
            # The title is the document prefix up to titleOffset; the rest
            # (after the separator character) is the abstract.
            titleOffset = Range.charOffsetToSingleTuple(document.get("titleOffset"))
            assert titleOffset[0] == 0
            outFile.write("\t".join([docId, docText[:titleOffset[1]], docText[titleOffset[1]+1:]]) + "\n")
        entityById = {}
        for entity in document.getiterator("entity"):
            outFile = openOutFile(setName, outPath, "entities", fileTypes, outFiles, openFiles)
            if outFile != None:
                eType = entity.get("type")
                # GENE entities carry their normalization flag as a -Y/-N
                # suffix on the type string.
                if entity.get("normalized") != None and entity.get("type") == "GENE":
                    eType += "-Y" if entity.get("normalized") == "True" else "-N"
                offset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
                outFile.write("\t".join([docId, entity.get("origId"), eType, str(offset[0]), str(offset[1]), entity.get("text")]) + "\n")
            assert entity.get("id") not in entityById
            entityById[entity.get("id")] = entity
        for interaction in document.getiterator("interaction"):
            e1 = entityById[interaction.get("e1")]
            e2 = entityById[interaction.get("e2")]
            outFile = openOutFile(setName, outPath, "relations", fileTypes, outFiles, openFiles)
            if outFile != None:
                evaluated = "X"
                if interaction.get("evaluated") != None:
                    evaluated = "Y " if interaction.get("evaluated") == "True" else "N "
                outFile.write("\t".join([docId, interaction.get("type"), evaluated, interaction.get("relType"), "Arg1:" + e1.get("origId"), "Arg2:" + e2.get("origId")]) + "\n")
            outFile = openOutFile(setName, outPath, "predictions", fileTypes, outFiles, openFiles)
            if outFile != None:
                outFile.write("\t".join([docId, interaction.get("type"), "Arg1:" + e1.get("origId"), "Arg2:" + e2.get("origId")]) + "\n")
    print >> sys.stderr, "Closing output files"
    for f in openFiles.values():
        f.close()
    return xml
def getNECounts(phrases, entities):
    """Return {phrase: count of given (named) entities fully contained in
    the phrase's character span}."""
    # Precompute the spans of named ("given") entities once.
    nameOffsets = []
    for entity in entities:
        if entity.get("given") == "True":  # only check names
            nameOffsets.append(Range.charOffsetToSingleTuple(entity.get("charOffset")))
    counts = {}
    for phrase in phrases:
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        total = 0
        for nameOffset in nameOffsets:
            if Range.contains(phraseOffset, nameOffset):
                total += 1
        counts[phrase] = total
    return counts
def selectBestMatch(entity, phrases):
    """Pick the phrase with the smallest Range.mismatch against the entity
    span (altOffset preferred over charOffset when present).

    Returns None when no phrase beats the sys.maxint sentinel.
    """
    entOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
    if entity.get("altOffset") != None:
        entOffset = Range.charOffsetToSingleTuple(entity.get("altOffset"))
    bestScore, bestPhrase = sys.maxint, None
    for candidate in phrases:
        candOffset = Range.charOffsetToSingleTuple(candidate.get("charOffset"))
        score = Range.mismatch(entOffset, candOffset)
        if score < bestScore:
            bestScore, bestPhrase = score, candidate
    return bestPhrase
def getNECounts(phrases, entities):
    """Return a {phrase: count} mapping of how many given (named) entities
    each phrase's character span fully contains."""
    counts = {}
    for phrase in phrases:
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        counts[phrase] = 0
        for entity in entities:
            if entity.get("given") != "True":  # only check names
                continue
            if Range.contains(
                    phraseOffset,
                    Range.charOffsetToSingleTuple(entity.get("charOffset"))):
                counts[phrase] += 1
    return counts
def getMatchingPhrases(entity, phraseOffsets, phraseDict):
    """Collect phrases whose span is contained in the entity's full span and
    contains its minimal (altOffset) span.

    Named ("isName") entities match nothing. phraseDict maps offset tuples
    to lists of phrase elements.
    """
    if entity.get("isName") == "True":
        return []
    maxOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
    altOffsetString = entity.get("altOffset")
    if altOffsetString == None:
        minOffset = maxOffset
    else:
        minOffset = Range.charOffsetToSingleTuple(altOffsetString)
    matches = []
    for candidate in phraseOffsets:
        if Range.contains(maxOffset, candidate) and Range.contains(candidate, minOffset):
            matches.extend(phraseDict[candidate])
    return matches
def getMatchingPhrases(entity, phraseOffsets, phraseDict):
    """Return phrases whose span is contained in the entity's charOffset and
    contains its altOffset (charOffset itself when no altOffset exists).

    Named ("isName") entities match nothing.
    """
    matches = []
    if entity.get("isName") == "True":
        return []
    maxOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
    minOffset = entity.get("altOffset")
    if minOffset != None:
        minOffset = Range.charOffsetToSingleTuple(minOffset)
    else:
        minOffset = maxOffset
    for phraseOffset in phraseOffsets:
        # Candidate must lie between the minimal and maximal entity spans.
        if Range.contains(maxOffset, phraseOffset) and Range.contains(
                phraseOffset, minOffset):
            matches.extend(phraseDict[phraseOffset])
    return matches
def fixAltOffsets(input, output=None):
    """Convert every entity altOffset from document-relative to
    sentence-relative coordinates.

    input: corpus XML (path, tree or root element); output: optional path to
    write the fixed corpus to. Returns the corpus tree.
    """
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence ("+sentence.get("id")+"): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i]
                # Shift from document coordinates to sentence coordinates.
                altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1
    print >> sys.stderr, "Fixed", fixCount, "altOffsets"
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def prepareTokens(self, tokens):
    """Pair each token element with its (begin, end) character offset tuple."""
    return [(Range.charOffsetToSingleTuple(t.get("charOffset")), t) for t in tokens]
def insertElements(corpus, specAnn):
    """Move externally produced annotation spans into their sentences.

    specAnn: {document origId: [span elements with "offset" and "text"]}.
    Spans fully inside a sentence are removed from the list, given a
    sentence-relative charOffset and appended to the sentence's
    <analyses>/<entities> container (created on demand).
    NOTE(review): "offset" is assumed to hold a (begin, end) pair usable in
    arithmetic — confirm the producer of specAnn.
    """
    for document in corpus.iter('document'):
        docId = document.get("origId")
        assert docId in specAnn, docId
        for sentence in document.iter('sentence'):
            sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
            analyses = sentence.find("analyses")
            # BUG FIX: "if not analyses" is also True for an existing element
            # with no children (ElementTree truthiness) and would create a
            # duplicate <analyses>; test for None explicitly.
            if analyses is None:
                analyses = ET.SubElement(sentence, "analyses")
            # Find the container
            container = analyses.find("entities")
            # BUG FIX: same None-vs-falsy issue as above.
            if container is None:
                container = ET.SubElement(analyses, "entities")
            # Map the spans
            for span in specAnn[docId][:]:
                offset = span.get("offset")
                if Range.overlap(offset, sentOffset):
                    # Only spans fully inside this sentence are moved.
                    if sentOffset[0] > offset[0] or sentOffset[1] < offset[1]:
                        continue
                    specAnn[docId].remove(span)
                    charOffset = (offset[0] - sentOffset[0], offset[1] - sentOffset[0])
                    matchingText = sentence.get("text")[charOffset[0]:charOffset[1]]
                    spanText = span.get("text")
                    assert matchingText == spanText, (matchingText, spanText, charOffset)
                    span.set("charOffset", "-".join([str(x) for x in charOffset]))
                    assert not "--" in span.get("charOffset"), [str(x) for x in charOffset]
                    del span.attrib["offset"]
                    container.append(span)
def addSentence(self, sentenceGraph):
    """Accumulate token-distance statistics for the interactions and events
    of one sentence into this object's span counters."""
    if sentenceGraph == None:
        return
    # Tokens sorted by character offset; list index = linear token position.
    tokens = sorted([(Range.charOffsetToSingleTuple(x.get("charOffset")), x) for x in sentenceGraph.tokens])
    indexByTokenId = {tokens[i][1].get("id"):i for i in range(len(tokens))}
    assert len(indexByTokenId) == len(tokens) # check that there were no duplicate ids
    entityById = {x.get("id"):x for x in sentenceGraph.entities}
    events = {}
    for interaction in sentenceGraph.interactions:
        e1Id = interaction.get("e1")
        e2Id = interaction.get("e2")
        e1 = entityById[e1Id]
        e2 = entityById[e2Id]
        t1 = sentenceGraph.entityHeadTokenByEntity[e1]
        t2 = sentenceGraph.entityHeadTokenByEntity[e2]
        index1 = indexByTokenId[t1.get("id")]
        index2 = indexByTokenId[t2.get("id")]
        # Interaction span = distance in tokens between the two head tokens.
        intSpan = abs(index1 - index2)
        self.interactionSpans[intSpan] = self.interactionSpans.get(intSpan, 0) + 1
        # NOTE(review): .get() returns None for a missing key and
        # min(None, x) is None in Python 2 — assumes self.intSpan (and
        # self.eventSpan below) are pre-initialized elsewhere; confirm.
        self.intSpan["min"] = min(self.intSpan.get("min"), intSpan)
        self.intSpan["max"] = max(self.intSpan.get("max"), intSpan)
        if interaction.get("event") == "True":
            # Track extreme argument positions per event, keyed by e1.
            if e1Id not in events:
                events[e1Id] = {"min":9999, "max":-9999}
            events[e1Id]["min"] = min(events[e1Id]["min"], index1, index2)
            events[e1Id]["max"] = max(events[e1Id]["max"], index1, index2)
    for eventId in sorted(events.keys()):
        # Event span = distance between its furthest-apart arguments.
        eventSpan = events[eventId]["max"] - events[eventId]["min"]
        self.eventSpans[eventSpan] = self.eventSpans.get(eventSpan, 0) + 1
        self.eventSpan["min"] = min(self.eventSpan.get("min"), eventSpan)
        self.eventSpan["max"] = max(self.eventSpan.get("max"), eventSpan)
def getPatterns(self, e1, e2):
    """Build word-pattern dictionaries from the tokens before, between and
    after the two entity spans.

    Returns (patternForeBetween, patternBetween, patternBetweenAfter).
    Tokens belonging to named entities are skipped; the previous-token
    context resets whenever the positional region changes.
    """
    e1Range = Range.charOffsetToSingleTuple(e1.get("charOffset"))
    e2Range = Range.charOffsetToSingleTuple(e2.get("charOffset"))
    tokenPositions = {}
    for token in self.sentenceGraph.tokens:
        tokenPositions[token.get("id")] = self.getRelativePosition(
            e1Range, e2Range, token)
    prevTokenText = None
    prevToken2Text = None
    prevPosition = None
    patternForeBetween = {}
    patternBetween = {}
    patternBetweenAfter = {}
    for token in self.sentenceGraph.tokens:
        if self.sentenceGraph.tokenIsName[token]:
            continue
        id = token.get("id")
        text = token.get("text").lower()
        # Reset the two-token context when crossing a region boundary.
        if prevPosition != tokenPositions[id]:
            prevTokenText = None
            prevToken2Text = None
        if tokenPositions[id] == "Fore":
            self.addToPattern(patternForeBetween, text, prevTokenText, prevToken2Text)
        elif tokenPositions[id] == "Between":
            # "Between" tokens contribute to all three pattern sets.
            self.addToPattern(patternForeBetween, text, prevTokenText, prevToken2Text)
            self.addToPattern(patternBetween, text, prevTokenText, prevToken2Text)
            self.addToPattern(patternBetweenAfter, text, prevTokenText, prevToken2Text)
        elif tokenPositions[id] == "After":
            self.addToPattern(patternBetweenAfter, text, prevTokenText, prevToken2Text)
        prevPosition = tokenPositions[id]
        #if tokenPositions[id].find("Entity") != -1:
        prevToken2Text = prevTokenText
        prevTokenText = text
    return patternForeBetween, patternBetween, patternBetweenAfter
def moveElements(document):
    """Move document-level entities and interactions into the sentences that
    contain them, rewriting ids and making offsets sentence-relative.

    Entities go to the first sentence they overlap; interactions always
    follow their e1 entity. Original offsets and ids are preserved in the
    origOffset/docId attributes.
    """
    entMap = {}           # old entity id -> new sentence-scoped id
    entSentence = {}      # old entity id -> target sentence element
    entSentenceIndex = {} # old entity id -> target sentence index
    sentences = document.findall("sentence")
    sentenceCount = 0
    for sentence in sentences:
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            overlaps = False
            for entityOffset in entityOffsets:
                if Range.overlap(sentenceOffset, entityOffset):
                    overlaps = True
                    break
            if overlaps:
                document.remove(entity)
                sentence.append(entity)
                entityId = entity.get("id")
                entityIdLastPart = entityId.rsplit(".", 1)[-1]
                if entityIdLastPart.startswith("e"):
                    # Keep the existing e<N> suffix under the new sentence id.
                    entity.set("id", sentence.get("id") + "." + entityIdLastPart)
                    entMap[entityId] = sentence.get("id") + "." + entityIdLastPart
                else:
                    # Assign a fresh e<N> suffix; remember the old id in docId.
                    entity.set("docId", entityId)
                    entity.set("id", sentence.get("id") + ".e" + str(entCount))
                    entMap[entityId] = sentence.get("id") + ".e" + str(entCount)
                entSentence[entityId] = sentence
                entSentenceIndex[entityId] = sentenceCount
                #newEntityOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                # Shift every sub-span into sentence coordinates.
                newEntityOffsets = []
                for entityOffset in entityOffsets:
                    newEntityOffsets.append( (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0]) )
                entity.set("origOffset", entity.get("charOffset"))
                #entity.set("charOffset", str(newEntityOffset[0]) + "-" + str(newEntityOffset[1]))
                entity.set("charOffset", Range.tuplesToCharOffset(newEntityOffsets))
                entCount += 1
        sentenceCount += 1
    # Move interactions
    intCount = 0
    for interaction in document.findall("interaction"):
        #if entSentenceIndex[interaction.get("e1")] < entSentenceIndex[interaction.get("e2")]:
        #    targetSentence = entSentence[interaction.get("e1")]
        #else:
        #    targetSentence = entSentence[interaction.get("e2")]
        # Interactions go to a sentence always by e1, as this is the event they are an argument of.
        # If an intersentence interaction is a relation, this shouldn't matter.
        targetSentence = entSentence[interaction.get("e1")]
        document.remove(interaction)
        targetSentence.append(interaction)
        interaction.set("id", targetSentence.get("id") + ".i" + str(intCount))
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
        intCount += 1
def getPhraseDict(phrases):
    """Index phrase elements by their (begin, end) character offset tuple."""
    phraseDict = {}
    # Define offsets
    for phrase in phrases:
        key = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseDict.setdefault(key, []).append(phrase)
    return phraseDict
def makeDETSubPhrases(phrases, tokens, phraseDict, filter=None):
    """Create extended "DT-<type>" phrases for phrases immediately preceded
    by a determiner token; phraseDict is updated in place and the new phrase
    elements are returned."""
    newPhrases = []
    for phrase in phrases:
        # Optional whitelist of phrase types.
        if filter != None and phrase.get("type") not in filter:
            continue
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        if phraseBegin > 0 and tokens[phraseBegin - 1].get("POS") == "DT":
            # Extend the span leftwards to the determiner's first character.
            newPhraseOffset = (Range.charOffsetToSingleTuple(
                tokens[phraseBegin - 1].get("charOffset"))[0], phraseOffset[1])
            newPhrase = makePhrase("DT-" + phrase.get("type"), newPhraseOffset,
                                   phraseBegin - 1, phraseEnd)
            if not phraseDict.has_key(newPhraseOffset):
                #print "NEW PHRASE:", ETUtils.toStr(newPhrase)
                newPhrases.append(newPhrase)
                phraseDict[newPhraseOffset] = [newPhrase]
    return newPhrases
def moveElements(document):
    """Move document-level entities and interactions into their sentences,
    renaming ids and converting offsets to sentence-relative coordinates.

    Interactions are placed in the earlier of their two argument sentences.
    """
    entMap = {}           # old entity id -> new sentence-scoped id
    entSentence = {}      # old entity id -> target sentence element
    entSentenceIndex = {} # old entity id -> target sentence index
    sentences = document.findall("sentence")
    sentenceCount = 0
    for sentence in sentences:
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            if Range.overlap(sentenceOffset, entityOffset):
                document.remove(entity)
                sentence.append(entity)
                entityId = entity.get("id")
                entityIdLastPart = entityId.rsplit(".", 1)[-1]
                if entityIdLastPart.startswith("e"):
                    # Reuse the existing e<N> suffix under the sentence id.
                    entity.set("id", sentence.get("id") + "." + entityIdLastPart)
                    entMap[entityId] = sentence.get("id") + "." + entityIdLastPart
                else:
                    # Assign a fresh suffix; keep the old id in docId.
                    entity.set("docId", entityId)
                    entity.set("id", sentence.get("id") + ".e" + str(entCount))
                    entMap[entityId] = sentence.get("id") + ".e" + str(entCount)
                entSentence[entityId] = sentence
                entSentenceIndex[entityId] = sentenceCount
                newEntityOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                entity.set("origOffset", entity.get("charOffset"))
                entity.set("charOffset", str(newEntityOffset[0]) + "-" + str(newEntityOffset[1]))
                entCount += 1
        sentenceCount += 1
    # Move interactions
    intCount = 0
    for interaction in document.findall("interaction"):
        # Pick the earlier sentence of the two arguments.
        if entSentenceIndex[interaction.get("e1")] < entSentenceIndex[interaction.get("e2")]:
            targetSentence = entSentence[interaction.get("e1")]
        else:
            targetSentence = entSentence[interaction.get("e2")]
        document.remove(interaction)
        targetSentence.append(interaction)
        interaction.set("id", targetSentence.get("id") + ".i" + str(intCount))
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
        intCount += 1
def makeDETSubPhrases(phrases, tokens, phraseDict, filter=None):
    """Extend determiner-preceded phrases into new "DT-<type>" phrases.

    A new phrase starts at the preceding determiner token and keeps the
    original end. phraseDict (offset tuple -> [phrase]) is updated in place;
    the list of created phrase elements is returned.
    """
    newPhrases = []
    for phrase in phrases:
        # Optional whitelist of phrase types.
        if filter != None and phrase.get("type") not in filter:
            continue
        begin = int(phrase.get("begin"))
        end = int(phrase.get("end"))
        if begin <= 0 or tokens[begin - 1].get("POS") != "DT":
            continue
        detStart = Range.charOffsetToSingleTuple(tokens[begin - 1].get("charOffset"))[0]
        phraseEndChar = Range.charOffsetToSingleTuple(phrase.get("charOffset"))[1]
        newOffset = (detStart, phraseEndChar)
        newPhrase = makePhrase("DT-" + phrase.get("type"), newOffset, begin - 1, end)
        if not phraseDict.has_key(newOffset):
            # print "NEW PHRASE:", ETUtils.toStr(newPhrase)
            newPhrases.append(newPhrase)
            phraseDict[newOffset] = [newPhrase]
    return newPhrases
def getTokens(self, entity, tokenTuples):
    """Return the texts of the tokens overlapping the entity's span.

    tokenTuples: offset-sorted (offsetTuple, tokenElement) pairs; collection
    stops at the first non-overlapping token after a match, since the
    matching region is contiguous in sorted order.
    """
    offsetString = entity.get("charOffset")
    assert offsetString != None
    entOffset = Range.charOffsetToSingleTuple(offsetString)
    texts = []
    for tokOffset, tokElement in tokenTuples:
        if Range.overlap(entOffset, tokOffset):
            texts.append(tokElement.get("text"))
        elif texts:  # passed end
            break
    return texts
def makeTokenSubPhrases(tokens, phraseDict, includePOS=("PRP$", "IN", "WP$")):
    """Add single-token phrases for selected POS tags.

    For each token whose POS is in includePOS and whose span is not already
    in phraseDict, create a "TOK-t<POS>" phrase. phraseDict is updated in
    place and the list of new phrase elements is returned.

    BUG FIX: the mutable default list argument was replaced with a tuple;
    the parameter is only used for membership tests, so behavior and the
    call interface are unchanged.
    """
    newPhrases = []
    for i, token in enumerate(tokens):
        tokPOS = token.get("POS")
        if tokPOS in includePOS:
            tokOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            if not phraseDict.has_key(tokOffset):
                newPhrase = makePhrase("TOK-t" + tokPOS, tokOffset, i, i)
                newPhrases.append(newPhrase)
                phraseDict[tokOffset] = [newPhrase]
    return newPhrases
def getPatterns(self, e1, e2):
    """Build word-pattern dictionaries from the tokens before, between and
    after the two entity spans.

    Returns (patternForeBetween, patternBetween, patternBetweenAfter);
    named-entity tokens are skipped and the previous-token context resets at
    every region boundary.
    """
    e1Range = Range.charOffsetToSingleTuple(e1.get("charOffset"))
    e2Range = Range.charOffsetToSingleTuple(e2.get("charOffset"))
    tokenPositions = {}
    for token in self.sentenceGraph.tokens:
        tokenPositions[token.get("id")] = self.getRelativePosition(e1Range,e2Range,token)
    prevTokenText = None
    prevToken2Text = None
    prevPosition = None
    patternForeBetween = {}
    patternBetween = {}
    patternBetweenAfter = {}
    for token in self.sentenceGraph.tokens:
        if self.sentenceGraph.tokenIsName[token]:
            continue
        id = token.get("id")
        text = token.get("text").lower()
        # Reset the two-token context when crossing a region boundary.
        if prevPosition != tokenPositions[id]:
            prevTokenText = None
            prevToken2Text = None
        if tokenPositions[id] == "Fore":
            self.addToPattern(patternForeBetween, text, prevTokenText, prevToken2Text)
        elif tokenPositions[id] == "Between":
            # "Between" tokens contribute to all three pattern sets.
            self.addToPattern(patternForeBetween, text, prevTokenText, prevToken2Text)
            self.addToPattern(patternBetween, text, prevTokenText, prevToken2Text)
            self.addToPattern(patternBetweenAfter, text, prevTokenText, prevToken2Text)
        elif tokenPositions[id] == "After":
            self.addToPattern(patternBetweenAfter, text, prevTokenText, prevToken2Text)
        prevPosition = tokenPositions[id]
        #if tokenPositions[id].find("Entity") != -1:
        prevToken2Text = prevTokenText
        prevTokenText = text
    return patternForeBetween, patternBetween, patternBetweenAfter
def fixEntities(xml): counts = defaultdict(int) for sentence in xml.getiterator("sentence"): sText = sentence.get("text") for entity in sentence.findall("entity"): charOffset = entity.get("charOffset") if charOffset == "-": assert False, str(entity) sentence.remove(entity) counts["removed-invalid"] += 1 else: charOffset = Range.charOffsetToSingleTuple(charOffset) # fix length realLength = len(entity.get("text")) lenDiff = (charOffset[1] - charOffset[0] + 1) - realLength if lenDiff != realLength: counts["incorrect-ent-offset"] += 1 counts["incorrect-ent-offset-diff"+str(lenDiff)] += 1 if abs(lenDiff) > 2: print "Warning, lenDiff:", (lenDiff, charOffset, sText, entity.get("text"), entity.get("id")) charOffset = (charOffset[0], charOffset[0] + realLength) # find starting position entIndex = sText.find(entity.get("text"), charOffset[0]) if entIndex == -1: for i in [-1,-2,-3]: entIndex = sText.find(entity.get("text"), charOffset[0]+i) if entIndex != -1: break if entIndex != 0: # could be lowercase sTextLower = sText.lower() for i in [0,-1,-2,-3]: lowerEntIndex = sTextLower.find(entity.get("text"), charOffset[0]+i) if lowerEntIndex != -1: break if lowerEntIndex != -1 and abs(lowerEntIndex - charOffset[0]) < abs(entIndex - charOffset[0]): entIndex = lowerEntIndex assert entIndex != -1, (charOffset, sText, entity.get("text"), entity.get("id")) indexDiff = entIndex - charOffset[0] if indexDiff != 0: counts["incorrect-ent-index"] += 1 counts["incorrect-ent-index-diff"+str(indexDiff)] += 1 print "Warning, indexDiff:", (indexDiff, charOffset, sText, entity.get("text"), entity.get("id")) # move offset charOffset = (charOffset[0]+indexDiff, charOffset[1]+indexDiff) # validate new offset sEntity = sText[charOffset[0]:charOffset[1]] assert sEntity == entity.get("text") or sEntity.lower() == entity.get("text"), (charOffset, sText, entity.get("text"), entity.get("id")) entity.set("charOffset", Range.tuplesToCharOffset( (charOffset[0], charOffset[1]))) entity.set("given", "True") for 
interaction in sentence.findall("interaction"): interaction.set("type", "DDI") print "Fix counts:", counts
def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None, exampleStyle=None, structureAnalyzer=None):
    """Write predicted entities for one sentence back into its XML element.

    Removes the sentence's pairs/interactions and non-name entities, then
    appends one new entity element per example, typed from its prediction.
    The analyses element is detached during the rewrite and re-attached at
    the end so it stays the last child.
    """
    self.assertSameSentence(examples)
    sentenceElement = sentenceObject.sentence
    sentenceId = sentenceElement.get("id")
    sentenceText = sentenceElement.get("text")
    # detach analyses-element
    sentenceAnalysesElement = None
    sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
    if sentenceAnalysesElement == None:
        sentenceAnalysesElement = sentenceElement.find("analyses")
    if sentenceAnalysesElement != None:
        sentenceElement.remove(sentenceAnalysesElement)
    # remove pairs and interactions
    interactions = self.removeChildren(sentenceElement, ["pair", "interaction"])
    # remove entities
    newEntityIdCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
    nonNameEntities = self.removeNonNameEntities(sentenceElement)
    # add new pairs
    for example in examples:
        prediction = predictionsByExample[example[0]]
        entityElement = ET.Element("entity")
        #entityElement.attrib["given"] = "False"
        # Resolve the head token id (example[3]["t"]) to its token element.
        headToken = example[3]["t"]
        for token in sentenceObject.tokens:
            if token.get("id") == headToken:
                headToken = token
                break
        entityElement.set("charOffset", example[3]["charOffset"])
        entityElement.set("headOffset", headToken.get("charOffset"))
        entityElement.set("phraseType", example[3]["ptype"])
        entOffset = Range.charOffsetToSingleTuple(example[3]["charOffset"])
        entityElement.set("text", sentenceText[entOffset[0]:entOffset[1]])
        entityElement.set("id", sentenceId + ".e" + str(newEntityIdCount))
        # The predicted class determines the entity's type attribute.
        self.setElementType(entityElement, prediction, classSet, classIds)
        newEntityIdCount += 1
        sentenceElement.append(entityElement)
    # re-attach the analyses-element
    if sentenceAnalysesElement != None:
        sentenceElement.append(sentenceAnalysesElement)
def getRelativePosition(self, entity1Range, entity2Range, token):
    """Classify a token relative to two entity spans.

    Returns "Entity1"/"Entity2" on overlap with the respective span, else
    "Fore", "Between" or "After" relative to the combined span.
    """
    tokOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
    if Range.overlap(entity1Range, tokOffset):
        return "Entity1"
    if Range.overlap(entity2Range, tokOffset):
        return "Entity2"
    spanBegin = min(entity1Range[0], entity2Range[0])
    spanEnd = max(entity1Range[1], entity2Range[1])
    if tokOffset[1] < spanBegin:
        return "Fore"
    if tokOffset[1] > spanEnd:
        return "After"
    return "Between"
def getMetaMapFeatures(self, token, sentenceGraph, features):
    """Add features from MetaMap phrases overlapping *token*'s span.

    Mutates *features* in place: "score" becomes a scaled numeric feature,
    every other phrase attribute becomes binary
    "_metamap_<name>_<value>" features.
    """
    analyses = sentenceGraph.sentenceElement.find("analyses")
    if analyses == None:
        return
    metamap = analyses.find("metamap")
    if metamap == None:
        return
    tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
    # Positional/textual attributes are not turned into features.
    skipAttr = set(["charOffset", "text"])
    for phrase in metamap.findall("phrase"):
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        if Range.overlap(tokenOffset, phraseOffset):
            attr = phrase.attrib
            attrNames = sorted(attr.keys())
            for attrName in attrNames:
                if attrName in skipAttr:
                    continue
                elif attrName == "score":
                    # Scale the integer score into a small positive weight.
                    features["_metamap_score"] = 0.001 * abs(int(attr[attrName]))
                else:
                    # Comma-separated values each become a binary feature.
                    attrValues = attr[attrName].split(",")
                    for attrValue in attrValues:
                        features["_metamap_"+attrName+"_"+attrValue.replace(" ", "-")] = 1
def getRelativePosition(self, entity1Range, entity2Range, token):
    """Return "Entity1"/"Entity2" when the token overlaps an entity span,
    otherwise "Fore", "Between" or "After" relative to the combined span."""
    offset = Range.charOffsetToSingleTuple(token.get("charOffset"))
    if Range.overlap(entity1Range, offset):
        return "Entity1"
    if Range.overlap(entity2Range, offset):
        return "Entity2"
    # Combined span of both entities; position is judged by the token's end.
    entitiesRange = (min(entity1Range[0], entity2Range[0]),
                     max(entity1Range[1], entity2Range[1]))
    if offset[1] < entitiesRange[0]:
        return "Fore"
    elif offset[1] > entitiesRange[1]:
        return "After"
    else:
        return "Between"
def fixIndices(phrases, tokens):
    """Re-align each phrase's begin/end token indices with the tokens whose
    character offsets actually match the phrase span.

    Mutates the phrase elements in place; the fix/phrase counters are
    computed but not returned.
    NOTE(review): control flow reconstructed from collapsed source — the
    break is assumed to end the token scan once the phrase end character is
    matched; confirm against version history.
    """
    fixCount = 0
    phraseCount = 0
    for phrase in phrases:
        fixed = False
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        for i in range(len(tokens)):
            token = tokens[i]
            tokOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            if tokOffset[0] == phraseOffset[0]:
                if phraseBegin != i:
                    phrase.set("begin", str(i))
                    fixed = True
            if tokOffset[1] == phraseOffset[1]:
                if phraseEnd != i:
                    phrase.set("end", str(i))
                    fixed = True
                break
        if fixed:
            fixCount += 1
        phraseCount += 1
def makeINSubPhrases(phrases, tokens, phraseDict, filter=None):
    """For each phrase containing a preposition ("IN") token, register a
    truncated sub-phrase ending just before it; phraseDict is updated in
    place and the new phrase elements are returned."""
    newPhrases = []
    for phrase in phrases:
        # Optional whitelist of phrase types.
        if filter != None and phrase.get("type") not in filter:
            continue
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        prevToken = None
        tokCount = 0
        for token in tokens[phraseBegin : phraseEnd + 1]:
            if token.get("POS") == "IN" and prevToken != None:
                # New span: phrase start .. end of the token before the "IN".
                newPhraseOffset = (phraseOffset[0], Range.charOffsetToSingleTuple(prevToken.get("charOffset"))[-1])
                newPhrase = makePhrase(
                    phrase.get("type") + "-IN", newPhraseOffset, phraseBegin, phraseBegin + tokCount - 1
                )
                if not phraseDict.has_key(newPhraseOffset):
                    # print "NEW PHRASE:", ETUtils.toStr(newPhrase)
                    newPhrases.append(newPhrase)
                    phraseDict[newPhraseOffset] = [newPhrase]
            prevToken = token
            tokCount += 1
    return newPhrases
def getHeads(corpus):
    """Count head-word strings per entity type in *corpus*.

    Returns {entityType: {headText: count}}, with a "None" pseudo-type that
    counts tokens which are not the head of any entity.
    """
    corpus = ETUtils.ETFromObj(corpus)
    headDict = {}
    headDict["None"] = {}
    for sentence in corpus.getiterator("sentence"):
        headOffsetStrings = set()
        for entity in sentence.findall("entity"):
            eType = entity.get("type")
            if not headDict.has_key(eType):
                headDict[eType] = {}
            eText = entity.get("text")
            headOffset = entity.get("headOffset")
            headOffsetStrings.add(headOffset)
            headOffset = Range.charOffsetToSingleTuple(headOffset)
            charOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            if headOffset == charOffset:
                headText = eText
            else:
                # BUG FIX: original referenced an undefined 'sentenceText';
                # the slice is relative to the entity's own span, so it must
                # cut the head out of the entity's text.
                headText = eText[headOffset[0] - charOffset[0]:headOffset[1] - charOffset[0] + 1]
            if not headDict[eType].has_key(headText):
                headDict[eType][headText] = 0
            headDict[eType][headText] += 1
        # BUG FIX: original referenced an undefined 'tokens'; iterate the
        # sentence's token elements instead.
        # NOTE(review): assumes token elements live under the sentence —
        # confirm against the corpus tokenization layout.
        for token in sentence.getiterator("token"):
            if not token.get("charOffset") in headOffsetStrings:  # token is not the head of any entity
                headText = token.get("text")
                if not headDict["None"].has_key(headText):
                    headDict["None"][headText] = 0
                headDict["None"][headText] += 1
    return headDict
def removeNamedEntityPhrases(entities, phrases, phraseDict):
    """Drop phrases whose character offset coincides exactly with a given
    (named) entity, removing them from phraseDict as well.

    Returns the phrases that were kept.
    """
    nameOffsets = set(e.get("charOffset") for e in entities if e.get("given") == "True")
    kept = []
    for phrase in phrases:
        offsetString = phrase.get("charOffset")
        if offsetString not in nameOffsets:
            kept.append(phrase)
            continue
        # Phrase duplicates a named entity: also purge it from the lookup dict
        offsetTuple = Range.charOffsetToSingleTuple(offsetString)
        if offsetTuple in phraseDict:
            del phraseDict[offsetTuple]
    #print >> sys.stderr, "Removed", len(phrases) - len(kept), "named entity phrases"
    return kept
def removeNamedEntityPhrases(entities, phrases, phraseDict):
    """Filter out phrases that exactly overlap a given (named) entity and
    purge their offset keys from phraseDict; return the surviving phrases.

    NOTE(review): this file contains two identical definitions of this
    function — the later one shadows the earlier at import time.
    """
    givenOffsets = set()
    for e in entities:
        if e.get("given") == "True":
            givenOffsets.add(e.get("charOffset"))
    survivors = []
    for p in phrases:
        offsetStr = p.get("charOffset")
        if offsetStr in givenOffsets:
            # Remove the matching key from the lookup dict if it is there
            phraseDict.pop(Range.charOffsetToSingleTuple(offsetStr), None)
        else:
            survivors.append(p)
    #print >> sys.stderr, "Removed", len(phrases) - len(survivors), "named entity phrases"
    return survivors
def insertElements(corpus, specAnn):
    """Move pre-detected annotation spans (specAnn) into the sentences of the
    corpus that fully contain them.

    Spans are consumed from specAnn[docId]; each moved span gets a
    sentence-relative "charOffset" attribute and loses its "offset" attribute,
    and is appended under the sentence's analyses/entities container.

    @param corpus: an ElementTree-like corpus root
    @param specAnn: dict of document origId -> list of span elements; each
        span's "offset" is assumed to be a document-level (begin, end) tuple
        stored by an earlier step — TODO confirm.
    """
    for document in corpus.iter('document'):
        docId = document.get("origId")
        assert docId in specAnn, docId
        for sentence in document.iter('sentence'):
            sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
            analyses = sentence.find("analyses")
            # FIX: ElementTree elements with no children are falsy, so
            # "if not analyses" would wrongly recreate an existing, empty
            # <analyses> element; test for absence with "is None" instead.
            if analyses is None:
                analyses = ET.SubElement(sentence, "analyses")
            # Find (or create) the container for the moved spans
            container = analyses.find("entities")
            if container is None:  # FIX: same Element-truthiness pitfall as above
                container = ET.SubElement(analyses, "entities")
            # Map the spans; iterate a copy because matched spans are removed
            for span in specAnn[docId][:]:
                offset = span.get("offset")
                if Range.overlap(offset, sentOffset):
                    # Skip spans that only partially overlap this sentence
                    if sentOffset[0] > offset[0] or sentOffset[1] < offset[1]:
                        continue
                    specAnn[docId].remove(span)
                    # Rebase the span to sentence-relative character offsets
                    charOffset = (offset[0] - sentOffset[0], offset[1] - sentOffset[0])
                    matchingText = sentence.get("text")[charOffset[0]:charOffset[1]]
                    spanText = span.get("text")
                    assert matchingText == spanText, (matchingText, spanText, charOffset)
                    span.set("charOffset", "-".join([str(x) for x in charOffset]))
                    # A negative offset would produce "--" in the string
                    assert not "--" in span.get("charOffset"), [str(x) for x in charOffset]
                    del span.attrib["offset"]
                    container.append(span)
def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None, exampleStyle=None, structureAnalyzer=None):
    """Write predicted entities for one sentence back into its XML element.

    Detaches the analyses element, removes old pairs/interactions and
    non-name entities, appends one new entity element per predicted example,
    then re-attaches the analyses element.

    NOTE(review): goldSentence, exampleStyle and structureAnalyzer are
    accepted but unused in this body — presumably part of a shared writer
    interface; confirm against sibling implementations.
    """
    self.assertSameSentence(examples)
    sentenceElement = sentenceObject.sentence
    sentenceId = sentenceElement.get("id")
    sentenceText = sentenceElement.get("text")
    # detach analyses-element (old name "sentenceanalyses" is tried first)
    sentenceAnalysesElement = None
    sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
    if sentenceAnalysesElement == None:
        sentenceAnalysesElement = sentenceElement.find("analyses")
    if sentenceAnalysesElement != None:
        sentenceElement.remove(sentenceAnalysesElement)
    # remove pairs and interactions
    interactions = self.removeChildren(sentenceElement, ["pair", "interaction"])
    # remove entities (named entities are kept; id counter continues from the existing ones)
    newEntityIdCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
    nonNameEntities = self.removeNonNameEntities(sentenceElement)
    # add new pairs: one entity element per predicted example
    for example in examples:
        prediction = predictionsByExample[example[0]]
        entityElement = ET.Element("entity")
        #entityElement.attrib["given"] = "False"
        # example[3] is the feature/metadata dict; "t" holds the head token id
        headToken = example[3]["t"]
        for token in sentenceObject.tokens:
            if token.get("id") == headToken:
                headToken = token  # replace the id string with the token element
                break
        entityElement.set("charOffset", example[3]["charOffset"])
        entityElement.set("headOffset", headToken.get("charOffset"))
        entityElement.set("phraseType", example[3]["ptype"])
        # entity text is cut from the sentence text by character offset
        entOffset = Range.charOffsetToSingleTuple(example[3]["charOffset"])
        entityElement.set("text", sentenceText[entOffset[0]:entOffset[1]])
        entityElement.set("id", sentenceId + ".e" + str(newEntityIdCount))
        self.setElementType(entityElement, prediction, classSet, classIds)
        newEntityIdCount += 1
        sentenceElement.append(entityElement)
    # re-attach the analyses-element
    if sentenceAnalysesElement != None:
        sentenceElement.append(sentenceAnalysesElement)
def getEntityHeadToken(entity, tokens, tokenHeadScores):
    """Select the head token for an entity.

    Candidate tokens are those overlapping the entity's headOffset (preferred)
    or its charOffset. A single candidate is returned directly; otherwise
    findHeadToken chooses among them using tokenHeadScores.

    @raise AssertionError: if no head token could be selected
    """
    if entity.get("headOffset") != None:
        charOffsets = Range.charOffsetToTuples(entity.get("headOffset"))
    elif entity.get("charOffset") != "":
        charOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
    else:
        charOffsets = []
    # Each entity can consist of multiple syntactic tokens, covered by its
    # charOffset-range. One of these must be chosen as the head token.
    headTokens = []  # potential head tokens
    for token in tokens:
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        for offset in charOffsets:
            if Range.overlap(offset, tokenOffset):
                headTokens.append(token)
    if len(headTokens) == 1:  # An unambiguous head token was found
        selectedHeadToken = headTokens[0]
    else:  # One head token must be chosen from the candidates
        selectedHeadToken = findHeadToken(headTokens, tokenHeadScores)
    #if verbose:
    #    print >> sys.stderr, "Selected head:", token.attrib["id"], token.attrib["text"]
    # FIX: the assert message referenced undefined 'entityElement', which would
    # raise NameError instead of the intended AssertionError
    assert selectedHeadToken != None, entity.get("id")
    return selectedHeadToken
def getEntityHeadToken(entity, tokens, tokenHeadScores):
    """Select the head token for an entity (duplicate definition in this file).

    Candidates are tokens overlapping the entity's headOffset (preferred) or
    charOffset; ties are resolved by findHeadToken using tokenHeadScores.

    @raise AssertionError: if no head token could be selected
    """
    if entity.get("headOffset") != None:
        charOffsets = Range.charOffsetToTuples(entity.get("headOffset"))
    elif entity.get("charOffset") != "":
        charOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
    else:
        charOffsets = []
    # Each entity can consist of multiple syntactic tokens, covered by its
    # charOffset-range. One of these must be chosen as the head token.
    headTokens = []  # potential head tokens
    for token in tokens:
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        for offset in charOffsets:
            if Range.overlap(offset, tokenOffset):
                headTokens.append(token)
    if len(headTokens) == 1:  # An unambiguous head token was found
        selectedHeadToken = headTokens[0]
    else:  # One head token must be chosen from the candidates
        selectedHeadToken = findHeadToken(headTokens, tokenHeadScores)
    #if verbose:
    #    print >> sys.stderr, "Selected head:", token.attrib["id"], token.attrib["text"]
    # FIX: the assert message referenced undefined 'entityElement', which would
    # raise NameError instead of the intended AssertionError
    assert selectedHeadToken != None, entity.get("id")
    return selectedHeadToken
def addSentence(self, sentenceGraph):
    """Accumulate token-distance statistics for interactions and events of one
    sentence graph into self.interactionSpans/intSpan and eventSpans/eventSpan.
    """
    if sentenceGraph == None:
        return
    # Order tokens by character offset so list position reflects linear position
    tokens = sorted([(Range.charOffsetToSingleTuple(x.get("charOffset")), x) for x in sentenceGraph.tokens])
    indexByTokenId = {
        tokens[i][1].get("id"): i
        for i in range(len(tokens))
    }
    assert len(indexByTokenId) == len(
        tokens)  # check that there were no duplicate ids
    entityById = {x.get("id"): x for x in sentenceGraph.entities}
    events = {}  # event root entity id -> min/max token index of its arguments
    for interaction in sentenceGraph.interactions:
        e1Id = interaction.get("e1")
        e2Id = interaction.get("e2")
        e1 = entityById[e1Id]
        e2 = entityById[e2Id]
        t1 = sentenceGraph.entityHeadTokenByEntity[e1]
        t2 = sentenceGraph.entityHeadTokenByEntity[e2]
        index1 = indexByTokenId[t1.get("id")]
        index2 = indexByTokenId[t2.get("id")]
        # interaction span = distance in tokens between the two head tokens
        intSpan = abs(index1 - index2)
        self.interactionSpans[intSpan] = self.interactionSpans.get(
            intSpan, 0) + 1
        # NOTE(review): if "min"/"max" are not pre-seeded elsewhere,
        # self.intSpan.get(...) returns None and min(None, x) stays None in
        # Python 2 — confirm these dicts are initialized in __init__.
        self.intSpan["min"] = min(self.intSpan.get("min"), intSpan)
        self.intSpan["max"] = max(self.intSpan.get("max"), intSpan)
        if interaction.get("event") == "True":
            if e1Id not in events:
                events[e1Id] = {"min": 9999, "max": -9999}
            events[e1Id]["min"] = min(events[e1Id]["min"], index1, index2)
            events[e1Id]["max"] = max(events[e1Id]["max"], index1, index2)
    # An event's span is the token range covered by all of its arguments
    for eventId in sorted(events.keys()):
        eventSpan = events[eventId]["max"] - events[eventId]["min"]
        self.eventSpans[eventSpan] = self.eventSpans.get(eventSpan, 0) + 1
        self.eventSpan["min"] = min(self.eventSpan.get("min"), eventSpan)
        self.eventSpan["max"] = max(self.eventSpan.get("max"), eventSpan)
def fixAltOffsets(input, output=None):
    """Rebase every entity's altOffset from document-relative to
    sentence-relative coordinates.

    @param input: corpus file name or ElementTree object
    @param output: optional path to write the modified corpus to
    @return: the (modified) corpus ElementTree
    """
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(
            1,
            "Fixing AltOffsets for sentence (" + sentence.get("id") + "): ")
        sentStart = Range.charOffsetToSingleTuple(sentence.get("charOffset"))[0]
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            # shift each span by the sentence's start position
            rebased = [(begin - sentStart, end - sentStart)
                       for (begin, end) in altOffsets]
            entity.set("altOffset", Range.tuplesToCharOffset(rebased))
            fixCount += 1
    print >> sys.stderr, "Fixed", fixCount, "altOffsets"
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def findHeadsDictionary(corpus, stringsFrom, parse, tokenization):
    """Assign headOffset attributes to multi-part trigger entities using a
    dictionary of known trigger strings and their per-type frequencies.

    Entities whose text is a single hyphen-free word are left for the
    syntactic head-finding pass. Returns [dict-assigned count, other count].
    """
    print "Extracting triggers from", stringsFrom
    trigDict = getTriggers(stringsFrom)
    print "Determining trigger distribution"
    distDict = getDistribution(trigDict)
    allStrings = sorted(distDict.keys())
    print "Determining heads for", corpus
    corpusElements = Utils.InteractionXML.CorpusElements.loadCorpus(corpus, parse, tokenization, removeIntersentenceInteractions=False, removeNameInfo=False)
    cases = {}
    counts = [0,0]
    for sentence in corpusElements.sentences:
        #print sentence.sentence.get("id")
        sText = sentence.sentence.get("text")
        #tokenHeadScores = None
        for entity in sentence.entities:
            if entity.get("headOffset") != None:
                continue  # head already determined
            if entity.get("isName") == "True": # Only for triggers
                continue
            #if tokenHeadScores == None:
            #    tokenHeadScores = getTokenHeadScores(sentence.tokens, sentence.dependencies, sentenceId=sentence.sentence.get("id"))
            eText = entity.get("text")
            eType = entity.get("type")
            eOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            wsSplits = eText.split() # Split by whitespace
            if len(wsSplits) == 1 and eText.find("-") == -1: # unambiguous head will be assigned by SYNTAX pass
                continue
            else: # Entity text has multiple (whitespace or hyphen separated) parts
                # candidate = (score tuple, coords, substring, matched string)
                candidates = []
                # Try to find entity substring in individual entity strings
                for wsTuple in mapSplits(wsSplits, eText, eOffset):
                    if not distDict.has_key(wsTuple[1]): # string not found, low score
                        candidates.append( ((-1, -1), wsTuple[2], wsTuple[0], wsTuple[1]) )
                    else: # String found, more common ones get higher score
                        # NOTE(review): the assert message indexes distDict with
                        # wsTuple[0] while the condition uses wsTuple[1] — looks
                        # like a typo in the diagnostic; confirm.
                        assert distDict[wsTuple[1]].has_key(eType), (distDict[wsTuple[0]], wsTuple[0], eText)
                        candidates.append( (tuple(distDict[wsTuple[1]][eType]), wsTuple[2], wsTuple[0], wsTuple[1]) )
                # Split each whitespace-separated string further into hyphen-separated substrings
                for candidate in candidates[:]:
                    hyphenSplits = candidate[2].split("-")
                    if len(hyphenSplits) > 1: # Substring has a hyphen
                        # Try to find entity substring in individual entity strings
                        for hyphenTuple in mapSplits(hyphenSplits, eText, candidate[1]):
                            if not distDict.has_key(hyphenTuple[1]):
                                candidates.append( ((-1, -1), hyphenTuple[2], hyphenTuple[0], hyphenTuple[1]) )
                            else:
                                candidates.append( (tuple(distDict[hyphenTuple[1]][eType]), hyphenTuple[2], hyphenTuple[0], hyphenTuple[1]) )
                # Sort candidates, highest scores come first
                candidates.sort(reverse=True)
                # If no matches, look for substrings inside words
                if candidates[0][0][0] in [-1, 0]: # no matches, look for substrings
                    print "Substring matching", candidates, "for entity", entity.get("id")
                    for i in range(len(candidates)):
                        candidate = candidates[i]
                        cText = candidate[2]
                        for string in allStrings:
                            subStringPos = cText.find(string)
                            if subStringPos != -1:
                                print " Substring match", string, cText,
                                score = tuple(distDict[string][eType])
                                if score > candidate[0]:
                                    print score, candidate[0], "Substring selected" #, score > candidate[0], score < candidate[0]
                                    subStringCoords = [candidate[1][0] + subStringPos, len(string)]
                                    candidate = (score, subStringCoords, candidate[2], ">"+string+"<")
                                else:
                                    print score, candidate[0]
                        candidates[i] = candidate
                    # Resort after possibly replacing some candidates
                    candidates.sort(reverse=True)
                if candidates[0][0][0] not in [-1, 0]: # if it is in [-1, 0], let SYNTAX pass take care of it
                    # Convert the winning candidate's entity-relative coords
                    # to a document-relative inclusive headOffset
                    candidateOffset = (candidates[0][1][0] + eOffset[0], candidates[0][1][0] + candidates[0][1][1] + eOffset[0])
                    entity.set("headOffset", str(candidateOffset[0]) + "-" + str(candidateOffset[1]-1))
                    entity.set("headMethod", "Dict")
                    entity.set("headString", sText[candidateOffset[0]:candidateOffset[1]])
                    counts[0] += 1
                # Prepare results for printing
                for i in range(len(candidates)):
                    c = candidates[i]
                    candidates[i] = (tuple(c[0]), c[2], c[3])
                case = (eType, eText, tuple(candidates))
                if not cases.has_key(case):
                    cases[case] = 0
                cases[case] += 1
                print entity.get("id"), eType + ": '" + eText + "'", candidates
                #headToken = getEntityHeadToken(entity, sentence.tokens, tokenHeadScores)
                # The ElementTree entity-element is modified by setting the headOffset attribute
                #entity.set("headOffset", headToken.get("charOffset"))
                #entity.set("headMethod", "Syntax")
    print "Cases"
    for case in sorted(cases.keys()):
        print case, cases[case]
    #return corpus
    return counts
def mapInteractions(self, entityElements, interactionElements, verbose=False):
    """
    Maps the semantic interactions to the syntactic graph.

    Syntactic dependencies are defined between tokens. Semantic edges
    (interactions) are defined between annotated entities. To utilize the
    correlation of the dependency parse with the semantic interactions,
    the graphs must be aligned by mapping the interaction graph's nodes
    (entities) to the syntactic graph's nodes (tokens). This is done by
    determining the head tokens of the entities.

    @param entityElements: the semantic nodes (triggers and named entities)
    @type entityElements: list of cElementTree.Element objects
    @param interactionElements: the semantic edges (e.g. Cause and Theme for GENIA)
    @type interactionElements: list of cElementTree.Element objects
    @param verbose: Print selected head tokens on screen
    @param verbose: boolean

    Duplicated interactions are skipped in this function. For all gold
    interactions between two tokens, it only keeps one interaction for
    each interaction type.
    """
    self.interactions = interactionElements
    self.entities = entityElements
    # Entities that have no text binding can not be mapped and are therefore removed
    for entity in self.entities[:]:
        if entity.get("charOffset") == "":
            self.entities.remove(entity)
    #self.interactionGraph = NX.XDiGraph(multiedges = multiedges)
    #if multiedges:
    #    self.interactionGraph = NX10.MultiDiGraph()
    #else:
    #    self.interactionGraph = NX10.DiGraph()
    self.interactionGraph = Graph()
    self.interactionGraph.addNodes(self.tokens)
    #for token in self.tokens:
    #    self.interactionGraph.add_node(token)
    self.entitiesByToken = {}  # a mapping for fast access
    self.entitiesById = {}
    self.entityHeadTokenByEntity = {}
    sentenceSpan = (0, len(self.sentenceElement.get("text"))
                    )  # for validating the entity offsets
    for entity in self.entities[:]:
        headToken = self.mapEntity(entity, verbose)
        if entity.tag != "entity":
            self.entities.remove(entity)  # non-entity elements are dropped
        elif headToken != None:
            self.entityHeadTokenByEntity[entity] = headToken
            self.entitiesById[entity.get("id")] = entity
        else:
            # Check that the entity is within the sentence
            if not Range.overlap(
                    Range.charOffsetToSingleTuple(
                        entity.get("charOffset")), sentenceSpan):
                raise Exception("Entity " + entity.get("id") +
                                ", charOffset " + entity.get("charOffset") +
                                ", does not overlap with sentence " +
                                self.sentenceElement.get("id") +
                                ", length " + str(sentenceSpan[1]))
            # Assume there simply is no token corresponding to the entity
            self.entities.remove(entity)
    self._markNamedEntities()
    for interaction in self.interactions:
        # Skip interactions whose endpoints lie outside this sentence
        if (not self.entitiesById.has_key(interaction.get("e1"))
            ):  #and self.entitiesById.has_key(interaction.get("e2")):
            continue  # e1 is outside of this sentence
        # assign the token1 to whatever the entity id (key) as a placeholder - to test the interaction statistics
        # token1 = self.entityHeadTokenByEntity[self.entitiesById[self.entitiesById.keys()[0]]]
        # token2 = self.entityHeadTokenByEntity[self.entitiesById[interaction.get("e2")]]
        if (not self.entitiesById.has_key(interaction.get("e2"))
            ):  #and self.entitiesById.has_key(interaction.get("e1")):
            continue  # e2 is outside of this sentence
        # token1 = self.entityHeadTokenByEntity[self.entitiesById[interaction.get("e1")]]
        # token2 = self.entityHeadTokenByEntity[self.entitiesById[self.entitiesById.keys()[0]]]
        if self.entitiesById.has_key(
                interaction.get("e1")) and self.entitiesById.has_key(
                    interaction.get("e2")):
            token1 = self.entityHeadTokenByEntity[self.entitiesById[
                interaction.get("e1")]]
            token2 = self.entityHeadTokenByEntity[self.entitiesById[
                interaction.get("e2")]]
        # else:
        #     token1 = self.entityHeadTokenByEntity[self.entitiesById[self.entitiesById.keys()[0]]]
        #     token2 = self.entityHeadTokenByEntity[self.entitiesById[self.entitiesById.keys()[0]]]
        # found = False
        # if multiedges:
        #     edges = self.interactionGraph.get_edge_data(token1, token2, default={})
        #     for i in range(len(edges)):
        #         edge = edges[i]["element"]
        #         if edge.attrib["type"] == interaction.attrib["type"]:
        #             found = True
        #             break
        # if not found:
        #     self.interactionGraph.add_edge(token1, token2, element=interaction)
        # else:
        #     self.duplicateInteractionEdgesRemoved += 1
        # Keep only one edge per interaction type between a token pair
        found = False
        edges = self.interactionGraph.getEdges(token1, token2)
        for edge in edges:
            if edge[2].get("type") == interaction.get("type"):
                found = True
                break
        if not found:
            self.interactionGraph.addEdge(token1, token2, interaction)
        else:
            # TODO: "skipped" would be better than "removed"
            self.duplicateInteractionEdgesRemoved += 1
def prepareTokens(self, tokens):
    """Pair each token element with its character-offset tuple."""
    return [(Range.charOffsetToSingleTuple(tok.get("charOffset")), tok)
            for tok in tokens]
def mapEntity(self, entityElement, verbose=False):
    """
    Determine the head token for a named entity or trigger. The head token is
    the token closest to the root for the subtree of the dependency parse
    spanned by the text of the element.

    @param entityElement: a semantic node (trigger or named entity)
    @type entityElement: cElementTree.Element
    @param verbose: Print selected head tokens on screen
    @param verbose: boolean
    @return: the selected head token element, or None if no token matched
    """
    headOffset = None
    if entityElement.get("headOffset") != None:
        headOffset = Range.charOffsetToSingleTuple(
            entityElement.get("headOffset"))
    if entityElement.get("charOffset") != "":
        charOffsets = Range.charOffsetToTuples(
            entityElement.get("charOffset"))
    else:
        charOffsets = []
    # Each entity can consist of multiple syntactic tokens, covered by its
    # charOffset-range. One of these must be chosen as the head token.
    headTokens = []  # potential head tokens
    for token in self.tokens:
        #print token.attrib["id"], token.attrib["charOffset"]
        tokenOffset = Range.charOffsetToSingleTuple(
            token.get("charOffset"))
        if headOffset != None and entityElement.get("type") != "Binding":
            # A head token can already be defined in the headOffset-attribute.
            # However, depending on the tokenization, even this range may
            # contain multiple tokens. Still, it can always be assumed that
            # if headOffset is defined, the correct head token is in this range.
            if Range.overlap(headOffset, tokenOffset):
                headTokens.append(token)
        else:
            for offset in charOffsets:
                if Range.overlap(offset, tokenOffset):
                    headTokens.append(token)
    if len(headTokens) == 1:  # An unambiguous head token was found
        token = headTokens[0]
    else:  # One head token must be chosen from the candidates
        selHead = None
        if entityElement.get("type") == "Binding":
            # Prefer tokens containing binding-related keywords
            # NOTE(review): "h**o" looks garbled — presumably "homo";
            # confirm against the upstream source.
            for t in headTokens:
                compText = t.get("text").lower()
                for bindWord in ("bind", "complex", "h**o", "hetero", "dimer"):
                    if bindWord in compText:
                        selHead = t
                        break
                if selHead != None:
                    break
            # if compText.find("bind") != -1 or compText.find("complex") != -1:
            #     selHead = t
            #     #print "Head:", selHead.get("text"), "/", entityElement.get("text"), entityElement.get("headOffset"), selHead.get("charOffset")
            #     entityElement.set("headOffset", selHead.get("charOffset"))
            #     break
        # elif "egulation" in entityElement.get("type"):
        #     self.getTokenHeadScores()
        #     regulationHeads = [x for x in headTokens if self.tokenHeadScores[x] >= 1]
        #     if len(regulationHeads) > 0:
        #         selHead = regulationHeads[-1]
        if selHead == None:
            token = self.findHeadToken(headTokens)
        else:
            token = selHead
        if verbose:
            print >> sys.stderr, "Selected head:", token.get(
                "id"), token.get("text")
    #assert token != None, entityElement.get("id")
    if token != None:
        # The ElementTree entity-element is modified by setting the headOffset attribute
        if entityElement.get("headOffset") == None or entityElement.get(
                "headOffset") != token.get("charOffset"):
            entityElement.set("headOffset", token.get("charOffset"))
        if not self.entitiesByToken.has_key(token):
            self.entitiesByToken[token] = []
        self.entitiesByToken[token].append(entityElement)
    else:
        print >> sys.stderr, "Warning, no tokens for entity", entityElement.get(
            "id")
    return token
def findHeadsDictionary(corpus, stringsFrom, parse, tokenization):
    """Assign headOffset attributes to multi-part trigger entities using a
    dictionary of known trigger strings and their per-type frequencies.

    NOTE(review): this file contains two definitions of findHeadsDictionary;
    this copy filters by the "given" attribute (instead of "isName") and,
    unlike the other copy, never prints the collected cases or returns
    counts — confirm whether this version was truncated.
    """
    print "Extracting triggers from", stringsFrom
    trigDict = getTriggers(stringsFrom)
    print "Determining trigger distribution"
    distDict = getDistribution(trigDict)
    allStrings = sorted(distDict.keys())
    print "Determining heads for", corpus
    corpusElements = Utils.InteractionXML.CorpusElements.loadCorpus(
        corpus,
        parse,
        tokenization,
        removeIntersentenceInteractions=False,
        removeNameInfo=False)
    cases = {}
    counts = [0, 0]
    for sentence in corpusElements.sentences:
        #print sentence.sentence.get("id")
        sText = sentence.sentence.get("text")
        #tokenHeadScores = None
        for entity in sentence.entities:
            if entity.get("headOffset") != None:
                continue  # head already determined
            if entity.get("given") == "True":  # Only for triggers
                continue
            #if tokenHeadScores == None:
            #    tokenHeadScores = getTokenHeadScores(sentence.tokens, sentence.dependencies, sentenceId=sentence.sentence.get("id"))
            eText = entity.get("text")
            eType = entity.get("type")
            eOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            wsSplits = eText.split()  # Split by whitespace
            if len(wsSplits) == 1 and eText.find(
                    "-"
            ) == -1:  # unambiguous head will be assigned by SYNTAX pass
                continue
            else:  # Entity text has multiple (whitespace or hyphen separated) parts
                # candidate = (score tuple, coords, substring, matched string)
                candidates = []
                # Try to find entity substring in individual entity strings
                for wsTuple in mapSplits(wsSplits, eText, eOffset):
                    if not distDict.has_key(
                            wsTuple[1]):  # string not found, low score
                        candidates.append(
                            ((-1, -1), wsTuple[2], wsTuple[0], wsTuple[1]))
                    else:  # String found, more common ones get higher score
                        # NOTE(review): the assert message indexes distDict with
                        # wsTuple[0] while the condition uses wsTuple[1] — looks
                        # like a typo in the diagnostic; confirm.
                        assert distDict[wsTuple[1]].has_key(eType), (
                            distDict[wsTuple[0]], wsTuple[0], eText)
                        candidates.append((tuple(distDict[wsTuple[1]][eType]),
                                           wsTuple[2], wsTuple[0], wsTuple[1]))
                # Split each whitespace-separated string further into hyphen-separated substrings
                for candidate in candidates[:]:
                    hyphenSplits = candidate[2].split("-")
                    if len(hyphenSplits) > 1:  # Substring has a hyphen
                        # Try to find entity substring in individual entity strings
                        for hyphenTuple in mapSplits(hyphenSplits, eText, candidate[1]):
                            if not distDict.has_key(hyphenTuple[1]):
                                candidates.append(
                                    ((-1, -1), hyphenTuple[2], hyphenTuple[0],
                                     hyphenTuple[1]))
                            else:
                                candidates.append(
                                    (tuple(distDict[hyphenTuple[1]][eType]),
                                     hyphenTuple[2], hyphenTuple[0],
                                     hyphenTuple[1]))
                # Sort candidates, highest scores come first
                candidates.sort(reverse=True)
                # If no matches, look for substrings inside words
                if candidates[0][0][0] in [-1, 0]:  # no matches, look for substrings
                    print "Substring matching", candidates, "for entity", entity.get(
                        "id")
                    for i in range(len(candidates)):
                        candidate = candidates[i]
                        cText = candidate[2]
                        for string in allStrings:
                            subStringPos = cText.find(string)
                            if subStringPos != -1:
                                print " Substring match", string, cText,
                                score = tuple(distDict[string][eType])
                                if score > candidate[0]:
                                    print score, candidate[
                                        0], "Substring selected"  #, score > candidate[0], score < candidate[0]
                                    subStringCoords = [
                                        candidate[1][0] + subStringPos,
                                        len(string)
                                    ]
                                    candidate = (score, subStringCoords,
                                                 candidate[2],
                                                 ">" + string + "<")
                                else:
                                    print score, candidate[0]
                        candidates[i] = candidate
                    # Resort after possibly replacing some candidates
                    candidates.sort(reverse=True)
                if candidates[0][0][0] not in [
                        -1, 0
                ]:  # if it is in [-1, 0], let SYNTAX pass take care of it
                    # Convert the winning candidate's entity-relative coords
                    # to a document-relative inclusive headOffset
                    candidateOffset = (candidates[0][1][0] + eOffset[0],
                                       candidates[0][1][0] +
                                       candidates[0][1][1] + eOffset[0])
                    entity.set(
                        "headOffset",
                        str(candidateOffset[0]) + "-" +
                        str(candidateOffset[1] - 1))
                    entity.set("headMethod", "Dict")
                    entity.set("headString",
                               sText[candidateOffset[0]:candidateOffset[1]])
                    counts[0] += 1
                # Prepare results for printing
                for i in range(len(candidates)):
                    c = candidates[i]
                    candidates[i] = (tuple(c[0]), c[2], c[3])
                case = (eType, eText, tuple(candidates))
                if not cases.has_key(case):
                    cases[case] = 0
                cases[case] += 1
                print entity.get("id"), eType + ": '" + eText + "'", candidates
def addEntitiesToSTDoc(doc, docElement, tMap, eMap, entityElementMap, useOrigIds=False):
    """Convert interaction-XML entity elements into shared-task Annotation
    objects and attach them to the ST document.

    Given ("named") entities go to doc.proteins; predicted triggers go to
    doc.triggers (deduplicated), and trigger-less events are created for
    entities flagged event="True". tMap/eMap/entityElementMap are filled as
    id lookup tables for later processing.
    """
    containerElements = [docElement] + [x for x in docElement.getiterator("sentence")]
    for containerElement in containerElements:
        for entity in containerElement.findall("entity"):
            eType = entity.get("type")
            if eType == "neg":  # skip negative predictions if they are present
                continue
            assert entity.get("id") != None
            entityElementMap[entity.get("id")] = entity
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            ann = Annotation()
            ann.type = eType
            if useOrigIds:
                entityOrigId = entity.get("origId")
                if entityOrigId != None and entityOrigId.find(".") != -1:  # fix gluing of doc and ann id
                    entityOrigId = entityOrigId.rsplit(".", 1)[-1]
                if entityOrigId != None:
                    if entityOrigId[0] == "E":  # a special id denoting a numbered, but triggerless event
                        ann.eventId = entityOrigId
                        ann.id = None
                    else:
                        ann.id = entityOrigId
            ann.text = entity.get("text")
            if entity.get("normalization") != None:
                ann.normalization = entity.get("normalization")
            #assert entityOffset[1] - entityOffset[0] in [len(ann.text), len(ann.text) - 1], (ann.text, entityOffset)
            ann.charOffsets = entityOffsets
            #ann.charBegin = entityOffset[0]
            #ann.charEnd = entityOffset[0] + len(ann.text) # entityOffset[1] + 1
            if containerElement.tag == "sentence":
                # entity offset is relative to the container element, and for
                # sentences, they must be rebased to be relative to the document
                sentenceOffset = Range.charOffsetToSingleTuple(containerElement.get("charOffset"))
                for i in range(len(ann.charOffsets)):
                    ann.charOffsets[i] = (ann.charOffsets[i][0] + sentenceOffset[0], ann.charOffsets[i][1] + sentenceOffset[0])
                #ann.charBegin += sentenceOffset[0]
                #ann.charEnd += sentenceOffset[0]
                # idStem = entity.get("id").split(".e", 1)[0]
                # if sentenceOffsets.has_key(idStem):
                #     sentenceOffset = sentenceOffsets[idStem]
                #     ann.charBegin += sentenceOffset[0]
                #     ann.charEnd += sentenceOffset[0]
            if entity.get("speculation") == "True":
                ann.speculation = True
            if entity.get("negation") == "True":
                ann.negation = True
            ann.extra = getExtraFromElement(entity)  # add all scores and extra data
            if entity.get("given") == "True":
                # Remember to use original id for names!
                if entity.get("origId") != None:
                    ann.id = entity.get("origId").rsplit(".", 1)[-1]
                    # ST protein ids look like "T12": one uppercase letter + digits
                    assert ann.id[0].isupper(), ann.id
                    for c in ann.id[1:]:
                        assert c.isdigit(), ann.id
                doc.proteins.append(ann)
                tMap[entity.get("id")] = ann
                # The part below is dangerous, and incompatibilities should be handled rather
                # by not converting to the shared task format when it cannot be done
                #if entity.get("origId") != None:
                #    # Attempt to process origId, assuming it corresponds to the BioNLP Shared Task format
                #    nonNamedEntityOrigId = entity.get("origId").rsplit(".", 1)[-1]
                #    if len(nonNamedEntityOrigId) > 1 and nonNamedEntityOrigId[0].isupper() and nonNamedEntityOrigId[1:].isdigit():
                #        ann.id = nonNamedEntityOrigId
                #stDoc.proteins.append(ann)
            else:  # a predicted protein or trigger
                duplicateAnn = findDuplicateForSTTrigger(ann, doc.triggers)
                if duplicateAnn == None:
                    doc.triggers.append(ann)
                    tMap[entity.get("id")] = ann
                    # Add confidence scores
                    #ann.extra = getExtraFromElement(entity, ["conf"])
                    #ann.triggerScores = entity.get("predictions")
                    #ann.unmergingScores = entity.get("umStrength")
                    #ann.speculationScores = entity.get("modPred")
                    #ann.negationScores = entity.get("modPred")
                    # Events with 0 interactions (such as some Process-type events) would not be formed when constructing events based on interactions
                    if entity.get("event") == "True":
                        event = makeSTEvent(ann, entityElementMap[entity.get("id")])
                        eMap[entity.get("id")] = event
                        doc.events.append(event)
                else:  # a duplicate trigger already exists
                    tMap[entity.get("id")] = duplicateAnn
def moveElements(document):
    """Move document-level entity and interaction elements into the sentences
    that contain them, rewriting ids and rebasing character offsets.

    @raise Exception: if any entity overlaps no sentence (sentence splitting
        must cover the whole document)
    """
    entMap = {}  # old entity id -> new sentence-scoped id
    entSentence = {}  # old entity id -> sentence element it was moved to
    entSentenceIndex = {}  # old entity id -> index of that sentence
    sentences = document.findall("sentence")
    sentenceCount = 0
    for sentence in sentences:
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            overlaps = False
            for entityOffset in entityOffsets:
                if Range.overlap(sentenceOffset, entityOffset):
                    overlaps = True
                    break
            if overlaps:
                document.remove(entity)
                sentence.append(entity)
                entityId = entity.get("id")
                entityIdLastPart = entityId.rsplit(".", 1)[-1]
                if entityIdLastPart.startswith("e"):
                    # id already uses the ".eN" convention; keep its number
                    entity.set("id", sentence.get("id") + "." + entityIdLastPart)
                    entMap[entityId] = sentence.get("id") + "." + entityIdLastPart
                else:
                    # assign a fresh per-sentence entity number, keep the old id as docId
                    entity.set("docId", entityId)
                    entity.set("id", sentence.get("id") + ".e" + str(entCount))
                    entMap[entityId] = sentence.get("id") + ".e" + str(entCount)
                entSentence[entityId] = sentence
                entSentenceIndex[entityId] = sentenceCount
                #newEntityOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                # Rebase offsets to be sentence-relative; clamp at 0 and drop empty spans
                newEntityOffsets = []
                for entityOffset in entityOffsets:
                    newOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                    newOffset = (max(0, newOffset[0]), max(0, newOffset[1]))
                    if newOffset != (0, 0):
                        assert newOffset[1] > newOffset[0], (entity.attrib, entityOffsets, sentenceOffset)
                        newEntityOffsets.append( (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0]) )
                assert len(newEntityOffsets) > 0, (entity.attrib, entityOffsets, sentenceOffset)
                entity.set("origOffset", entity.get("charOffset"))
                #entity.set("charOffset", str(newEntityOffset[0]) + "-" + str(newEntityOffset[1]))
                entity.set("charOffset", Range.tuplesToCharOffset(newEntityOffsets))
                entCount += 1
        sentenceCount += 1
    if len([x for x in document.findall("entity")]) != 0:
        raise Exception("Sentence splitting does not cover the entire document")
    # Move interactions
    intCount = 0
    interactions = []
    interactionOldToNewId = {}
    for interaction in document.findall("interaction"):
        interactions.append(interaction)
        #if entSentenceIndex[interaction.get("e1")] < entSentenceIndex[interaction.get("e2")]:
        #    targetSentence = entSentence[interaction.get("e1")]
        #else:
        #    targetSentence = entSentence[interaction.get("e2")]
        # Interactions go to a sentence always by e1, as this is the event they are an argument of.
        # If an intersentence interaction is a relation, this shouldn't matter.
        targetSentence = entSentence[interaction.get("e1")]
        document.remove(interaction)
        targetSentence.append(interaction)
        newId = targetSentence.get("id") + ".i" + str(intCount)
        interactionOldToNewId[interaction.get("id")] = newId
        interaction.set("id", newId)
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
        intCount += 1
    # Second pass: siteOf references can point at interactions renamed above
    for interaction in interactions:
        if interaction.get("siteOf") != None:
            interaction.set("siteOf", interactionOldToNewId[interaction.get("siteOf")])
def orderTokens(token1, token2):
    """Comparator ordering two token elements by their character offsets.

    @return: the result of Range.order on the two offset tuples
    """
    offset1 = Range.charOffsetToSingleTuple(token1.get("charOffset"))
    # FIX: offset2 was computed from token1, so every pair compared as equal
    # (the comparator effectively ordered each token against itself)
    offset2 = Range.charOffsetToSingleTuple(token2.get("charOffset"))
    return Range.order(offset1, offset2)
def buildExample(self, token1, token2, paths, sentenceGraph, categoryName,
                 entity1=None, entity2=None, structureAnalyzer=None, isDirected=True):
    """
    Build a single directed example for the potential edge between token1 and token2.

    Returns a (categoryName, features, extra) tuple where features maps
    feature ids to values and extra is a dict of string attributes
    (token/entity ids, path direction, sentence origId etc.).
    Which feature groups are built is controlled by self.styles flags;
    each builder is given the shared feature dict via setFeatureVector and
    detached again afterwards, so the set/unset pairing must be preserved.
    """
    # define features
    features = {}
    # Resolve the shortest dependency path between the tokens; fall back to
    # the token pair itself when no path exists or paths are disabled
    if not self.styles["no_path"]:
        path = paths.getPaths(token1, token2)
        if len(path) > 0:
            path = path[0]
            pathExists = True
        else:
            path = [token1, token2]
            pathExists = False
    else:
        path = [token1, token2]
        pathExists = False
    if not self.styles["no_trigger_features"]:  # F 85.52 -> 85.55
        self.triggerFeatureBuilder.setFeatureVector(features)
        self.triggerFeatureBuilder.tag = "trg1_"
        self.triggerFeatureBuilder.buildFeatures(token1)
        self.triggerFeatureBuilder.tag = "trg2_"
        self.triggerFeatureBuilder.buildFeatures(token2)
        self.triggerFeatureBuilder.setFeatureVector(None)
    # REL features
    if self.styles["rel_features"] and not self.styles["no_task"]:
        self.relFeatureBuilder.setFeatureVector(features)
        self.relFeatureBuilder.tag = "rel1_"
        self.relFeatureBuilder.buildAllFeatures(
            sentenceGraph.tokens, sentenceGraph.tokens.index(token1))
        self.relFeatureBuilder.tag = "rel2_"
        self.relFeatureBuilder.buildAllFeatures(
            sentenceGraph.tokens, sentenceGraph.tokens.index(token2))
        self.relFeatureBuilder.setFeatureVector(None)
    if self.styles["bacteria_renaming"] and not self.styles["no_task"]:
        self.bacteriaRenamingFeatureBuilder.setFeatureVector(features)
        self.bacteriaRenamingFeatureBuilder.buildPairFeatures(
            entity1, entity2)
        #self.bacteriaRenamingFeatureBuilder.buildSubstringFeatures(entity1, entity2) # decreases perf. 74.76 -> 72.41
        self.bacteriaRenamingFeatureBuilder.setFeatureVector(None)
    if self.styles["co_features"] and not self.styles["no_task"]:
        # Containment features between the two entity spans
        e1Offset = Range.charOffsetToSingleTuple(entity1.get("charOffset"))
        e2Offset = Range.charOffsetToSingleTuple(entity2.get("charOffset"))
        if Range.contains(e1Offset, e2Offset):
            features[self.featureSet.getId("e1_contains_e2")] = 1
            if entity2.get("given") == "True":
                features[self.featureSet.getId("e1_contains_e2name")] = 1
        if Range.contains(e2Offset, e1Offset):
            features[self.featureSet.getId("e2_contains_e1")] = 1
            if entity1.get("given") == "True":
                features[self.featureSet.getId("e2_contains_e1name")] = 1
    if self.styles["ddi_features"]:
        self.drugFeatureBuilder.setFeatureVector(features)
        self.drugFeatureBuilder.tag = "ddi_"
        self.drugFeatureBuilder.buildPairFeatures(entity1, entity2)
        if self.styles["ddi_mtmx"]:
            self.drugFeatureBuilder.buildMTMXFeatures(entity1, entity2)
        self.drugFeatureBuilder.setFeatureVector(None)
    if self.styles["graph_kernel"]:
        self.graphKernelFeatureBuilder.setFeatureVector(
            features, entity1, entity2)
        self.graphKernelFeatureBuilder.buildGraphKernelFeatures(
            sentenceGraph, path)
        self.graphKernelFeatureBuilder.setFeatureVector(None)
    if self.styles["entity_type"]:
        e1Type = self.multiEdgeFeatureBuilder.getEntityType(entity1)
        e2Type = self.multiEdgeFeatureBuilder.getEntityType(entity2)
        features[self.featureSet.getId("e1_" + e1Type)] = 1
        features[self.featureSet.getId("e2_" + e2Type)] = 1
        features[self.featureSet.getId("distance_" + str(len(path)))] = 1
    if not self.styles["no_dependency"]:
        #print "Dep features"
        self.multiEdgeFeatureBuilder.setFeatureVector(
            features, entity1, entity2)
        #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast
        if not self.styles["disable_entity_features"]:
            self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
        self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
        if not self.styles["disable_terminus_features"]:
            self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(
                path, sentenceGraph)  # remove for fast
        if not self.styles["disable_single_element_features"]:
            self.multiEdgeFeatureBuilder.buildSingleElementFeatures(
                path, sentenceGraph)
        if not self.styles["disable_ngram_features"]:
            #print "NGrams"
            self.multiEdgeFeatureBuilder.buildPathGrams(
                2, path, sentenceGraph)  # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(
                3, path, sentenceGraph)  # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(
                4, path, sentenceGraph)  # remove for fast
        #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast
        #if edges != None:
        #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast
        #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast
        if not self.styles["disable_path_edge_features"]:
            self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(
                path, sentenceGraph)
        self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
        self.multiEdgeFeatureBuilder.setFeatureVector(None)
    if self.styles["nodalida"]:
        self.nodalidaFeatureBuilder.setFeatureVector(
            features, entity1, entity2)
        shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(
            sentenceGraph.dependencyGraph, path)
        print shortestPaths
        if len(shortestPaths) > 0:
            self.nodalidaFeatureBuilder.buildNGrams(
                shortestPaths, sentenceGraph)
        self.nodalidaFeatureBuilder.setFeatureVector(None)
    if self.styles["linear_features"]:
        self.tokenFeatureBuilder.setFeatureVector(features)
        # Locate both tokens in the sentence token list
        for i in range(len(sentenceGraph.tokens)):
            if sentenceGraph.tokens[i] == token1:
                token1Index = i
            if sentenceGraph.tokens[i] == token2:
                token2Index = i
        linearPreTag = "linfw_"
        if token1Index > token2Index:
            token1Index, token2Index = token2Index, token1Index
            linearPreTag = "linrv_"
        self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index,
                                                          sentenceGraph, 2, 2, preTag="linTok1")
        self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index,
                                                          sentenceGraph, 2, 2, preTag="linTok2")
        # Before, middle, after
        # self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf")
        # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw")
        # self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af")
        # before-middle, middle, middle-after
        # self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2)
        # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2)
        # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2)
        self.tokenFeatureBuilder.setFeatureVector(None)
    if self.styles["random"]:
        self.randomFeatureBuilder.setFeatureVector(features)
        self.randomFeatureBuilder.buildRandomFeatures(100, 0.01)
        self.randomFeatureBuilder.setFeatureVector(None)
    if self.styles["genia_features"] and not self.styles["no_task"]:
        e1Type = entity1.get("type")
        e2Type = entity2.get("type")
        assert (entity1.get("given") in (None, "False"))
        if entity2.get("given") == "True":
            features[self.featureSet.getId("GENIA_target_protein")] = 1
        else:
            features[self.featureSet.getId("GENIA_nested_event")] = 1
        if e1Type.find(
                "egulation"
        ) != -1:  # leave r out to avoid problems with capitalization
            if entity2.get("given") == "True":
                features[self.featureSet.getId(
                    "GENIA_regulation_of_protein")] = 1
            else:
                features[self.featureSet.getId(
                    "GENIA_regulation_of_event")] = 1
    if self.styles["bi_features"]:
        # Make features based on entity types
        e1Type = entity1.get("type")
        e2Type = entity2.get("type")
        e1SuperType = str(self.getBISuperType(e1Type))
        e2SuperType = str(self.getBISuperType(e2Type))
        features[self.featureSet.getId("BI_e1_" + e1Type)] = 1
        features[self.featureSet.getId("BI_e2_" + e2Type)] = 1
        features[self.featureSet.getId("BI_e1sup_" + e1SuperType)] = 1
        features[self.featureSet.getId("BI_e2sup_" + e2SuperType)] = 1
        features[self.featureSet.getId("BI_e1e2_" + e1Type + "_" +
                                       e2Type)] = 1
        features[self.featureSet.getId("BI_e1e2sup_" + e1SuperType + "_" +
                                       e2SuperType)] = 1
    if self.styles["evex"]:
        self.evexFeatureBuilder.setFeatureVector(features, entity1,
                                                 entity2)
        self.evexFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1,
                                                  token2, path,
                                                  sentenceGraph)
        self.evexFeatureBuilder.setFeatureVector(None)
    if self.styles["giuliano"]:
        self.giulianoFeatureBuilder.setFeatureVector(
            features, entity1, entity2)
        self.giulianoFeatureBuilder.buildEdgeFeatures(
            entity1, entity2, token1, token2, path, sentenceGraph)
        self.giulianoFeatureBuilder.setFeatureVector(None)
    # define extra attributes
    # Record path terminus token ids in text order; "deprev" marks that the
    # dependency path ran against text order
    if int(path[0].get("charOffset").split("-")[0]) < int(
            path[-1].get("charOffset").split("-")[0]):
        extra = {
            "xtype": "edge",
            "type": "i",
            "t1": path[0].get("id"),
            "t2": path[-1].get("id")
        }
        extra["deprev"] = False
    else:
        extra = {
            "xtype": "edge",
            "type": "i",
            "t1": path[-1].get("id"),
            "t2": path[0].get("id")
        }
        extra["deprev"] = True
    if entity1 != None:
        extra["e1"] = entity1.get("id")
        if sentenceGraph.mergedEntityToDuplicates != None:
            extra["e1DuplicateIds"] = ",".join([
                x.get("id")
                for x in sentenceGraph.mergedEntityToDuplicates[entity1]
            ])
    if entity2 != None:
        extra["e2"] = entity2.get("id")
        if sentenceGraph.mergedEntityToDuplicates != None:
            extra["e2DuplicateIds"] = ",".join([
                x.get("id")
                for x in sentenceGraph.mergedEntityToDuplicates[entity2]
            ])
    extra["categoryName"] = categoryName
    if self.styles["bacteria_renaming"]:
        # Escape entity text so it survives the example id format
        if entity1.get("text") != None and entity1.get("text") != "":
            extra["e1t"] = entity1.get("text").replace(" ", "---").replace(
                ":", "-COL-")
        if entity2.get("text") != None and entity2.get("text") != "":
            extra["e2t"] = entity2.get("text").replace(" ", "---").replace(
                ":", "-COL-")
    sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
    if sentenceOrigId != None:
        extra["SOID"] = sentenceOrigId
    extra["directed"] = str(isDirected)
    return (categoryName, features, extra)
def toSTFormat(
        input,
        output=None,
        outputTag="a2",
        useOrigIds=False,
        debug=False,
        task=2,
        validate=True,
        writeScores=False
):
    """
    Convert an interaction-XML corpus into Shared Task format Document objects.

    For each document element: rebuilds the document text from its sentences,
    converts entities into protein/trigger Annotations (with document-level
    character offsets), then turns interactions into events, sites and
    relations, resolving argument id references into the Annotation objects.
    If output is given, the documents are also written to disk via writeSet.
    Returns the list of Document objects.
    """
    print >>sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >>sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    nonEntitySiteCount = 0
    documents = []
    for document in corpusRoot.findall("document"):
        stDoc = Document()
        stDoc.proteins = []
        stDoc.triggers = []
        stDoc.events = []
        stDoc.relations = []
        stDoc.id = document.get("pmid")
        if stDoc.id == None:
            stDoc.id = document.get("origId")
        stDoc.text = ""
        documents.append(stDoc)
        eMap = {}             # entity id -> event Annotation (keyed by trigger id)
        tMap = {}             # entity id -> trigger/protein Annotation
        siteMap = {}          # argument entity id -> site trigger Annotation
        siteScores = {}       # argument entity id -> site prediction scores
        sites = []
        sentenceOffsets = {}  # sentence id -> (begin, end) document offset
        # Reassemble the document text (head + text + tail of each sentence)
        for sentence in document.findall("sentence"):
            head = sentence.get("head")
            if head != None:
                stDoc.text += head
            stDoc.text += sentence.get("text")
            tail = sentence.get("tail")
            if tail != None:
                stDoc.text += tail
            sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
            sentenceOffsets[sentence.get("id")] = sentenceOffset
            if stDoc.id == None:
                stDoc.id = sentence.get("origId").rsplit(".", 1)[0]
        entityElementMap = {}  # for task 3
        for entity in document.getiterator("entity"):
            eType = entity.get("type")
            if eType == "neg":
                continue
            entityElementMap[entity.get("id")] = entity
            entityOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            ann = Annotation()
            ann.type = eType
            if useOrigIds:
                entityOrigId = entity.get("origId")
                if entityOrigId != None and entityOrigId.find(".") != -1:  # fix gluing of doc and ann id
                    entityOrigId = entityOrigId.rsplit(".", 1)[-1]
                if entityOrigId != None:
                    if entityOrigId[0] == "E":  # a special id denoting a numbered, but triggerless event
                        ann.eventId = entityOrigId
                        ann.id = None
                    else:
                        ann.id = entityOrigId
            ann.text = entity.get("text")
            assert entityOffset[1] - entityOffset[0] in [len(ann.text), len(ann.text) - 1], (ann.text, entityOffset)
            ann.charBegin = entityOffset[0]
            ann.charEnd = entityOffset[0] + len(ann.text)  # entityOffset[1] + 1
            # Shift sentence-relative offsets to document coordinates using the
            # sentence id embedded in the entity id
            idStem = entity.get("id").split(".e", 1)[0]
            if sentenceOffsets.has_key(idStem):
                sentenceOffset = sentenceOffsets[idStem]
                ann.charBegin += sentenceOffset[0]
                ann.charEnd += sentenceOffset[0]
            if entity.get("speculation") == "True":
                ann.speculation = True
            if entity.get("negation") == "True":
                ann.negation = True
            if entity.get("isName") == "True":
                # Remember to use original id for names!
                if entity.get("origId") != None:
                    ann.id = entity.get("origId").rsplit(".", 1)[-1]
                    assert ann.id[0].isupper(), ann.id
                    for c in ann.id[1:]:
                        assert c.isdigit(), ann.id
                stDoc.proteins.append(ann)
                # The part below is dangerous, and incompatibilities should be handled rather
                # by not converting to the shared task format when it cannot be done
                # if entity.get("origId") != None:
                #     # Attempt to process origId, assuming it corresponds to the BioNLP Shared Task format
                #     nonNamedEntityOrigId = entity.get("origId").rsplit(".", 1)[-1]
                #     if len(nonNamedEntityOrigId) > 1 and nonNamedEntityOrigId[0].isupper() and nonNamedEntityOrigId[1:].isdigit():
                #         ann.id = nonNamedEntityOrigId
                # stDoc.proteins.append(ann)
            else:
                found = False  # prevent duplicate triggers
                for trigger in stDoc.triggers:
                    if (
                            trigger.charBegin == ann.charBegin
                            and trigger.charEnd == ann.charEnd
                            and trigger.text == ann.text
                            and trigger.type == ann.type
                    ):
                        found = True
                        ann = trigger
                        break
                if not found:
                    stDoc.triggers.append(ann)
            assert entity.get("id") != None
            tMap[entity.get("id")] = ann
            if entity.get("type") == "Process":  # these can have 0 interactions
                event = Annotation()
                event.trigger = ann
                event.type = event.trigger.type
                eMap[entity.get("id")] = event
                if entityElementMap[entity.get("id")].get("speculation") == "True":
                    event.speculation = True
                if entityElementMap[entity.get("id")].get("negation") == "True":
                    event.negation = True
                stDoc.events.append(event)
            # Add confidence scores
            ann.triggerScores = entity.get("predictions")
            ann.unmergingScores = entity.get("umStrength")
            ann.speculationScores = entity.get("modPred")
            ann.negationScores = entity.get("modPred")
        # First map Coref proteins
        corefProtMap = {}
        for interaction in document.getiterator("interaction"):
            intType = interaction.get("type")
            if intType == "Target":
                e1 = interaction.get("e1")
                e2 = interaction.get("e2")
                if not tMap.has_key(e2):
                    print >>sys.stderr, "Warning, no trigger for Coref Protein Target"
                    continue
                e2 = tMap[e2]
                if not corefProtMap.has_key(e1):
                    corefProtMap[e1] = []
                if not e2 in corefProtMap[e1]:
                    corefProtMap[e1].append(e2)
        # Then process all interactions
        for interaction in document.getiterator("interaction"):
            intType = interaction.get("type")
            if intType == "neg" or intType == "Target":
                continue  # Targets have already been put into a dictionary
            # elif intType in ["Site", "Gene_expression", "Transcription", "Protein_catabolism", "Localization", "Binding", "Phosphorylation", "Positive_regulation", "Negative_regulation", "Regulation"]:
            # elif intType in ["Site", "Gene_expression", "Transcription", "Protein_catabolism", "Localization", "Binding", "Phosphorylation", "Positive_regulation", "Negative_regulation", "Regulation",
            #                  "InputAssociation", "InputProcess", "InputInhibitor", "OutputProcess"]:
            if "/" in intType and "(" in intType:  # BI-task
                # Type encodes "EventType(Arg1Type/Arg2Type" -> triggerless event
                eventType, argTypes = intType.split("(")
                arg1Type, arg2Type = argTypes[:-1].split("/")
                event = Annotation()
                event.trigger = None  # triggerless event (same as relation)
                event.type = eventType
                event.arguments.append([arg1Type, interaction.get("e1"), None])
                event.arguments.append([arg2Type, interaction.get("e2"), None])
                if event.arguments[0][0] == "SiteArg":  # convert back to actual sites
                    event.arguments[0][0] = "Site"
                if event.arguments[1][0] == "SiteArg":  # convert back to actual sites
                    event.arguments[1][0] = "Site"
                # event.speculation = entityElementMap[e1].get("speculation")
                # event.negation = entityElementMap[e1].get("negation")
                stDoc.events.append(event)
            elif intType not in [
                    "Protein-Component",
                    "Subunit-Complex",
                    "Renaming",
                    "Coref",
                    "SR-subunitof",
                    "SR-equivto",
                    "SR-partof",
                    "SR-memberof",
            ]:
                # if intType == "Site" and tMap[interaction.get("e1")].type == "Entity":
                if intType == "Site":
                    # These sites are real sites (i.e. task 2 sites).
                    # Other sites are just arguments called "site"
                    # sites.append(interaction)
                    siteMap[interaction.get("e2")] = tMap[interaction.get("e1")]
                    siteScores[interaction.get("e2")] = interaction.get("predictions")
                else:
                    e1 = interaction.get("e1")
                    if eMap.has_key(e1):  # event has already been created
                        event = eMap[e1]  # eMap lists events by their trigger ids
                    else:
                        eventType = tMap[interaction.get("e1")].type
                        if eventType != "Entity":  # "Entity"-type entities are never event roots
                            event = Annotation()
                            event.trigger = tMap[interaction.get("e1")]
                            event.type = event.trigger.type
                            if hasattr(event.trigger, "eventId"):
                                event.id = event.trigger.eventId
                            eMap[e1] = event
                            if entityElementMap[e1].get("speculation") == "True":
                                event.speculation = True
                            if entityElementMap[e1].get("negation") == "True":
                                event.negation = True
                            stDoc.events.append(event)
                        else:
                            event = None
                    if event != None:
                        arg = [interaction.get("type"), interaction.get("e2"), None, interaction.get("predictions")]
                        if arg[0] == "SiteArg":  # convert back to actual sites
                            arg[0] = "Site"
                            if arg[3] != None:  # Convert also prediction strengths
                                arg[3] = arg[3].replace("SiteArg", "Site")
                        event.arguments.append(arg)
            else:  # interaction is a relation
                rel = Annotation()
                rel.type = interaction.get("type")
                e1 = interaction.get("e1")
                e2 = interaction.get("e2")
                relScores = interaction.get("predictions")
                # assert rel.type == "Protein-Component" or rel.type == "Subunit-Complex" or rel.type == "Renaming", (rel.type, stDoc.id, interaction.get("id"))
                if rel.type == "Protein-Component" or rel.type == "Subunit-Complex":
                    rel.arguments.append(["Arg1", tMap[e1], None, relScores])
                    rel.arguments.append(["Arg2", tMap[e2], None, relScores])
                elif rel.type == "Renaming":
                    rel.arguments.append(["Former", tMap[e1], None, relScores])
                    rel.arguments.append(["New", tMap[e2], None, relScores])
                elif rel.type == "Coref":
                    rel.arguments.append(["Anaphora", tMap[e1], None, relScores])
                    rel.arguments.append(["Antecedent", tMap[e2], None, relScores])
                    # Add protein arguments'
                    if corefProtMap.has_key(e2):
                        for prot in corefProtMap[e2]:
                            rel.arguments.append(["Target", prot, None])
                elif rel.type.startswith("SR-"):
                    rel.arguments.append(["Arg1", tMap[e1], None, relScores])
                    rel.arguments.append(["Arg2", tMap[e2], None, relScores])
                else:
                    assert False, (rel.type, stDoc.id, interaction.get("id"))
                stDoc.relations.append(rel)
        # Map argument targets: replace entity-id strings with the event or
        # trigger Annotation objects, and attach task-2 sites
        for event in stDoc.events:
            for arg in event.arguments[:]:
                if arg[1] == None:
                    assert False
                    continue
                id = arg[1]
                if eMap.has_key(id):
                    arg[1] = eMap[id]
                elif tMap.has_key(id):
                    arg[1] = tMap[id]
                ## Remove Entity-type triggers if they are Regulation-arguments
                # if "egulation" in event.type and tMap[id].type != "Protein":
                #     event.arguments.remove(arg)
                # add sites
                if siteMap.has_key(id):
                    if siteMap[id].type == "Entity":
                        assert id not in eMap
                        assert id in tMap
                        arg[2] = siteMap[id]
                        if id in siteScores and siteScores[id] != None:
                            while len(arg) < 5:
                                arg += [None]
                            assert arg[4] == None
                            arg[4] = siteScores[id]
                    else:
                        nonEntitySiteCount += 1
                    # assert siteMap[id].type == "Entity", (stDoc.id, event.id, id, siteMap[id].id, siteMap[id].type)
        # # Remove eventless triggers
        # triggersToKeep = []
        # for trigger in stDoc.triggers:
        #     if trigger.type == "Entity":
        #         triggersToKeep.append(trigger)
        #     else:
        #         for event in stDoc.events:
        #             if event.trigger == trigger:
        #                 triggersToKeep.append(trigger)
        #                 break
        # stDoc.triggers = triggersToKeep
        # Sort arguments
        # for eKey in sorted(eMap.keys()):
        #     event = eMap[eKey]
        #     event.arguments.sort(cmp=compareArguments)
        # Create STFormat ids
        # updateIds(stDoc.proteins)
        # updateIds(stDoc.triggers, getMaxId(stDoc.proteins) + 1)
        # updateIds(stDoc.events)
        # updateIds(stDoc.relations)
    if nonEntitySiteCount > 0:
        print >>sys.stderr, "Warning, discarded", nonEntitySiteCount, "non-entity sites"
    if output != None:
        print >>sys.stderr, "Writing output to", output
        writeSet(
            documents,
            output,
            resultFileTag=outputTag,
            debug=debug,
            task=task,
            validate=validate,
            writeScores=writeScores,
        )
    return documents
def buildFeatures(self, sentenceGraph, entity1, entity2, token1, token2, path):
    """
    Build the feature dictionary for one candidate edge (entity1/entity2
    anchored at token1/token2) using the already-resolved dependency path.

    Each feature group is toggled by a self.styles flag; builders receive
    the shared feature dict via setFeatureVector and are detached with
    setFeatureVector(None) afterwards, so the pairing must be preserved.
    Returns the feature dict mapping feature ids to values.
    """
    features = {}
    if not self.styles["no_trigger_features"]:  # F 85.52 -> 85.55
        self.triggerFeatureBuilder.setFeatureVector(features)
        self.triggerFeatureBuilder.tag = "trg1_"
        self.triggerFeatureBuilder.buildFeatures(token1)
        self.triggerFeatureBuilder.tag = "trg2_"
        self.triggerFeatureBuilder.buildFeatures(token2)
        self.triggerFeatureBuilder.setFeatureVector(None)
    # REL features
    if self.styles["rel_features"] and not self.styles["no_task"]:
        self.relFeatureBuilder.setFeatureVector(features)
        self.relFeatureBuilder.tag = "rel1_"
        self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token1))
        self.relFeatureBuilder.tag = "rel2_"
        self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token2))
        self.relFeatureBuilder.setFeatureVector(None)
    if self.styles["bacteria_renaming"] and not self.styles["no_task"]:
        self.bacteriaRenamingFeatureBuilder.setFeatureVector(features)
        self.bacteriaRenamingFeatureBuilder.buildPairFeatures(entity1, entity2)
        #self.bacteriaRenamingFeatureBuilder.buildSubstringFeatures(entity1, entity2) # decreases perf. 74.76 -> 72.41
        self.bacteriaRenamingFeatureBuilder.setFeatureVector(None)
    if self.styles["co_features"] and not self.styles["no_task"]:
        # Containment features between the two entity spans
        e1Offset = Range.charOffsetToSingleTuple(entity1.get("charOffset"))
        e2Offset = Range.charOffsetToSingleTuple(entity2.get("charOffset"))
        if Range.contains(e1Offset, e2Offset):
            features[self.featureSet.getId("e1_contains_e2")] = 1
            if entity2.get("given") == "True":
                features[self.featureSet.getId("e1_contains_e2name")] = 1
        if Range.contains(e2Offset, e1Offset):
            features[self.featureSet.getId("e2_contains_e1")] = 1
            if entity1.get("given") == "True":
                features[self.featureSet.getId("e2_contains_e1name")] = 1
    if self.styles["drugbank_features"]:
        self.drugFeatureBuilder.setFeatureVector(features)
        self.drugFeatureBuilder.tag = "ddi_"
        self.drugFeatureBuilder.buildPairFeatures(entity1, entity2)
        if self.styles["ddi_mtmx"]:
            self.drugFeatureBuilder.buildMTMXFeatures(entity1, entity2)
        self.drugFeatureBuilder.setFeatureVector(None)
    if self.styles["graph_kernel"]:
        self.graphKernelFeatureBuilder.setFeatureVector(features, entity1, entity2)
        self.graphKernelFeatureBuilder.buildGraphKernelFeatures(sentenceGraph, path)
        self.graphKernelFeatureBuilder.setFeatureVector(None)
    if self.styles["entity_type"]:
        e1Type = self.multiEdgeFeatureBuilder.getEntityType(entity1)
        e2Type = self.multiEdgeFeatureBuilder.getEntityType(entity2)
        features[self.featureSet.getId("e1_"+e1Type)] = 1
        features[self.featureSet.getId("e2_"+e2Type)] = 1
        features[self.featureSet.getId("distance_"+str(len(path)))] = 1
    if not self.styles["no_dependency"]:
        #print "Dep features"
        self.multiEdgeFeatureBuilder.setFeatureVector(features, entity1, entity2)
        #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast
        if not self.styles["disable_entity_features"]:
            self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
        self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
        if not self.styles["disable_terminus_features"]:
            self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph)  # remove for fast
        if not self.styles["disable_single_element_features"]:
            self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, sentenceGraph)
        if not self.styles["disable_ngram_features"]:
            #print "NGrams"
            self.multiEdgeFeatureBuilder.buildPathGrams(2, path, sentenceGraph)  # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(3, path, sentenceGraph)  # remove for fast
            self.multiEdgeFeatureBuilder.buildPathGrams(4, path, sentenceGraph)  # remove for fast
        #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast
        #if edges != None:
        #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast
        #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast
        if not self.styles["disable_path_edge_features"]:
            self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, sentenceGraph)
        self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
        self.multiEdgeFeatureBuilder.setFeatureVector(None)
    if self.styles["nodalida"]:
        self.nodalidaFeatureBuilder.setFeatureVector(features, entity1, entity2)
        shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(sentenceGraph.dependencyGraph, path)
        print shortestPaths
        if len(shortestPaths) > 0:
            self.nodalidaFeatureBuilder.buildNGrams(shortestPaths, sentenceGraph)
        self.nodalidaFeatureBuilder.setFeatureVector(None)
    if self.styles["linear_features"]:
        self.tokenFeatureBuilder.setFeatureVector(features)
        # Locate both tokens in the sentence token list
        for i in range(len(sentenceGraph.tokens)):
            if sentenceGraph.tokens[i] == token1:
                token1Index = i
            if sentenceGraph.tokens[i] == token2:
                token2Index = i
        linearPreTag = "linfw_"
        if token1Index > token2Index:
            token1Index, token2Index = token2Index, token1Index
            linearPreTag = "linrv_"
        self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1")
        self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2")
        # Before, middle, after
        # self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf")
        # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw")
        # self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af")
        # before-middle, middle, middle-after
        # self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2)
        # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2)
        # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2)
        self.tokenFeatureBuilder.setFeatureVector(None)
    if self.styles["random"]:
        self.randomFeatureBuilder.setFeatureVector(features)
        self.randomFeatureBuilder.buildRandomFeatures(100, 0.01)
        self.randomFeatureBuilder.setFeatureVector(None)
    if self.styles["genia_features"] and not self.styles["no_task"]:
        e1Type = entity1.get("type")
        e2Type = entity2.get("type")
        assert(entity1.get("given") in (None, "False"))
        if entity2.get("given") == "True":
            features[self.featureSet.getId("GENIA_target_protein")] = 1
        else:
            features[self.featureSet.getId("GENIA_nested_event")] = 1
        if e1Type.find("egulation") != -1:  # leave r out to avoid problems with capitalization
            if entity2.get("given") == "True":
                features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1
            else:
                features[self.featureSet.getId("GENIA_regulation_of_event")] = 1
    if self.styles["bi_features"]:
        # Make features based on entity types
        e1Type = entity1.get("type")
        e2Type = entity2.get("type")
        e1SuperType = str(self.getBISuperType(e1Type))
        e2SuperType = str(self.getBISuperType(e2Type))
        features[self.featureSet.getId("BI_e1_"+e1Type)] = 1
        features[self.featureSet.getId("BI_e2_"+e2Type)] = 1
        features[self.featureSet.getId("BI_e1sup_"+e1SuperType)] = 1
        features[self.featureSet.getId("BI_e2sup_"+e2SuperType)] = 1
        features[self.featureSet.getId("BI_e1e2_"+e1Type+"_"+e2Type)] = 1
        features[self.featureSet.getId("BI_e1e2sup_"+e1SuperType+"_"+e2SuperType)] = 1
    if self.styles["sdb_features"]:
        e1Type = entity1.get("type")
        e2Type = entity2.get("type")
        features[self.featureSet.getId("SDB_e1_"+e1Type)] = 1
        features[self.featureSet.getId("SDB_e2_"+e2Type)] = 1
        features[self.featureSet.getId("SDB_e1e2_"+e1Type+"_"+e2Type)] = 1
        if e1Type == e2Type:
            features[self.featureSet.getId("SDB_e1e2_equal")] = 1
            features[self.featureSet.getId("SDB_e1e2_equal_" + e1Type)] = 1
        # NOTE(review): str() around getSeeDevSuperTypes' result means the
        # loops below iterate over the *characters* of its string form if it
        # returns a collection — confirm the intended return type.
        e1SuperTypes = str(self.getSeeDevSuperTypes(e1Type))
        e2SuperTypes = str(self.getSeeDevSuperTypes(e2Type))
        for e1SuperType in e1SuperTypes:
            for e2SuperType in e2SuperTypes:
                features[self.featureSet.getId("SDB_e1sup_"+e1SuperType)] = 1
                features[self.featureSet.getId("SDB_e2sup_"+e2SuperType)] = 1
                features[self.featureSet.getId("SDB_e1e2sup_"+e1SuperType+"_"+e2SuperType)] = 1
                if e1SuperType == e2SuperType:
                    features[self.featureSet.getId("SDB_e1e2sup_equal")] = 1
                    features[self.featureSet.getId("SDB_e1e2sup_equal_" + e1SuperType)] = 1
    if self.styles["ontobiotope_features"]:
        self.ontobiotopeFeatureBuilder.setFeatureVector(features)
        self.ontobiotopeFeatureBuilder.buildOBOFeaturesForEntityPair(entity1, entity2)
        self.ontobiotopeFeatureBuilder.setFeatureVector(None)
    if self.styles["full_entities"]:
        # Lower-cased full entity texts and all token-pair combinations
        e1Text = entity1.get("text").lower()
        e2Text = entity2.get("text").lower()
        features[self.featureSet.getId("FULL_e1_"+e1Text)] = 1
        features[self.featureSet.getId("FULL_e2_"+e2Text)] = 1
        for ep1 in e1Text.split():
            for ep2 in e2Text.split():
                features[self.featureSet.getId("FULL_e1_"+ep1)] = 1
                features[self.featureSet.getId("FULL_e2_"+ep2)] = 1
                features[self.featureSet.getId("FULL_e1e2_"+ep1+"_"+ep2)] = 1
    if self.styles["evex"]:
        self.evexFeatureBuilder.setFeatureVector(features, entity1, entity2)
        self.evexFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph)
        self.evexFeatureBuilder.setFeatureVector(None)
    if self.styles["wordnet"]:
        self.wordNetFeatureBuilder.setFeatureVector(features, entity1, entity2)
        self.wordNetFeatureBuilder.buildFeaturesForEntityPair(token1, token2)
        self.wordNetFeatureBuilder.buildLinearFeatures(token1, sentenceGraph.tokens, tag="t1_")
        self.wordNetFeatureBuilder.buildLinearFeatures(token2, sentenceGraph.tokens, tag="t2_")
        self.wordNetFeatureBuilder.buildPathFeatures(path)
        self.wordNetFeatureBuilder.setFeatureVector(None)
    if self.styles["wordvector"]:
        self.wordVectorFeatureBuilder.setFeatureVector(features, entity1, entity2)
        self.wordVectorFeatureBuilder.buildFeatures(token1, "t1_")
        self.wordVectorFeatureBuilder.buildFeatures(token2, "t2_")
        self.wordVectorFeatureBuilder.buildLinearFeatures(token1, sentenceGraph.tokens, tag="t1_")
        self.wordVectorFeatureBuilder.buildLinearFeatures(token2, sentenceGraph.tokens, tag="t2_")
        self.wordVectorFeatureBuilder.buildPathFeatures(path)
        self.wordVectorFeatureBuilder.buildFBAFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token1), sentenceGraph.tokens.index(token2))
        self.wordVectorFeatureBuilder.setFeatureVector(None)
    if self.styles["giuliano"]:
        self.giulianoFeatureBuilder.setFeatureVector(features, entity1, entity2)
        self.giulianoFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph)
        self.giulianoFeatureBuilder.setFeatureVector(None)
    return features
def exportChemProtPredictions(xml, outPath, fileTypes="predictions", setNames=None):
    """
    Export an interaction-XML corpus into ChemProt-style tab-separated files.

    One output file per (document set, file type) pair is opened lazily via the
    project helper openOutFile, which caches handles in the outFiles/openFiles
    dicts passed to it (side effect: both dicts are mutated here).

    Arguments:
        xml: an interaction-XML tree, or anything ETUtils.ETFromObj can load.
        outPath: base path for the exported files.
        fileTypes: "all", a comma-separated string, or a list drawn from
            {"predictions", "abstracts", "entities", "relations"}.
        setNames: optional dict remapping document 'set' attribute values.

    Returns the (possibly loaded) ElementTree object.
    Raises Exception for an unknown file type name.
    """
    # Normalize fileTypes into a validated list.
    if fileTypes == "all":
        fileTypes = ["predictions", "abstracts", "entities", "relations"]
    elif isinstance(fileTypes, basestring):
        fileTypes = fileTypes.split(",")
    for fileType in fileTypes:
        if fileType not in [ "predictions", "abstracts", "entities", "relations" ]:
            raise Exception("Unknown ChemProt file type '" + str(fileType) + "'")
    xml = ETUtils.ETFromObj(xml)
    #with open(outPath, "wt") as f
    outFiles = {}   # per-set dict of file-type -> handle (managed by openOutFile)
    openFiles = {}  # flat registry of every opened handle, for closing at the end
    for document in xml.getiterator("document"):
        docId = document.get("origId")
        setName = document.get("set")
        if setNames != None:
            # Optionally remap the set name (e.g. rename a split for export).
            setName = setNames.get(setName, setName)
        if setName not in outFiles:
            outFiles[setName] = {}
        # --- abstracts file: one line per document: id, title, abstract ---
        outFile = openOutFile(setName, outPath, "abstracts", fileTypes, outFiles, openFiles)
        if outFile != None:
            docText = document.get("text")
            #assert docText.count("\t") == 1, (docText.count("\t"), document.attrib)
            #title, abstract = docText.split("\t")
            #titleLength = document.get("titleLength")
            # The title span is taken from the 'titleOffset' attribute; the
            # character right after it (presumably a separator) is skipped.
            titleOffset = Range.charOffsetToSingleTuple(
                document.get("titleOffset"))
            assert titleOffset[0] == 0
            outFile.write("\t".join([
                docId, docText[:titleOffset[1]], docText[titleOffset[1] + 1:]
            ]) + "\n")
        # --- entities file: one line per entity ---
        entityById = {}
        for entity in document.getiterator("entity"):
            outFile = openOutFile(setName, outPath, "entities", fileTypes,
                                  outFiles, openFiles)
            if outFile != None:
                eType = entity.get("type")
                # GENE entities with a 'normalized' attribute get a -Y/-N
                # suffix encoding whether normalization succeeded.
                if entity.get("normalized") != None and entity.get(
                        "type") == "GENE":
                    eType += "-Y" if entity.get(
                        "normalized") == "True" else "-N"
                offset = Range.charOffsetToSingleTuple(
                    entity.get("charOffset"))
                outFile.write("\t".join([
                    docId,
                    entity.get("origId"), eType,
                    str(offset[0]),
                    str(offset[1]),
                    entity.get("text")
                ]) + "\n")
            # Index entities by id even when the entities file is not written,
            # because the interaction export below needs the e1/e2 lookup.
            assert entity.get("id") not in entityById
            entityById[entity.get("id")] = entity
        # --- relations and predictions files: one line per interaction ---
        for interaction in document.getiterator("interaction"):
            e1 = entityById[interaction.get("e1")]
            e2 = entityById[interaction.get("e2")]
            outFile = openOutFile(setName, outPath,
                                  "relations", fileTypes, outFiles, openFiles)
            if outFile != None:
                # NOTE(review): "Y "/"N " carry a trailing space while the
                # unknown marker "X" does not — presumably dictated by the
                # ChemProt gold-relations file format; confirm before changing.
                evaluated = "X"
                if interaction.get("evaluated") != None:
                    evaluated = "Y " if interaction.get(
                        "evaluated") == "True" else "N "
                outFile.write("\t".join([
                    docId,
                    interaction.get("type"), evaluated,
                    interaction.get("relType"), "Arg1:" + e1.get("origId"),
                    "Arg2:" + e2.get("origId")
                ]) + "\n")
            outFile = openOutFile(setName, outPath, "predictions", fileTypes,
                                  outFiles, openFiles)
            if outFile != None:
                outFile.write("\t".join([
                    docId,
                    interaction.get("type"), "Arg1:" + e1.get("origId"),
                    "Arg2:" + e2.get("origId")
                ]) + "\n")
    print >> sys.stderr, "Closing output files"
    for f in openFiles.values():
        f.close()
    return xml
def mergeSentences(input, output, verbose=False):
    """
    Merge the sentence elements of each document into a single document-level
    text, moving all entities and interactions up to the document element.

    Sentence-level entity offsets are shifted to document level, entity and
    interaction ids are renumbered as <docId>.eN / <docId>.iN, and interaction
    'e1'/'e2'/'siteOf' references are remapped to the new ids. Several
    consistency checks (offsets vs. origOffset, combined text vs. document
    text, entity text vs. its offset span) raise Exception on mismatch.

    Arguments:
        input: corpus file path or tree, loadable by ETUtils.ETFromObj.
        output: optional path to write the merged corpus to (skipped if None).
        verbose: if True, print a warning for trailing-whitespace mismatches.

    Returns the modified ElementTree.
    """
    print >> sys.stderr, "Merging sentences into documents"
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()
    counts = defaultdict(int)
    for document in corpusRoot.findall("document"):
        counts["documents"] += 1
        # Check that the document has only sentence elements as children
        children = [x for x in document]
        docChildTypes = sorted(set([x.tag for x in children]))
        if len(docChildTypes) == 0:
            counts["documents-with-no-sentences"] += 1
            continue
        elif len(docChildTypes) > 1 or docChildTypes[0] != "sentence":
            raise Exception("Document '" + str(document.get("id")) +
                            "' has non-sentence children: " +
                            str(docChildTypes))
        # Process all the child sentence elements
        docId = document.get("id")
        interactions = []
        entities = []
        entityById = {}
        interactionById = {}
        combinedText = ""
        calculatedOffset = (0, 0)
        for sentence in children:
            document.remove(sentence)
            sentenceText = sentence.get("head", "") + sentence.get(
                "text", "") + sentence.get("tail", "")
            sentOffset = sentence.get("charOffset")
            # All sentences must agree on whether they carry explicit offsets.
            if sentence == children[0]:
                noDefinedOffsets = sentOffset == None
            elif (sentOffset == None) != noDefinedOffsets:
                raise Exception("Only some sentences in document '" + docId +
                                "' have defined offsets")
            if sentOffset == None:
                # No explicit offsets: place sentences back to back, separated
                # by a single space (except after the last one).
                if sentence != children[-1]:
                    sentenceText = sentenceText + " "
                calculatedOffset = (calculatedOffset[1],
                                    calculatedOffset[1] + len(sentenceText))
                sentOffset = calculatedOffset
            else:
                sentOffset = Range.charOffsetToSingleTuple(sentOffset)
            combinedText += sentenceText
            # Collect and update the entity elements
            for entity in sentence.findall("entity"):
                # Map sentence-level entity offsets to document level
                for offsetKey in ("charOffset", "headOffset"):
                    if entity.get(offsetKey) != None:
                        offset = Range.charOffsetToTuples(
                            entity.get(offsetKey))
                        for i in range(len(offset)):
                            offset[i] = (offset[i][0] + sentOffset[0],
                                         offset[i][1] + sentOffset[0])
                        entity.set(offsetKey,
                                   Range.tuplesToCharOffset(offset))
                # Compare mapped offsets to origOffset, if available
                if entity.get("origOffset") != None:
                    if entity.get("charOffset") != entity.get("origOffset"):
                        raise Exception(
                            "Document '" + str(document.get("id")) +
                            "' entity '" + str(entity.get("id")) +
                            "' new charOffset differs from origOffset: " +
                            str([
                                entity.get("charOffset"),
                                entity.get("origOffset")
                            ]))
                    counts["checked-origOffsets"] += 1
                    del entity.attrib["origOffset"]
                assert entity.get("id") not in entityById
                entityById[entity.get(
                    "id"
                )] = entity  # For re-mapping the interaction 'e1' and 'e2' attributes
                entities.append(entity)
                counts["moved-entities"] += 1
            # Collect and update the interaction elements
            for interaction in sentence.findall("interaction"):
                assert interaction.get("id") not in interactionById
                interactionById[interaction.get(
                    "id"
                )] = interaction  # For re-mapping the interaction 'siteOf' attributes
                interactions.append(interaction)
                counts["moved-interactions"] += 1
        # Check that the combined sentence text matches the document text, if available
        if document.get("text") != None and document.get(
                "text") != combinedText:
            if combinedText == document.get(
                    "text")[0:len(combinedText)] and document.get(
                        "text")[len(combinedText):].strip() == "":
                # Only trailing whitespace differs: accept the document text.
                if verbose:
                    print >> sys.stderr, "Warning, document '" + document.get(
                        "id"
                    ) + "' text has trailing whitespace not included in the combined sentence text"
                combinedText = document.get("text")
                counts["missing-trailing-whitespace"] += 1
            else:
                raise Exception(
                    "Document '" + str(document.get("id")) +
                    "' text differs from combined sentence text: " +
                    str([document.get("text"), combinedText]))
            counts["checked-document-texts"] += 1
        # Check that the entities' texts match the document text
        for entity in entities:
            offset = Range.charOffsetToTuples(entity.get("charOffset"))
            if len(offset) == 1:  # Compare only continous entities
                if not Range.contains((0, len(combinedText)), offset[0]):
                    raise Exception(
                        "Document '" + str(document.get("id")) +
                        "' entity '" + str(entity.get("id")) +
                        "' offset is not contained in combined sentence text: "
                        + str([
                            entity.attrib, offset, [0, len(combinedText)],
                            combinedText
                        ]))
                combTextSpan = combinedText[offset[0][0]:offset[0][1]]
                if entity.get("text") != combTextSpan:
                    raise Exception(
                        "Document '" + str(document.get("id")) +
                        "' entity '" + str(entity.get("id")) +
                        "' text does not match combined sentence text: " +
                        str([entity.get("text"), combTextSpan]))
                counts["checked-charOffsets"] += 1
        # Set the combined text as the document text
        document.set("text", combinedText)
        # Update entity and interaction ids (not done earlier so that possible error messages will refer to original ids, also because of siteOf-remapping)
        for i in range(len(entities)):
            entities[i].set("id", docId + ".e" +
                            str(i))  # Update the id for the document level
        for i in range(len(interactions)):
            # BUGFIX: was 'interaction.set(...)', which re-used the stale loop
            # variable from the collection loop above and renumbered only that
            # one element (also corrupting the siteOf remapping below).
            interactions[i].set("id", docId + ".i" +
                                str(i))  # Update the id for the document level
        # Update interaction e1 and e2 ids (cannot be done earlier because interactions may refer to entities from multiple sentences)
        for i in range(len(interactions)):
            interaction = interactions[i]
            for entKey in ("e1", "e2"):
                interaction.set(entKey,
                                entityById[interaction.get(entKey)].get("id"))
            if interaction.get("siteOf") != None:
                interaction.set(
                    "siteOf",
                    interactionById[interaction.get("siteOf")].get("id"))
        # Add the entity and interaction elements to the document
        document.extend(entities)
        document.extend(interactions)
    print >> sys.stderr, "Counts:", dict(counts)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree