Example #1
def makeINSubPhrases(phrases, tokens, phraseDict, filter=None):
    newPhrases = []
    for phrase in phrases:
        if filter != None and phrase.get("type") not in filter:
            continue
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        prevToken = None
        tokCount = 0
        for token in tokens[phraseBegin:phraseEnd + 1]:
            if token.get("POS") == "IN" and prevToken != None:
                newPhraseOffset = (phraseOffset[0],
                                   Range.charOffsetToSingleTuple(
                                       prevToken.get("charOffset"))[-1])
                newPhrase = makePhrase(
                    phrase.get("type") + "-IN", newPhraseOffset, phraseBegin,
                    phraseBegin + tokCount - 1)
                if newPhraseOffset not in phraseDict:
                    #print "NEW PHRASE:", ETUtils.toStr(newPhrase)
                    newPhrases.append(newPhrase)
                    phraseDict[newPhraseOffset] = [newPhrase]
            prevToken = token
            tokCount += 1
    return newPhrases
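
Every example on this page leans on a handful of helpers from the Range module, whose implementations are not shown here. The sketch below captures their assumed behavior, for orientation only; note the examples disagree on whether the end position is inclusive (compare the +1 arithmetic in Example #29 with the plain slicing in Example #30), so the real helpers may differ in detail.

# Hypothetical sketch of the Range helpers used throughout these examples.
def charOffsetToSingleTuple(charOffset):
    # "12-17" -> (12, 17)
    begin, end = charOffset.split(",")[0].split("-")
    return (int(begin), int(end))

def charOffsetToTuples(charOffset):
    # "12-17,20-25" -> [(12, 17), (20, 25)]
    return [charOffsetToSingleTuple(part) for part in charOffset.split(",")]

def tuplesToCharOffset(tuples):
    # [(12, 17), (20, 25)] -> "12-17,20-25"; also accepts a single (begin, end) pair
    if isinstance(tuples[0], int):
        tuples = [tuples]
    return ",".join("%d-%d" % (begin, end) for begin, end in tuples)

def overlap(a, b):
    # True if the (begin, end) ranges a and b share at least one position
    return a[0] <= b[1] and b[0] <= a[1]

def contains(outer, inner):
    # True if outer fully covers inner
    return outer[0] <= inner[0] and inner[1] <= outer[1]
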
Example #2
 def _markNamedEntities(self):
     """
     This method is used to define which tokens belong to _named_ entities.
     Named entities are sometimes masked when testing the learning of interactions,
     to prevent the system from making a trivial decision based on commonly interacting names.
     """
     self.tokenIsName = {}
     self.tokenIsEntity = {}
     self.tokenIsEntityHead = {}
     # Initialize the dictionaries
     for token in self.tokens:
         self.tokenIsName[token] = False
         self.tokenIsEntity[token] = False
         self.tokenIsEntityHead[token] = []
     for entity in self.entities:
         entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
         entityHeadOffset = Range.charOffsetToSingleTuple(entity.get("headOffset"))
         for token in self.tokens:
             tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
             for entityOffset in entityOffsets:
                 if Range.overlap(entityOffset, tokenOffset):
                     self.tokenIsEntity[token] = True
                     if entity.get("isName") != None:
                         if entity.get("isName") == "True":
                             self.tokenIsName[token] = True
                     else:
                         entity.set("isName", "True")
                         self.tokenIsName[token] = True
             if Range.overlap(entityHeadOffset, tokenOffset):
                 self.tokenIsEntityHead[token].append(entity)
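
The tokenIsName map built above is what later masking steps consult. A minimal sketch of such a step, assuming masking simply rewrites the token text (the placeholder string is illustrative, not from the original code):

 def maskNamedEntities(self):
     # Replace name-token texts with a generic placeholder so the classifier
     # cannot key on specific, frequently co-occurring names.
     for token in self.tokens:
         if self.tokenIsName[token]:
             token.set("text", "NAMED_ENTITY")
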
Example #3
 def getMetaMapFeatures(self, token, sentenceGraph, features):
     analyses = sentenceGraph.sentenceElement.find("analyses")
     if analyses == None:
         return
     metamap = analyses.find("metamap")
     if metamap == None:
         return
     tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
     skipAttr = set(["charOffset", "text"])
     for phrase in metamap.findall("phrase"):
         phraseOffset = Range.charOffsetToSingleTuple(
             phrase.get("charOffset"))
         if Range.overlap(tokenOffset, phraseOffset):
             attr = phrase.attrib
             attrNames = sorted(attr.keys())
             for attrName in attrNames:
                 if attrName in skipAttr:
                     continue
                 elif attrName == "score":
                     features["_metamap_score"] = 0.001 * abs(
                         int(attr[attrName]))
                 else:
                     attrValues = attr[attrName].split(",")
                     for attrValue in attrValues:
                         features["_metamap_" + attrName + "_" +
                                  attrValue.replace(" ", "-")] = 1
Example #4
    def _markNamedEntities(self):
        """
        This method is used to define which tokens belong to _named_ entities.
        Named entities are sometimes masked when testing the learning of interactions,
        to prevent the system from making a trivial decision based on commonly interacting names.
        This function assumes that all given entities are named entities.
        """
        self.tokenIsName = {}
        self.tokenIsEntity = {}
        self.tokenIsEntityHead = {}
        # Initialize the dictionaries
        for token in self.tokens:
            self.tokenIsName[token] = False
            self.tokenIsEntity[token] = False
            self.tokenIsEntityHead[token] = []
        for entity in self.entities:
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            entityHeadOffset = Range.charOffsetToSingleTuple(
                entity.get("headOffset"))
            for token in self.tokens:
                tokenOffset = Range.charOffsetToSingleTuple(
                    token.get("charOffset"))
                for entityOffset in entityOffsets:
                    if Range.overlap(entityOffset, tokenOffset):
                        self.tokenIsEntity[token] = True
                        if entity.get("given") == "True":
                            self.tokenIsName[token] = True
#                        if entity.get("given") != None:
#                            if entity.get("given") == "True":
#                                self.tokenIsName[token] = True
#                        else:
#                            entity.set("given", "True")
#                            self.tokenIsName[token] = True
                if Range.overlap(entityHeadOffset, tokenOffset):
                    self.tokenIsEntityHead[token].append(entity)
Example #5
def getHeads(corpus):
    corpus = ETUtils.ETFromObj(corpus)
    headDict = {}
    headDict["None"] = {}
    for sentence in corpus.getiterator("sentence"):
        sentenceText = sentence.get("text")  # needed below when slicing out head texts
        headOffsetStrings = set()
        for entity in sentence.findall("entity"):
            eType = entity.get("type")
            if eType not in headDict:
                headDict[eType] = {}
            eText = entity.get("text")
            headOffset = entity.get("headOffset")
            headOffsetStrings.add(headOffset)
            headOffset = Range.charOffsetToSingleTuple(headOffset)
            charOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            if headOffset == charOffset:
                if eText not in headDict[eType]: headDict[eType][eText] = 0
                headDict[eType][eText] += 1
            else:
                headText = sentenceText[headOffset[0]-charOffset[0]:headOffset[1]-charOffset[0]+1]
                if headText not in headDict[eType]: headDict[eType][headText] = 0
                headDict[eType][headText] += 1
        # 'tokens' is undefined in this excerpt; presumably the sentence's token
        # elements from its tokenization analysis.
        for token in sentence.getiterator("token"):
            if token.get("charOffset") not in headOffsetStrings: # token is not the head of any entity
                headText = token.get("text")
                if headText not in headDict["None"]: headDict["None"][headText] = 0
                headDict["None"][headText] += 1
                
    return headDict
Example #6
def selectBestMatch(entity, phrases):
    entOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
    if entity.get("altOffset") != None:
        entOffset = Range.charOffsetToSingleTuple(entity.get("altOffset"))
    best = (sys.maxint, None)
    for phrase in phrases:
        matchValue = Range.mismatch(entOffset, Range.charOffsetToSingleTuple(phrase.get("charOffset")))
        if best[0] > matchValue:
            best = (matchValue, phrase)
    return best[1]
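
Range.mismatch is not shown on this page; its use here only requires a non-negative score where lower means a closer match, and 0 an exact one. One plausible definition, stated purely as an assumption:

def mismatch(a, b):
    # Distance between the endpoints of two (begin, end) ranges; 0 for an exact match.
    return abs(a[0] - b[0]) + abs(a[1] - b[1])
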
Example #7
def exportChemProtPredictions(xml, outPath, fileTypes="predictions", setNames=None):
    if fileTypes == "all":
        fileTypes = ["predictions", "abstracts", "entities", "relations"]
    elif isinstance(fileTypes, basestring):
        fileTypes = fileTypes.split(",")
    for fileType in fileTypes:
        if fileType not in ["predictions", "abstracts", "entities", "relations"]:
            raise Exception("Unknown ChemProt file type '" + str(fileType) + "'")
    xml = ETUtils.ETFromObj(xml)
    #with open(outPath, "wt") as f
    outFiles = {}
    openFiles = {}
    for document in xml.getiterator("document"):
        docId = document.get("origId")
        setName = document.get("set")
        if setNames != None:
            setName = setNames.get(setName, setName)
        if setName not in outFiles:
            outFiles[setName] = {}
        outFile = openOutFile(setName, outPath, "abstracts", fileTypes, outFiles, openFiles)
        if outFile != None:
            docText = document.get("text")
            #assert docText.count("\t") == 1, (docText.count("\t"), document.attrib)
            #title, abstract = docText.split("\t")
            #titleLength = document.get("titleLength")
            titleOffset = Range.charOffsetToSingleTuple(document.get("titleOffset"))
            assert titleOffset[0] == 0
            outFile.write("\t".join([docId, docText[:titleOffset[1]], docText[titleOffset[1]+1:]]) + "\n")  
        entityById = {}
        for entity in document.getiterator("entity"):
            outFile = openOutFile(setName, outPath, "entities", fileTypes, outFiles, openFiles)
            if outFile != None:
                eType = entity.get("type")
                if entity.get("normalized") != None and entity.get("type") == "GENE":
                    eType += "-Y" if entity.get("normalized") == "True" else "-N"
                offset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
                outFile.write("\t".join([docId, entity.get("origId"), eType, str(offset[0]), str(offset[1]), entity.get("text")]) + "\n")
            assert entity.get("id") not in entityById
            entityById[entity.get("id")] = entity
        for interaction in document.getiterator("interaction"):
            e1 = entityById[interaction.get("e1")]
            e2 = entityById[interaction.get("e2")]
            outFile = openOutFile(setName, outPath, "relations", fileTypes, outFiles, openFiles)
            if outFile != None:
                evaluated = "X"
                if interaction.get("evaluated") != None:
                    evaluated = "Y " if interaction.get("evaluated") == "True" else "N "
                outFile.write("\t".join([docId, interaction.get("type"), evaluated, interaction.get("relType"), "Arg1:" + e1.get("origId"), "Arg2:" + e2.get("origId")]) + "\n")
            outFile = openOutFile(setName, outPath, "predictions", fileTypes, outFiles, openFiles)
            if outFile != None:
                outFile.write("\t".join([docId, interaction.get("type"), "Arg1:" + e1.get("origId"), "Arg2:" + e2.get("origId")]) + "\n")
    print >> sys.stderr, "Closing output files"
    for f in openFiles.values():
        f.close()
    return xml 
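
openOutFile is defined outside this excerpt. Judging from its four call sites, it returns None when the requested file type is not being exported, and otherwise lazily opens one output file per set/type pair, caching it in outFiles and openFiles. A hypothetical reconstruction, with the file-naming scheme assumed:

import os

def openOutFile(setName, outPath, fileType, fileTypes, outFiles, openFiles):
    if fileType not in fileTypes:  # this file type was not requested
        return None
    if fileType not in outFiles[setName]:
        # One file per (set, type) pair; the actual naming convention may differ.
        filePath = os.path.join(outPath, setName + "_" + fileType + ".tsv")
        outFiles[setName][fileType] = open(filePath, "wt")
        openFiles[filePath] = outFiles[setName][fileType]
    return outFiles[setName][fileType]
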
Example #8
def getNECounts(phrases, entities):
    counts = {}
    for phrase in phrases:
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        counts[phrase] = 0
        for entity in entities:
            if entity.get("given") != "True":  # only check names
                continue
            if Range.contains(phraseOffset, Range.charOffsetToSingleTuple(entity.get("charOffset"))):
                counts[phrase] += 1
    return counts
Example #9
def selectBestMatch(entity, phrases):
    entOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
    if entity.get("altOffset") != None:
        entOffset = Range.charOffsetToSingleTuple(entity.get("altOffset"))
    best = (sys.maxint, None)
    for phrase in phrases:
        matchValue = Range.mismatch(
            entOffset, Range.charOffsetToSingleTuple(phrase.get("charOffset")))
        if best[0] > matchValue:
            best = (matchValue, phrase)
    return best[1]
Example #10
def getNECounts(phrases, entities):
    counts = {}
    for phrase in phrases:
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        counts[phrase] = 0
        for entity in entities:
            if entity.get("given") != "True":  # only check names
                continue
            if Range.contains(
                    phraseOffset,
                    Range.charOffsetToSingleTuple(entity.get("charOffset"))):
                counts[phrase] += 1
    return counts
Example #11
def getMatchingPhrases(entity, phraseOffsets, phraseDict):
    matches = []
    if entity.get("isName") == "True":
        return []
    maxOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
    minOffset = entity.get("altOffset")
    if minOffset != None:
        minOffset = Range.charOffsetToSingleTuple(minOffset)
    else:
        minOffset = maxOffset
    for phraseOffset in phraseOffsets:
        if Range.contains(maxOffset, phraseOffset) and Range.contains(phraseOffset, minOffset):
            matches.extend(phraseDict[phraseOffset])
    return matches
Example #12
def getMatchingPhrases(entity, phraseOffsets, phraseDict):
    matches = []
    if entity.get("isName") == "True":
        return []
    maxOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
    minOffset = entity.get("altOffset")
    if minOffset != None:
        minOffset = Range.charOffsetToSingleTuple(minOffset)
    else:
        minOffset = maxOffset
    for phraseOffset in phraseOffsets:
        if Range.contains(maxOffset, phraseOffset) and Range.contains(
                phraseOffset, minOffset):
            matches.extend(phraseDict[phraseOffset])
    return matches
Example #13
def fixAltOffsets(input, output=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    
    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence ("+sentence.get("id")+"): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i] 
                altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1
        
    print >> sys.stderr, "Fixed", fixCount, "altOffsets"
        
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example #14
 def prepareTokens(self, tokens):
     tokenTuples = []
     for token in tokens:
         tokenTuples.append(
             (Range.charOffsetToSingleTuple(token.get("charOffset")),
              token))
     return tokenTuples
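
The tuples produced here are consumed by getTokens (Example #24 below), which breaks out of its scan at the first non-overlapping token after a match; that early exit is only safe if the list is in offset order, so the incoming tokens are assumed to be sorted already. A short usage sketch:

 tokenTuples = self.prepareTokens(tokens)  # e.g. [((0, 7), <token>), ((8, 13), <token>), ...]
 entityText = " ".join(self.getTokens(entity, tokenTuples))
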
Example #15
def insertElements(corpus, specAnn):
    for document in corpus.iter('document'):
        docId = document.get("origId")
        assert docId in specAnn, docId
        for sentence in document.iter('sentence'):
            sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
            analyses = sentence.find("analyses")
            if analyses is None:  # "not analyses" would also be true for a childless element
                analyses = ET.SubElement(sentence, "analyses")
            #entitiesElement = sentence.find("entities")
            # Find the container
            container = analyses.find("entities") #None
#             for entitiesElement in entitiesElements:
#                 if entitiesElement.get("source") == "SPECIES":
#                     container = entitiesElement
#                     break
            if container is None:
                container = ET.SubElement(analyses, "entities")
            #container.set("source", "SPECIES")
            # Map the spans
            for span in specAnn[docId][:]:
                offset = span.get("offset")
                if Range.overlap(offset, sentOffset):
                    if sentOffset[0] > offset[0] or sentOffset[1] < offset[1]:
                        continue
                    specAnn[docId].remove(span)
                    charOffset = (offset[0] - sentOffset[0], offset[1] - sentOffset[0])
                    matchingText = sentence.get("text")[charOffset[0]:charOffset[1]]
                    spanText = span.get("text")
                    #print matchingText, spanText
                    assert matchingText == spanText, (matchingText, spanText, charOffset)
                    span.set("charOffset", "-".join([str(x) for x in charOffset]))
                    assert not "--" in span.get("charOffset"), [str(x) for x in charOffset]
                    del span.attrib["offset"] #span.set("offset", "")
                    container.append(span)
Example #16
 def addSentence(self, sentenceGraph):
     if sentenceGraph == None:
         return
     tokens = sorted([(Range.charOffsetToSingleTuple(x.get("charOffset")), x) for x in sentenceGraph.tokens])
     indexByTokenId = {tokens[i][1].get("id"):i for i in range(len(tokens))}
     assert len(indexByTokenId) == len(tokens) # check that there were no duplicate ids
     entityById = {x.get("id"):x for x in sentenceGraph.entities}
     events = {}
     for interaction in sentenceGraph.interactions:
         e1Id = interaction.get("e1")
         e2Id = interaction.get("e2")
         e1 = entityById[e1Id]
         e2 = entityById[e2Id]
         t1 = sentenceGraph.entityHeadTokenByEntity[e1]
         t2 = sentenceGraph.entityHeadTokenByEntity[e2]
         index1 = indexByTokenId[t1.get("id")]
         index2 = indexByTokenId[t2.get("id")]
         intSpan = abs(index1 - index2)
         self.interactionSpans[intSpan] = self.interactionSpans.get(intSpan, 0) + 1
         self.intSpan["min"] = min(self.intSpan.get("min"), intSpan)
         self.intSpan["max"] = max(self.intSpan.get("max"), intSpan)
         if interaction.get("event") == "True":
             if e1Id not in events:
                 events[e1Id] = {"min":9999, "max":-9999}
             events[e1Id]["min"] = min(events[e1Id]["min"], index1, index2)
             events[e1Id]["max"] = max(events[e1Id]["max"], index1, index2)
     for eventId in sorted(events.keys()):
         eventSpan = events[eventId]["max"] - events[eventId]["min"]
         self.eventSpans[eventSpan] = self.eventSpans.get(eventSpan, 0) + 1
         self.eventSpan["min"] = min(self.eventSpan.get("min"), eventSpan)
         self.eventSpan["max"] = max(self.eventSpan.get("max"), eventSpan)
Example #17
    def getPatterns(self, e1, e2):
        e1Range = Range.charOffsetToSingleTuple(e1.get("charOffset"))
        e2Range = Range.charOffsetToSingleTuple(e2.get("charOffset"))

        tokenPositions = {}
        for token in self.sentenceGraph.tokens:
            tokenPositions[token.get("id")] = self.getRelativePosition(
                e1Range, e2Range, token)

        prevTokenText = None
        prevToken2Text = None
        prevPosition = None
        patternForeBetween = {}
        patternBetween = {}
        patternBetweenAfter = {}
        for token in self.sentenceGraph.tokens:
            if self.sentenceGraph.tokenIsName[token]:
                continue

            id = token.get("id")
            text = token.get("text").lower()

            if prevPosition != tokenPositions[id]:
                prevTokenText = None
                prevToken2Text = None

            if tokenPositions[id] == "Fore":
                self.addToPattern(patternForeBetween, text, prevTokenText,
                                  prevToken2Text)
            elif tokenPositions[id] == "Between":
                self.addToPattern(patternForeBetween, text, prevTokenText,
                                  prevToken2Text)
                self.addToPattern(patternBetween, text, prevTokenText,
                                  prevToken2Text)
                self.addToPattern(patternBetweenAfter, text, prevTokenText,
                                  prevToken2Text)
            elif tokenPositions[id] == "After":
                self.addToPattern(patternBetweenAfter, text, prevTokenText,
                                  prevToken2Text)

            prevPosition = tokenPositions[id]
            #if tokenPositions[id].find("Entity") != -1:
            prevToken2Text = prevTokenText
            prevTokenText = text

        return patternForeBetween, patternBetween, patternBetweenAfter
Example #18
def moveElements(document):
    entMap = {}
    entSentence = {}
    entSentenceIndex = {}
    sentences = document.findall("sentence")
    sentenceCount = 0
    for sentence in sentences:
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            overlaps = False
            for entityOffset in entityOffsets:
                if Range.overlap(sentenceOffset, entityOffset):
                    overlaps = True
                    break
            if overlaps:
                document.remove(entity)
                sentence.append(entity)
                entityId = entity.get("id")
                entityIdLastPart = entityId.rsplit(".", 1)[-1]
                if entityIdLastPart.startswith("e"):
                    entity.set("id", sentence.get("id") + "." + entityIdLastPart)
                    entMap[entityId] = sentence.get("id") + "." + entityIdLastPart
                else:
                    entity.set("docId", entityId)
                    entity.set("id", sentence.get("id") + ".e" + str(entCount))
                    entMap[entityId] = sentence.get("id") + ".e" + str(entCount)
                entSentence[entityId] = sentence
                entSentenceIndex[entityId] = sentenceCount
                #newEntityOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                newEntityOffsets = []
                for entityOffset in entityOffsets:
                    newEntityOffsets.append( (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0]) )
                entity.set("origOffset", entity.get("charOffset"))
                #entity.set("charOffset", str(newEntityOffset[0]) + "-" + str(newEntityOffset[1]))
                entity.set("charOffset", Range.tuplesToCharOffset(newEntityOffsets)) 
                entCount += 1
        sentenceCount += 1
    # Move interactions
    intCount = 0
    for interaction in document.findall("interaction"):
        #if entSentenceIndex[interaction.get("e1")] < entSentenceIndex[interaction.get("e2")]:
        #    targetSentence = entSentence[interaction.get("e1")]
        #else:
        #    targetSentence = entSentence[interaction.get("e2")]
        
        # Interactions always go into the sentence of their e1 entity, as that is the event they are an argument of.
        # If an intersentence interaction is a relation (not an event argument), the choice shouldn't matter.
        targetSentence = entSentence[interaction.get("e1")]  
        document.remove(interaction)
        targetSentence.append(interaction)
        interaction.set("id", targetSentence.get("id") + ".i" + str(intCount))
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
        intCount += 1
Example #19
def getPhraseDict(phrases):
    phraseDict = {}
    # Define offsets
    for phrase in phrases:
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        if phraseOffset not in phraseDict:
            phraseDict[phraseOffset] = []
        phraseDict[phraseOffset].append(phrase)
    return phraseDict
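
getPhraseDict builds the offset-keyed dict that the sub-phrase makers on this page (makeDETSubPhrases, makeINSubPhrases, makeTokenSubPhrases) extend in place. A sketch of how they plausibly chain together; the actual call order in the original pipeline may differ:

phraseDict = getPhraseDict(phrases)
phrases = phrases + makeDETSubPhrases(phrases, tokens, phraseDict)
phrases = phrases + makeINSubPhrases(phrases, tokens, phraseDict)
phrases = phrases + makeTokenSubPhrases(tokens, phraseDict)
phrases = removeNamedEntityPhrases(entities, phrases, phraseDict)
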
Example #20
def makeDETSubPhrases(phrases, tokens, phraseDict, filter=None):
    newPhrases = []
    for phrase in phrases:
        if filter != None and phrase.get("type") not in filter:
            continue
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        if phraseBegin > 0 and tokens[phraseBegin - 1].get("POS") == "DT":
            newPhraseOffset = (Range.charOffsetToSingleTuple(
                tokens[phraseBegin - 1].get("charOffset"))[0], phraseOffset[1])
            newPhrase = makePhrase("DT-" + phrase.get("type"), newPhraseOffset,
                                   phraseBegin - 1, phraseEnd)
            if newPhraseOffset not in phraseDict:
                #print "NEW PHRASE:", ETUtils.toStr(newPhrase)
                newPhrases.append(newPhrase)
                phraseDict[newPhraseOffset] = [newPhrase]
    return newPhrases
Example #22
def moveElements(document):
    entMap = {}
    entSentence = {}
    entSentenceIndex = {}
    sentences = document.findall("sentence")
    sentenceCount = 0
    for sentence in sentences:
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            if Range.overlap(sentenceOffset, entityOffset):
                document.remove(entity)
                sentence.append(entity)
                entityId = entity.get("id")
                entityIdLastPart = entityId.rsplit(".", 1)[-1]
                if entityIdLastPart.startswith("e"):
                    entity.set("id", sentence.get("id") + "." + entityIdLastPart)
                    entMap[entityId] = sentence.get("id") + "." + entityIdLastPart
                else:
                    entity.set("docId", entityId)
                    entity.set("id", sentence.get("id") + ".e" + str(entCount))
                    entMap[entityId] = sentence.get("id") + ".e" + str(entCount)
                entSentence[entityId] = sentence
                entSentenceIndex[entityId] = sentenceCount
                newEntityOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                entity.set("origOffset", entity.get("charOffset"))
                entity.set("charOffset", str(newEntityOffset[0]) + "-" + str(newEntityOffset[1])) 
                entCount += 1
        sentenceCount += 1
    # Move interactions
    intCount = 0
    for interaction in document.findall("interaction"):
        if entSentenceIndex[interaction.get("e1")] < entSentenceIndex[interaction.get("e2")]:
            targetSentence = entSentence[interaction.get("e1")]
        else:
            targetSentence = entSentence[interaction.get("e2")]
        document.remove(interaction)
        targetSentence.append(interaction)
        interaction.set("id", targetSentence.get("id") + ".i" + str(intCount))
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
        intCount += 1
Example #23
def makeDETSubPhrases(phrases, tokens, phraseDict, filter=None):
    newPhrases = []
    for phrase in phrases:
        if filter != None and phrase.get("type") not in filter:
            continue
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        if phraseBegin > 0 and tokens[phraseBegin - 1].get("POS") == "DT":
            newPhraseOffset = (
                Range.charOffsetToSingleTuple(tokens[phraseBegin - 1].get("charOffset"))[0],
                phraseOffset[1],
            )
            newPhrase = makePhrase("DT-" + phrase.get("type"), newPhraseOffset, phraseBegin - 1, phraseEnd)
            if newPhraseOffset not in phraseDict:
                # print "NEW PHRASE:", ETUtils.toStr(newPhrase)
                newPhrases.append(newPhrase)
                phraseDict[newPhraseOffset] = [newPhrase]
    return newPhrases
Example #24
 def getTokens(self, entity, tokenTuples):
     offset = entity.get("charOffset")
     assert offset != None
     offset = Range.charOffsetToSingleTuple(offset)
     match = []
     for tokenTuple in tokenTuples:
         if Range.overlap(offset, tokenTuple[0]):
             match.append(tokenTuple[1].get("text"))
         elif len(match) > 0:  # past the entity; assumes tokenTuples is sorted by offset
             break
     return match
Example #26
def makeTokenSubPhrases(tokens, phraseDict, includePOS=["PRP$", "IN", "WP$"]):
    newPhrases = []
    for i in range(len(tokens)):
        token = tokens[i]
        tokPOS = token.get("POS")
        if tokPOS in includePOS:
            tokOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            if tokOffset not in phraseDict:
                newPhrase = makePhrase("TOK-t" + tokPOS, tokOffset, i, i)
                newPhrases.append(newPhrase)
                phraseDict[tokOffset] = [newPhrase]
    return newPhrases
Example #28
 def getPatterns(self, e1, e2):
     e1Range = Range.charOffsetToSingleTuple(e1.get("charOffset"))
     e2Range = Range.charOffsetToSingleTuple(e2.get("charOffset"))
     
     tokenPositions = {}
     for token in self.sentenceGraph.tokens:
         tokenPositions[token.get("id")] = self.getRelativePosition(e1Range,e2Range,token)
     
     prevTokenText = None
     prevToken2Text = None
     prevPosition = None
     patternForeBetween = {}
     patternBetween = {}
     patternBetweenAfter = {}
     for token in self.sentenceGraph.tokens:
         if self.sentenceGraph.tokenIsName[token]:
             continue
             
         id = token.get("id")
         text = token.get("text").lower()
         
         if prevPosition != tokenPositions[id]:
             prevTokenText = None
             prevToken2Text = None
         
         if tokenPositions[id] == "Fore":
             self.addToPattern(patternForeBetween, text, prevTokenText, prevToken2Text)
         elif tokenPositions[id] == "Between":
             self.addToPattern(patternForeBetween, text, prevTokenText, prevToken2Text)
             self.addToPattern(patternBetween, text, prevTokenText, prevToken2Text)
             self.addToPattern(patternBetweenAfter, text, prevTokenText, prevToken2Text)
         elif tokenPositions[id] == "After":
             self.addToPattern(patternBetweenAfter, text, prevTokenText, prevToken2Text)
         
         prevPosition = tokenPositions[id]
         #if tokenPositions[id].find("Entity") != -1:
         prevToken2Text = prevTokenText
         prevTokenText = text
 
     return patternForeBetween, patternBetween, patternBetweenAfter
Example #29
def fixEntities(xml):
    counts = defaultdict(int)
    for sentence in xml.getiterator("sentence"):
        sText = sentence.get("text")
        for entity in sentence.findall("entity"):
            charOffset = entity.get("charOffset")
            if charOffset == "-":
                assert False, str(entity)
                sentence.remove(entity)
                counts["removed-invalid"] += 1
            else:
                charOffset = Range.charOffsetToSingleTuple(charOffset)
                # fix length
                realLength = len(entity.get("text"))
                lenDiff = (charOffset[1] - charOffset[0] + 1) - realLength
                if lenDiff != 0:
                    counts["incorrect-ent-offset"] += 1
                    counts["incorrect-ent-offset-diff"+str(lenDiff)] += 1
                    if abs(lenDiff) > 2:
                        print "Warning, lenDiff:", (lenDiff, charOffset, sText, entity.get("text"), entity.get("id"))
                charOffset = (charOffset[0], charOffset[0] + realLength)
                # find starting position
                entIndex = sText.find(entity.get("text"), charOffset[0])
                if entIndex == -1:
                    for i in [-1,-2,-3]:
                        entIndex = sText.find(entity.get("text"), charOffset[0]+i)
                        if entIndex != -1:
                            break
                if entIndex != 0: # could be lowercase
                    sTextLower = sText.lower()
                    for i in [0,-1,-2,-3]:
                        lowerEntIndex = sTextLower.find(entity.get("text"), charOffset[0]+i)
                        if lowerEntIndex != -1:
                            break
                    if lowerEntIndex != -1 and abs(lowerEntIndex - charOffset[0]) < abs(entIndex - charOffset[0]):
                        entIndex = lowerEntIndex
                assert entIndex != -1, (charOffset, sText, entity.get("text"), entity.get("id"))
                indexDiff = entIndex - charOffset[0]
                if indexDiff != 0:
                    counts["incorrect-ent-index"] += 1
                    counts["incorrect-ent-index-diff"+str(indexDiff)] += 1
                    print "Warning, indexDiff:", (indexDiff, charOffset, sText, entity.get("text"), entity.get("id"))
                # move offset       
                charOffset = (charOffset[0]+indexDiff, charOffset[1]+indexDiff)
                # validate new offset
                sEntity = sText[charOffset[0]:charOffset[1]]
                assert sEntity == entity.get("text") or sEntity.lower() == entity.get("text"), (charOffset, sText, entity.get("text"), entity.get("id"))
                entity.set("charOffset", Range.tuplesToCharOffset( (charOffset[0], charOffset[1])))
                entity.set("given", "True")
        for interaction in sentence.findall("interaction"):
            interaction.set("type", "DDI")
    print "Fix counts:", counts
Example #30
    def writeXMLSentence(self,
                         examples,
                         predictionsByExample,
                         sentenceObject,
                         classSet,
                         classIds,
                         goldSentence=None,
                         exampleStyle=None,
                         structureAnalyzer=None):
        self.assertSameSentence(examples)

        sentenceElement = sentenceObject.sentence
        sentenceId = sentenceElement.get("id")
        sentenceText = sentenceElement.get("text")
        # detach analyses-element
        sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
        if sentenceAnalysesElement == None:
            sentenceAnalysesElement = sentenceElement.find("analyses")
        if sentenceAnalysesElement != None:
            sentenceElement.remove(sentenceAnalysesElement)
        # remove pairs and interactions
        interactions = self.removeChildren(sentenceElement,
                                           ["pair", "interaction"])
        # remove entities
        newEntityIdCount = IDUtils.getNextFreeId(
            sentenceElement.findall("entity"))
        nonNameEntities = self.removeNonNameEntities(sentenceElement)

        # add new pairs
        for example in examples:
            prediction = predictionsByExample[example[0]]
            entityElement = ET.Element("entity")
            #entityElement.attrib["given"] = "False"
            headToken = example[3]["t"]
            for token in sentenceObject.tokens:
                if token.get("id") == headToken:
                    headToken = token
                    break
            entityElement.set("charOffset", example[3]["charOffset"])
            entityElement.set("headOffset", headToken.get("charOffset"))
            entityElement.set("phraseType", example[3]["ptype"])
            entOffset = Range.charOffsetToSingleTuple(example[3]["charOffset"])
            entityElement.set("text", sentenceText[entOffset[0]:entOffset[1]])
            entityElement.set("id", sentenceId + ".e" + str(newEntityIdCount))
            self.setElementType(entityElement, prediction, classSet, classIds)
            newEntityIdCount += 1
            sentenceElement.append(entityElement)

        # re-attach the analyses-element
        if sentenceAnalysesElement != None:
            sentenceElement.append(sentenceAnalysesElement)
Example #31
 def getRelativePosition(self, entity1Range, entity2Range, token):
     offset = Range.charOffsetToSingleTuple(token.get("charOffset"))
     if Range.overlap(entity1Range, offset):
         return "Entity1"
     if Range.overlap(entity2Range, offset):
         return "Entity2"
     entitiesRange = (min(entity1Range[0],entity2Range[0]),max(entity1Range[1],entity2Range[1]))
     if offset[1] < entitiesRange[0]:
         return "Fore"
     elif offset[1] > entitiesRange[1]:
         return "After"
     else:
         return "Between"
Example #32
 def getMetaMapFeatures(self, token, sentenceGraph, features):
     analyses = sentenceGraph.sentenceElement.find("analyses")
     if analyses == None:
         return
     metamap = analyses.find("metamap")
     if metamap == None:
         return
     tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
     skipAttr = set(["charOffset", "text"])
     for phrase in metamap.findall("phrase"):
         phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
         if Range.overlap(tokenOffset, phraseOffset):
             attr = phrase.attrib
             attrNames = sorted(attr.keys())
             for attrName in attrNames:
                 if attrName in skipAttr:
                     continue
                 elif attrName == "score":
                     features["_metamap_score"] = 0.001 * abs(int(attr[attrName]))
                 else:
                     attrValues = attr[attrName].split(",")
                     for attrValue in attrValues: 
                         features["_metamap_"+attrName+"_"+attrValue.replace(" ", "-")] = 1
Example #33
 def getRelativePosition(self, entity1Range, entity2Range, token):
     offset = Range.charOffsetToSingleTuple(token.get("charOffset"))
     if Range.overlap(entity1Range, offset):
         return "Entity1"
     if Range.overlap(entity2Range, offset):
         return "Entity2"
     entitiesRange = (min(entity1Range[0], entity2Range[0]),
                      max(entity1Range[1], entity2Range[1]))
     if offset[1] < entitiesRange[0]:
         return "Fore"
     elif offset[1] > entitiesRange[1]:
         return "After"
     else:
         return "Between"
Example #34
def fixIndices(phrases, tokens):
    # Realign each phrase's token-index attributes ("begin"/"end") with the
    # tokens whose character offsets actually match the phrase boundaries.
    fixCount = 0
    phraseCount = 0
    for phrase in phrases:
        fixed = False
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        for i in range(len(tokens)):
            token = tokens[i]
            tokOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
            if tokOffset[0] == phraseOffset[0]:
                if phraseBegin != i:
                    phrase.set("begin", str(i))
                    fixed = True
            if tokOffset[1] == phraseOffset[1]:
                if phraseEnd != i:
                    phrase.set("end", str(i))
                    fixed = True
                break
        if fixed:
            fixCount += 1
        phraseCount += 1
Example #35
def makeINSubPhrases(phrases, tokens, phraseDict, filter=None):
    newPhrases = []
    for phrase in phrases:
        if filter != None and phrase.get("type") not in filter:
            continue
        phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
        phraseBegin = int(phrase.get("begin"))
        phraseEnd = int(phrase.get("end"))
        prevToken = None
        tokCount = 0
        for token in tokens[phraseBegin : phraseEnd + 1]:
            if token.get("POS") == "IN" and prevToken != None:
                newPhraseOffset = (phraseOffset[0], Range.charOffsetToSingleTuple(prevToken.get("charOffset"))[-1])
                newPhrase = makePhrase(
                    phrase.get("type") + "-IN", newPhraseOffset, phraseBegin, phraseBegin + tokCount - 1
                )
                if newPhraseOffset not in phraseDict:
                    # print "NEW PHRASE:", ETUtils.toStr(newPhrase)
                    newPhrases.append(newPhrase)
                    phraseDict[newPhraseOffset] = [newPhrase]
            prevToken = token
            tokCount += 1
    return newPhrases
Example #37
def getHeads(corpus):
    corpus = ETUtils.ETFromObj(corpus)
    headDict = {}
    headDict["None"] = {}
    for sentence in corpus.getiterator("sentence"):
        sentenceText = sentence.get("text")  # needed below when slicing out head texts
        headOffsetStrings = set()
        for entity in sentence.findall("entity"):
            eType = entity.get("type")
            if eType not in headDict:
                headDict[eType] = {}
            eText = entity.get("text")
            headOffset = entity.get("headOffset")
            headOffsetStrings.add(headOffset)
            headOffset = Range.charOffsetToSingleTuple(headOffset)
            charOffset = Range.charOffsetToSingleTuple(
                entity.get("charOffset"))
            if headOffset == charOffset:
                if eText not in headDict[eType]:
                    headDict[eType][eText] = 0
                headDict[eType][eText] += 1
            else:
                headText = sentenceText[headOffset[0] - charOffset[0]:
                                        headOffset[1] - charOffset[0] + 1]
                if headText not in headDict[eType]:
                    headDict[eType][headText] = 0
                headDict[eType][headText] += 1
        # 'tokens' is undefined in this excerpt; presumably the sentence's token
        # elements from its tokenization analysis.
        for token in sentence.getiterator("token"):
            if token.get("charOffset") not in headOffsetStrings:  # not the head of any entity
                headText = token.get("text")
                if headText not in headDict["None"]:
                    headDict["None"][headText] = 0
                headDict["None"][headText] += 1

    return headDict
Example #38
def removeNamedEntityPhrases(entities, phrases, phraseDict):
    neOffsets = set()
    for entity in entities:
        if entity.get("given") != "True":
            continue
        neOffsets.add(entity.get("charOffset"))
    phrasesToKeep = []
    for phrase in phrases:
        phraseOffset = phrase.get("charOffset")
        if phraseOffset in neOffsets:
            phraseOffsetTuple = Range.charOffsetToSingleTuple(phraseOffset)
            if phraseOffsetTuple in phraseDict:
                del phraseDict[phraseOffsetTuple]
        else:
            phrasesToKeep.append(phrase)
    # print >> sys.stderr, "Removed", len(phrases) - len(phrasesToKeep), "named entity phrases"
    return phrasesToKeep
Example #40
def insertElements(corpus, specAnn):
    for document in corpus.iter('document'):
        docId = document.get("origId")
        assert docId in specAnn, docId
        for sentence in document.iter('sentence'):
            sentOffset = Range.charOffsetToSingleTuple(
                sentence.get("charOffset"))
            analyses = sentence.find("analyses")
            if analyses is None:  # "not analyses" would also be true for a childless element
                analyses = ET.SubElement(sentence, "analyses")
            #entitiesElement = sentence.find("entities")
            # Find the container
            container = analyses.find("entities")  #None
            #             for entitiesElement in entitiesElements:
            #                 if entitiesElement.get("source") == "SPECIES":
            #                     container = entitiesElement
            #                     break
            if container is None:
                container = ET.SubElement(analyses, "entities")
            #container.set("source", "SPECIES")
            # Map the spans
            for span in specAnn[docId][:]:
                offset = span.get("offset")
                if Range.overlap(offset, sentOffset):
                    if sentOffset[0] > offset[0] or sentOffset[1] < offset[1]:
                        continue
                    specAnn[docId].remove(span)
                    charOffset = (offset[0] - sentOffset[0],
                                  offset[1] - sentOffset[0])
                    matchingText = sentence.get(
                        "text")[charOffset[0]:charOffset[1]]
                    spanText = span.get("text")
                    #print matchingText, spanText
                    assert matchingText == spanText, (matchingText, spanText,
                                                      charOffset)
                    span.set("charOffset",
                             "-".join([str(x) for x in charOffset]))
                    assert not "--" in span.get("charOffset"), [
                        str(x) for x in charOffset
                    ]
                    del span.attrib["offset"]  #span.set("offset", "")
                    container.append(span)
Example #41
    def writeXMLSentence(self, examples, predictionsByExample, sentenceObject, classSet, classIds, goldSentence=None, exampleStyle=None, structureAnalyzer=None):        
        self.assertSameSentence(examples)
        
        sentenceElement = sentenceObject.sentence
        sentenceId = sentenceElement.get("id")
        sentenceText = sentenceElement.get("text")
        # detach analyses-element
        sentenceAnalysesElement = sentenceElement.find("sentenceanalyses")
        if sentenceAnalysesElement == None:
            sentenceAnalysesElement = sentenceElement.find("analyses")
        if sentenceAnalysesElement != None:
            sentenceElement.remove(sentenceAnalysesElement)
        # remove pairs and interactions
        interactions = self.removeChildren(sentenceElement, ["pair", "interaction"])
        # remove entities
        newEntityIdCount = IDUtils.getNextFreeId(sentenceElement.findall("entity"))
        nonNameEntities = self.removeNonNameEntities(sentenceElement)
            
        # add new pairs
        for example in examples:
            prediction = predictionsByExample[example[0]]
            entityElement = ET.Element("entity")
            #entityElement.attrib["given"] = "False"
            headToken = example[3]["t"]
            for token in sentenceObject.tokens:
                if token.get("id") == headToken:
                    headToken = token
                    break
            entityElement.set("charOffset", example[3]["charOffset"]) 
            entityElement.set("headOffset", headToken.get("charOffset"))
            entityElement.set("phraseType", example[3]["ptype"])
            entOffset = Range.charOffsetToSingleTuple(example[3]["charOffset"])
            entityElement.set("text", sentenceText[entOffset[0]:entOffset[1]])
            entityElement.set("id", sentenceId + ".e" + str(newEntityIdCount))
            self.setElementType(entityElement, prediction, classSet, classIds)
            newEntityIdCount += 1
            sentenceElement.append(entityElement)

        # re-attach the analyses-element
        if sentenceAnalysesElement != None:
            sentenceElement.append(sentenceAnalysesElement)
Example #42
def getEntityHeadToken(entity, tokens, tokenHeadScores):
    if entity.get("headOffset") != None:
        charOffsets = Range.charOffsetToTuples(entity.get("headOffset"))
    elif entity.get("charOffset") != "":
        charOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
    else:
        charOffsets = []
    # Each entity can consist of multiple syntactic tokens, covered by its
    # charOffset-range. One of these must be chosen as the head token.
    headTokens = []  # potential head tokens
    for token in tokens:
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        for offset in charOffsets:
            if Range.overlap(offset, tokenOffset):
                headTokens.append(token)
    if len(headTokens) == 1:  # An unambiguous head token was found
        selectedHeadToken = headTokens[0]
    else:  # One head token must be chosen from the candidates
        selectedHeadToken = findHeadToken(headTokens, tokenHeadScores)
        #if verbose:
        #    print >> sys.stderr, "Selected head:", token.attrib["id"], token.attrib["text"]
    assert selectedHeadToken is not None, entity.get("id")
    return selectedHeadToken
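
findHeadToken and the structure of tokenHeadScores are not shown in this excerpt. From the usage above, the scores presumably map each token to a numeric head score, with findHeadToken picking the highest-scoring candidate. A hypothetical sketch:

def findHeadToken(candidateTokens, tokenHeadScores):
    # Pick the candidate with the highest head score (assumed reconstruction).
    best = None
    for token in candidateTokens:
        if best is None or tokenHeadScores[token] > tokenHeadScores[best]:
            best = token
    return best
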
Example #43
def getEntityHeadToken(entity, tokens, tokenHeadScores):
    if entity.get("headOffset") != None:
        charOffsets = Range.charOffsetToTuples(entity.get("headOffset"))
    elif entity.get("charOffset") != "":
        charOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
    else:
        charOffsets = []
    # Each entity can consist of multiple syntactic tokens, covered by its
    # charOffset-range. One of these must be chosen as the head token.
    headTokens = [] # potential head tokens
    for token in tokens:
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        for offset in charOffsets:
            if Range.overlap(offset, tokenOffset):
                headTokens.append(token)
    if len(headTokens)==1: # An unambiguous head token was found
        selectedHeadToken = headTokens[0]
    else: # One head token must be chosen from the candidates
        selectedHeadToken = findHeadToken(headTokens, tokenHeadScores)
        #if verbose:
        #    print >> sys.stderr, "Selected head:", token.attrib["id"], token.attrib["text"]
    assert selectedHeadToken is not None, entity.get("id")
    return selectedHeadToken
Example #44
 def addSentence(self, sentenceGraph):
     if sentenceGraph == None:
         return
     tokens = sorted([(Range.charOffsetToSingleTuple(x.get("charOffset")),
                       x) for x in sentenceGraph.tokens])
     indexByTokenId = {
         tokens[i][1].get("id"): i
         for i in range(len(tokens))
     }
     assert len(indexByTokenId) == len(
         tokens)  # check that there were no duplicate ids
     entityById = {x.get("id"): x for x in sentenceGraph.entities}
     events = {}
     for interaction in sentenceGraph.interactions:
         e1Id = interaction.get("e1")
         e2Id = interaction.get("e2")
         e1 = entityById[e1Id]
         e2 = entityById[e2Id]
         t1 = sentenceGraph.entityHeadTokenByEntity[e1]
         t2 = sentenceGraph.entityHeadTokenByEntity[e2]
         index1 = indexByTokenId[t1.get("id")]
         index2 = indexByTokenId[t2.get("id")]
         intSpan = abs(index1 - index2)
         self.interactionSpans[intSpan] = self.interactionSpans.get(
             intSpan, 0) + 1
         self.intSpan["min"] = min(self.intSpan.get("min"), intSpan)
         self.intSpan["max"] = max(self.intSpan.get("max"), intSpan)
         if interaction.get("event") == "True":
             if e1Id not in events:
                 events[e1Id] = {"min": 9999, "max": -9999}
             events[e1Id]["min"] = min(events[e1Id]["min"], index1, index2)
             events[e1Id]["max"] = max(events[e1Id]["max"], index1, index2)
     for eventId in sorted(events.keys()):
         eventSpan = events[eventId]["max"] - events[eventId]["min"]
         self.eventSpans[eventSpan] = self.eventSpans.get(eventSpan, 0) + 1
         self.eventSpan["min"] = min(self.eventSpan.get("min"), eventSpan)
         self.eventSpan["max"] = max(self.eventSpan.get("max"), eventSpan)
Example #45
def fixAltOffsets(input, output=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(
            1, "Fixing AltOffsets for sentence (" + sentence.get("id") + "): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i]
                altOffsets[i] = (altOffset[0] - sentOffset[0],
                                 altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1

    print >> sys.stderr, "Fixed", fixCount, "altOffsets"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example #46
def findHeadsDictionary(corpus, stringsFrom, parse, tokenization):
    print "Extracting triggers from", stringsFrom
    trigDict = getTriggers(stringsFrom)
    print "Determining trigger distribution"
    distDict = getDistribution(trigDict)
    allStrings = sorted(distDict.keys())
    print "Determining heads for", corpus
    corpusElements = Utils.InteractionXML.CorpusElements.loadCorpus(corpus, parse, tokenization, removeIntersentenceInteractions=False, removeNameInfo=False)
    cases = {}
    counts = [0,0]
    for sentence in corpusElements.sentences:
        #print sentence.sentence.get("id")
        sText = sentence.sentence.get("text")
        #tokenHeadScores = None
        for entity in sentence.entities:
            if entity.get("headOffset") != None:
                continue
            if entity.get("isName") == "True": # Only for triggers
                continue
            #if tokenHeadScores == None:
            #    tokenHeadScores = getTokenHeadScores(sentence.tokens, sentence.dependencies, sentenceId=sentence.sentence.get("id"))
            eText = entity.get("text")
            eType = entity.get("type")
            eOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            wsSplits = eText.split() # Split by whitespace
            if len(wsSplits) == 1 and eText.find("-") == -1: # unambiguous head will be assigned by SYNTAX pass
                continue
            else: # Entity text has multiple (whitespace or hyphen separated) parts
                candidates = []
                # Try to find entity substring in individual entity strings
                for wsTuple in mapSplits(wsSplits, eText, eOffset):
                    if not distDict.has_key(wsTuple[1]): # string not found, low score
                        candidates.append( ((-1, -1), wsTuple[2], wsTuple[0], wsTuple[1]) )
                    else: # String found, more common ones get higher score
                        assert distDict[wsTuple[1]].has_key(eType), (distDict[wsTuple[1]], wsTuple[1], eText)
                        candidates.append( (tuple(distDict[wsTuple[1]][eType]), wsTuple[2], wsTuple[0], wsTuple[1]) )
                # Split each whitespace-separated string further into hyphen-separated substrings
                for candidate in candidates[:]:
                    hyphenSplits = candidate[2].split("-")
                    if len(hyphenSplits) > 1: # Substring has a hyphen
                        # Try to find entity substring in individual entity strings
                        for hyphenTuple in mapSplits(hyphenSplits, eText, candidate[1]):
                            if not distDict.has_key(hyphenTuple[1]):
                                candidates.append( ((-1, -1), hyphenTuple[2], hyphenTuple[0], hyphenTuple[1]) )
                            else:
                                candidates.append( (tuple(distDict[hyphenTuple[1]][eType]), hyphenTuple[2], hyphenTuple[0], hyphenTuple[1]) )
            # Sort candidates, highest scores come first
            candidates.sort(reverse=True)
            # If no matches, look for substrings inside words
            if candidates[0][0][0] in [-1, 0]: # no matches, look for substrings
                print "Substring matching", candidates, "for entity", entity.get("id")
                for i in range(len(candidates)):
                    candidate = candidates[i]
                    cText = candidate[2]
                    for string in allStrings:
                        subStringPos = cText.find(string)
                        if subStringPos != -1:
                            print "  Substring match", string, cText,
                            score = tuple(distDict[string][eType])
                            if score > candidate[0]:
                                print score, candidate[0], "Substring selected" #, score > candidate[0], score < candidate[0]
                                subStringCoords = [candidate[1][0] + subStringPos, len(string)]
                                candidate = (score, subStringCoords, candidate[2], ">"+string+"<")
                            else:
                                print score, candidate[0]
                    candidates[i] = candidate
                # Resort after possibly replacing some candidates
                candidates.sort(reverse=True)
            if candidates[0][0][0] not in [-1, 0]: # if it is in [-1, 0], let SYNTAX pass take care of it
                candidateOffset = (candidates[0][1][0] + eOffset[0], candidates[0][1][0] + candidates[0][1][1] + eOffset[0]) 
                entity.set("headOffset", str(candidateOffset[0]) + "-" + str(candidateOffset[1]-1))
                entity.set("headMethod", "Dict")
                entity.set("headString", sText[candidateOffset[0]:candidateOffset[1]])
                counts[0] += 1
            # Prepare results for printing
            for i in range(len(candidates)):
                c = candidates[i]
                candidates[i] = (tuple(c[0]), c[2], c[3])
            case = (eType, eText, tuple(candidates))
            if not cases.has_key(case):
                cases[case] = 0
            cases[case] += 1
            print entity.get("id"), eType + ": '" + eText + "'", candidates    
            #headToken = getEntityHeadToken(entity, sentence.tokens, tokenHeadScores)
            # The ElementTree entity-element is modified by setting the headOffset attribute
            #entity.set("headOffset", headToken.get("charOffset"))
            #entity.set("headMethod", "Syntax")
    print "Cases"
    for case in sorted(cases.keys()):
        print case, cases[case]
    #return corpus
    return counts
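The candidate ranking above leans on Python's lexicographic tuple comparison:
sorting in reverse puts the highest (frequency, ...) score tuples first and
the (-1, -1) "not found" placeholders last. A standalone illustration with
invented values:

# Invented (score, string) candidate tuples; not from any real corpus.
candidates = [((-1, -1), "foo"), ((12, 3), "expression"), ((2, 1), "gene")]
candidates.sort(reverse=True)
assert candidates[0][1] == "expression"  # most frequent string wins
assert candidates[-1][0] == (-1, -1)     # unknown strings sort last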
Example No. 47
    def mapInteractions(self,
                        entityElements,
                        interactionElements,
                        verbose=False):
        """
        Maps the semantic interactions to the syntactic graph.

        Syntactic dependencies are defined between tokens. Semantic edges (interactions)
        are defined between annotated entities. To utilize the correlation of the dependency
        parse with the semantic interactions, the graphs must be aligned by mapping the
        interaction graph's nodes (entities) to the syntactic graph's nodes (tokens). This
        is done by determining the head tokens of the entities.

        @param entityElements: the semantic nodes (triggers and named entities)
        @type entityElements: list of cElementTree.Element objects
        @param interactionElements: the semantic edges (e.g. Cause and Theme for GENIA)
        @type interactionElements: list of cElementTree.Element objects
        @param verbose: Print selected head tokens on screen
        @type verbose: boolean

        Duplicate interactions are skipped in this function: for all gold
        interactions between the same two tokens, only one interaction of each
        interaction type is kept.
        """
        self.interactions = interactionElements
        self.entities = entityElements
        # Entities that have no text binding can not be mapped and are therefore removed
        for entity in self.entities[:]:
            if entity.get("charOffset") == "":
                self.entities.remove(entity)
        #self.interactionGraph = NX.XDiGraph(multiedges = multiedges)
        #if multiedges:
        #    self.interactionGraph = NX10.MultiDiGraph()
        #else:
        #    self.interactionGraph = NX10.DiGraph()
        self.interactionGraph = Graph()
        self.interactionGraph.addNodes(self.tokens)
        #for token in self.tokens:
        #    self.interactionGraph.add_node(token)

        self.entitiesByToken = {}  # a mapping for fast access
        self.entitiesById = {}
        self.entityHeadTokenByEntity = {}
        sentenceSpan = (0, len(self.sentenceElement.get("text")))  # for validating the entity offsets
        for entity in self.entities[:]:
            headToken = self.mapEntity(entity, verbose)
            if entity.tag != "entity":
                self.entities.remove(entity)
            elif headToken != None:
                self.entityHeadTokenByEntity[entity] = headToken
                self.entitiesById[entity.get("id")] = entity
            else:
                # Check that the entity is within the sentence
                if not Range.overlap(
                        Range.charOffsetToSingleTuple(
                            entity.get("charOffset")), sentenceSpan):
                    raise Exception("Entity " + entity.get("id") +
                                    ", charOffset " +
                                    entity.get("charOffset") +
                                    ", does not overlap with sentence " +
                                    self.sentenceElement.get("id") +
                                    ", length " + str(sentenceSpan[1]))
                # Assume there simply is no token corresponding to the entity
                self.entities.remove(entity)
        self._markNamedEntities()

        for interaction in self.interactions:

            if (not self.entitiesById.has_key(interaction.get("e1"))
                ):  #and self.entitiesById.has_key(interaction.get("e2")):
                continue  # e1 is outside of this sentence
                # assign the token1 to whatever the entity id (key) as a placeholder - to test the interaction statistics
                # token1 = self.entityHeadTokenByEntity[self.entitiesById[self.entitiesById.keys()[0]]]
                # token2 = self.entityHeadTokenByEntity[self.entitiesById[interaction.get("e2")]]
            if (not self.entitiesById.has_key(interaction.get("e2"))
                ):  #and self.entitiesById.has_key(interaction.get("e1")):
                continue  # e2 is outside of this sentence
                # token1 = self.entityHeadTokenByEntity[self.entitiesById[interaction.get("e1")]]
                # token2 = self.entityHeadTokenByEntity[self.entitiesById[self.entitiesById.keys()[0]]]
            if self.entitiesById.has_key(
                    interaction.get("e1")) and self.entitiesById.has_key(
                        interaction.get("e2")):
                token1 = self.entityHeadTokenByEntity[self.entitiesById[
                    interaction.get("e1")]]
                token2 = self.entityHeadTokenByEntity[self.entitiesById[
                    interaction.get("e2")]]
            # else:
            #     token1 = self.entityHeadTokenByEntity[self.entitiesById[self.entitiesById.keys()[0]]]
            #     token2 = self.entityHeadTokenByEntity[self.entitiesById[self.entitiesById.keys()[0]]]

#            found = False
#            if multiedges:
#                edges = self.interactionGraph.get_edge_data(token1, token2, default={})
#                for i in range(len(edges)):
#                    edge = edges[i]["element"]
#                    if edge.attrib["type"] == interaction.attrib["type"]:
#                        found = True
#                        break
#            if not found:
#                self.interactionGraph.add_edge(token1, token2, element=interaction)
#            else:
#                self.duplicateInteractionEdgesRemoved += 1
            found = False
            edges = self.interactionGraph.getEdges(token1, token2)
            for edge in edges:
                if edge[2].get("type") == interaction.get("type"):
                    found = True
                    break
            if not found:
                self.interactionGraph.addEdge(token1, token2, interaction)
            else:
                # TODO: "skipped" would be better than "removed"
                self.duplicateInteractionEdgesRemoved += 1
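A minimal sketch of the duplicate check at the end of mapInteractions: an
interaction edge is added only if no edge of the same type already connects
the two tokens. Plain tuples stand in for the Graph edges here.

# Sketch of the duplicate-edge check; tuples stand in for graph edges.
def addEdgeOnce(edges, token1, token2, intType):
    for t1, t2, existingType in edges:
        if t1 == token1 and t2 == token2 and existingType == intType:
            return False  # same-type edge already present, skip it
    edges.append((token1, token2, intType))
    return True

edges = []
assert addEdgeOnce(edges, "t1", "t2", "Theme")
assert not addEdgeOnce(edges, "t1", "t2", "Theme")  # duplicate skipped
assert addEdgeOnce(edges, "t1", "t2", "Cause")      # new type is kept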
Example No. 48
 def prepareTokens(self, tokens):
     tokenTuples = []
     for token in tokens:
         tokenTuples.append((Range.charOffsetToSingleTuple(token.get("charOffset")), token))
     return tokenTuples
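The point of the (offset, token) tuples is that they make tokens sortable by
character position. A self-contained sketch, with parseOffset standing in for
Range.charOffsetToSingleTuple and assuming the "begin-end" charOffset string
format used throughout this code:

# parseOffset stands in for Range.charOffsetToSingleTuple (assumed format).
def parseOffset(charOffset):
    begin, end = charOffset.split("-")
    return (int(begin), int(end))

tokenTuples = [(parseOffset(o), {"charOffset": o}) for o in ("10-14", "0-4", "5-9")]
tokenTuples.sort()  # tuples sort by offset, i.e. by text position
assert [t[0] for t in tokenTuples] == [(0, 4), (5, 9), (10, 14)]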
Example No. 49
    def mapEntity(self, entityElement, verbose=False):
        """
        Determine the head token for a named entity or trigger. The head token is the token closest
        to the root for the subtree of the dependency parse spanned by the text of the element.

        @param entityElement: a semantic node (trigger or named entity)
        @type entityElement: cElementTree.Element
        @param verbose: Print selected head tokens on screen
        @type verbose: boolean
        """
        headOffset = None
        if entityElement.get("headOffset") != None:
            headOffset = Range.charOffsetToSingleTuple(
                entityElement.get("headOffset"))
        if entityElement.get("charOffset") != "":
            charOffsets = Range.charOffsetToTuples(
                entityElement.get("charOffset"))
        else:
            charOffsets = []
        # Each entity can consist of multiple syntactic tokens, covered by its
        # charOffset-range. One of these must be chosen as the head token.
        headTokens = []  # potential head tokens
        for token in self.tokens:
            #print token.attrib["id"], token.attrib["charOffset"]
            tokenOffset = Range.charOffsetToSingleTuple(
                token.get("charOffset"))
            if headOffset != None and entityElement.get("type") != "Binding":
                # A head token can already be defined in the headOffset-attribute.
                # However, depending on the tokenization, even this range may
                # contain multiple tokens. Still, it can always be assumed that
                # if headOffset is defined, the correct head token is in this range.
                if Range.overlap(headOffset, tokenOffset):
                    headTokens.append(token)
            else:
                for offset in charOffsets:
                    if Range.overlap(offset, tokenOffset):
                        headTokens.append(token)
        if len(headTokens) == 1:  # An unambiguous head token was found
            token = headTokens[0]
        else:  # One head token must be chosen from the candidates
            selHead = None
            if entityElement.get("type") == "Binding":
                for t in headTokens:
                    compText = t.get("text").lower()
                    for bindWord in ("bind", "complex", "h**o", "hetero",
                                     "dimer"):
                        if bindWord in compText:
                            selHead = t
                            break
                    if selHead != None:
                        break
#                     if compText.find("bind") != -1 or compText.find("complex") != -1:
#                         selHead = t
#                         #print "Head:", selHead.get("text"), "/", entityElement.get("text"), entityElement.get("headOffset"), selHead.get("charOffset")
#                         entityElement.set("headOffset", selHead.get("charOffset"))
#                         break
#             elif "egulation" in entityElement.get("type"):
#                 self.getTokenHeadScores()
#                 regulationHeads = [x for x in headTokens if self.tokenHeadScores[x] >= 1]
#                 if len(regulationHeads) > 0:
#                     selHead = regulationHeads[-1]
            if selHead == None:
                token = self.findHeadToken(headTokens)
            else:
                token = selHead
            if verbose:
                print >> sys.stderr, "Selected head:", token.get(
                    "id"), token.get("text")
        #assert token != None, entityElement.get("id")
        if token != None:
            # The ElementTree entity-element is modified by setting the headOffset attribute
            if entityElement.get("headOffset") == None or entityElement.get(
                    "headOffset") != token.get("charOffset"):
                entityElement.set("headOffset", token.get("charOffset"))
            if not self.entitiesByToken.has_key(token):
                self.entitiesByToken[token] = []
            self.entitiesByToken[token].append(entityElement)
        else:
            print >> sys.stderr, "Warning, no tokens for entity", entityElement.get(
                "id")
        return token
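A standalone sketch of the Binding special case above: when several candidate
head tokens exist, the first token whose text contains one of the
binding-related keywords is preferred, and only if none matches does the
syntactic head selection run. Plain dicts stand in for token elements.

# Keyword scan mirroring the Binding branch of mapEntity.
def pickBindingHead(headTokens):
    for t in headTokens:
        compText = t["text"].lower()
        for bindWord in ("bind", "complex", "homo", "hetero", "dimer"):
            if bindWord in compText:
                return t
    return None  # fall back to findHeadToken

tokens = [{"text": "STAT3"}, {"text": "homodimer"}]
assert pickBindingHead(tokens)["text"] == "homodimer"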
Example No. 50
def findHeadsDictionary(corpus, stringsFrom, parse, tokenization):
    print "Extracting triggers from", stringsFrom
    trigDict = getTriggers(stringsFrom)
    print "Determining trigger distribution"
    distDict = getDistribution(trigDict)
    allStrings = sorted(distDict.keys())
    print "Determining heads for", corpus
    corpusElements = Utils.InteractionXML.CorpusElements.loadCorpus(
        corpus,
        parse,
        tokenization,
        removeIntersentenceInteractions=False,
        removeNameInfo=False)
    cases = {}
    counts = [0, 0]
    for sentence in corpusElements.sentences:
        #print sentence.sentence.get("id")
        sText = sentence.sentence.get("text")
        #tokenHeadScores = None
        for entity in sentence.entities:
            if entity.get("headOffset") != None:
                continue
            if entity.get("given") == "True":  # Only for triggers
                continue
            #if tokenHeadScores == None:
            #    tokenHeadScores = getTokenHeadScores(sentence.tokens, sentence.dependencies, sentenceId=sentence.sentence.get("id"))
            eText = entity.get("text")
            eType = entity.get("type")
            eOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            wsSplits = eText.split()  # Split by whitespace
            if len(wsSplits) == 1 and eText.find("-") == -1:  # unambiguous head will be assigned by SYNTAX pass
                continue
            else:  # Entity text has multiple (whitespace or hyphen separated) parts
                candidates = []
                # Try to find entity substring in individual entity strings
                for wsTuple in mapSplits(wsSplits, eText, eOffset):
                    if not distDict.has_key(
                            wsTuple[1]):  # string not found, low score
                        candidates.append(
                            ((-1, -1), wsTuple[2], wsTuple[0], wsTuple[1]))
                    else:  # String found, more common ones get higher score
                        assert distDict[wsTuple[1]].has_key(eType), (
                            distDict[wsTuple[1]], wsTuple[1], eText)
                        candidates.append((tuple(distDict[wsTuple[1]][eType]),
                                           wsTuple[2], wsTuple[0], wsTuple[1]))
                # Split each whitespace-separated string further into hyphen-separated substrings
                for candidate in candidates[:]:
                    hyphenSplits = candidate[2].split("-")
                    if len(hyphenSplits) > 1:  # Substring has a hyphen
                        # Try to find entity substring in individual entity strings
                        for hyphenTuple in mapSplits(hyphenSplits, eText,
                                                     candidate[1]):
                            if not distDict.has_key(hyphenTuple[1]):
                                candidates.append(
                                    ((-1, -1), hyphenTuple[2], hyphenTuple[0],
                                     hyphenTuple[1]))
                            else:
                                candidates.append(
                                    (tuple(distDict[hyphenTuple[1]][eType]),
                                     hyphenTuple[2], hyphenTuple[0],
                                     hyphenTuple[1]))
            # Sort candidates, highest scores come first
            candidates.sort(reverse=True)
            # If no matches, look for substrings inside words
            if candidates[0][0][0] in [-1, 0]:  # no matches, look for substrings
                print "Substring matching", candidates, "for entity", entity.get("id")
                for i in range(len(candidates)):
                    candidate = candidates[i]
                    cText = candidate[2]
                    for string in allStrings:
                        subStringPos = cText.find(string)
                        if subStringPos != -1:
                            print "  Substring match", string, cText,
                            score = tuple(distDict[string][eType])
                            if score > candidate[0]:
                                print score, candidate[0], "Substring selected"  #, score > candidate[0], score < candidate[0]
                                subStringCoords = [
                                    candidate[1][0] + subStringPos,
                                    len(string)
                                ]
                                candidate = (score, subStringCoords,
                                             candidate[2], ">" + string + "<")
                            else:
                                print score, candidate[0]
                    candidates[i] = candidate
                # Resort after possibly replacing some candidates
                candidates.sort(reverse=True)
            if candidates[0][0][0] not in [-1, 0]:  # if it is in [-1, 0], let SYNTAX pass take care of it
                candidateOffset = (candidates[0][1][0] + eOffset[0],
                                   candidates[0][1][0] + candidates[0][1][1] +
                                   eOffset[0])
                entity.set(
                    "headOffset",
                    str(candidateOffset[0]) + "-" +
                    str(candidateOffset[1] - 1))
                entity.set("headMethod", "Dict")
                entity.set("headString",
                           sText[candidateOffset[0]:candidateOffset[1]])
                counts[0] += 1
            # Prepare results for printing
            for i in range(len(candidates)):
                c = candidates[i]
                candidates[i] = (tuple(c[0]), c[2], c[3])
            case = (eType, eText, tuple(candidates))
            if not cases.has_key(case):
                cases[case] = 0
            cases[case] += 1
            print entity.get("id"), eType + ": '" + eText + "'", candidates
Example No. 51
def addEntitiesToSTDoc(doc, docElement, tMap, eMap, entityElementMap, useOrigIds=False):
    containerElements = [docElement] + [x for x in docElement.getiterator("sentence")]
    for containerElement in containerElements:
        for entity in containerElement.findall("entity"):
            eType = entity.get("type")
            if eType == "neg": # skip negative predictions if they are present
                continue
            assert entity.get("id") != None
            entityElementMap[entity.get("id")] = entity
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            ann = Annotation()
            ann.type = eType
            if useOrigIds:
                entityOrigId = entity.get("origId")
                if entityOrigId != None and entityOrigId.find(".") != -1: # fix gluing of doc and ann id
                    entityOrigId = entityOrigId.rsplit(".",1)[-1]
                if entityOrigId != None:
                    if entityOrigId[0] == "E": # a special id denoting a numbered, but triggerless event
                        ann.eventId = entityOrigId
                        ann.id = None
                    else:
                        ann.id = entityOrigId
            ann.text = entity.get("text")
            if entity.get("normalization") != None:
                ann.normalization = entity.get("normalization")
            #assert entityOffset[1] - entityOffset[0] in [len(ann.text), len(ann.text) - 1], (ann.text, entityOffset)
            ann.charOffsets = entityOffsets
            #ann.charBegin = entityOffset[0]
            #ann.charEnd = entityOffset[0] + len(ann.text) # entityOffset[1] + 1
            if containerElement.tag == "sentence": # entity offset is relative to the container element, and for sentences, they can be relative to the document
                sentenceOffset = Range.charOffsetToSingleTuple(containerElement.get("charOffset"))
                for i in range(len(ann.charOffsets)):
                    ann.charOffsets[i] = (ann.charOffsets[i][0] + sentenceOffset[0], ann.charOffsets[i][1] + sentenceOffset[0]) 
                #ann.charBegin += sentenceOffset[0]
                #ann.charEnd += sentenceOffset[0]
#            idStem = entity.get("id").split(".e", 1)[0]
#            if sentenceOffsets.has_key(idStem):
#                sentenceOffset = sentenceOffsets[idStem]
#                ann.charBegin += sentenceOffset[0]
#                ann.charEnd += sentenceOffset[0]
            if entity.get("speculation") == "True":
                ann.speculation = True
            if entity.get("negation") == "True":
                ann.negation = True
            ann.extra = getExtraFromElement(entity) # add all scores and extra data
            if entity.get("given") == "True":
                # Remember to use original id for names!
                if entity.get("origId") != None:
                    ann.id = entity.get("origId").rsplit(".", 1)[-1]
                    assert ann.id[0].isupper(), ann.id
                    for c in ann.id[1:]:
                        assert c.isdigit(), ann.id
                doc.proteins.append(ann)
                tMap[entity.get("id")] = ann
                # The part below is dangerous, and incompatibilities should be handled rather
                # by not converting to the shared task format when it cannot be done 
                #if entity.get("origId") != None:
                #    # Attempt to process origId, assuming it corresponds to the BioNLP Shared Task format
                #    nonNamedEntityOrigId = entity.get("origId").rsplit(".", 1)[-1]
                #    if len(nonNamedEntityOrigId) > 1 and nonNamedEntityOrigId[0].isupper() and nonNamedEntityOrigId[1:].isdigit():
                #        ann.id = nonNamedEntityOrigId
                #stDoc.proteins.append(ann)
            else: # a predicted protein or trigger
                duplicateAnn = findDuplicateForSTTrigger(ann, doc.triggers)
                if duplicateAnn == None:
                    doc.triggers.append(ann)
                    tMap[entity.get("id")] = ann
                    # Add confidence scores
                    #ann.extra = getExtraFromElement(entity, ["conf"])
                    #ann.triggerScores = entity.get("predictions")
                    #ann.unmergingScores = entity.get("umStrength")
                    #ann.speculationScores = entity.get("modPred")
                    #ann.negationScores = entity.get("modPred")
                    # Events with 0 interactions (such as some Process-type events) would not be formed when constructing events based on interactions
                    if entity.get("event") == "True":
                        event = makeSTEvent(ann, entityElementMap[entity.get("id")])
                        eMap[entity.get("id")] = event
                        doc.events.append(event)
                else: # a duplicate trigger already exists
                    tMap[entity.get("id")] = duplicateAnn
Example No. 52
def moveElements(document):
    entMap = {}
    entSentence = {}
    entSentenceIndex = {}
    sentences = document.findall("sentence")
    sentenceCount = 0
    for sentence in sentences:
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            overlaps = False
            for entityOffset in entityOffsets:
                if Range.overlap(sentenceOffset, entityOffset):
                    overlaps = True
                    break
            if overlaps:
                document.remove(entity)
                sentence.append(entity)
                entityId = entity.get("id")
                entityIdLastPart = entityId.rsplit(".", 1)[-1]
                if entityIdLastPart.startswith("e"):
                    entity.set("id", sentence.get("id") + "." + entityIdLastPart)
                    entMap[entityId] = sentence.get("id") + "." + entityIdLastPart
                else:
                    entity.set("docId", entityId)
                    entity.set("id", sentence.get("id") + ".e" + str(entCount))
                    entMap[entityId] = sentence.get("id") + ".e" + str(entCount)
                entSentence[entityId] = sentence
                entSentenceIndex[entityId] = sentenceCount
                #newEntityOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                newEntityOffsets = []
                for entityOffset in entityOffsets:
                    newOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                    newOffset = (max(0, newOffset[0]), max(0, newOffset[1]))
                    if newOffset != (0, 0):
                        assert newOffset[1] > newOffset[0], (entity.attrib, entityOffsets, sentenceOffset)
                        newEntityOffsets.append( (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0]) )
                assert len(newEntityOffsets) > 0, (entity.attrib, entityOffsets, sentenceOffset)
                entity.set("origOffset", entity.get("charOffset"))
                #entity.set("charOffset", str(newEntityOffset[0]) + "-" + str(newEntityOffset[1]))
                entity.set("charOffset", Range.tuplesToCharOffset(newEntityOffsets)) 
                entCount += 1
        sentenceCount += 1
    if len([x for x in document.findall("entity")]) != 0:
        raise Exception("Sentence splitting does not cover the entire document")
    # Move interactions
    intCount = 0
    interactions = []
    interactionOldToNewId = {}
    for interaction in document.findall("interaction"):
        interactions.append(interaction)
        #if entSentenceIndex[interaction.get("e1")] < entSentenceIndex[interaction.get("e2")]:
        #    targetSentence = entSentence[interaction.get("e1")]
        #else:
        #    targetSentence = entSentence[interaction.get("e2")]
        
        # Interactions go to a sentence always by e1, as this is the event they are an argument of.
        # If an intersentence interaction is a relation, this shouldn't matter.
        targetSentence = entSentence[interaction.get("e1")]  
        document.remove(interaction)
        targetSentence.append(interaction)
        newId = targetSentence.get("id") + ".i" + str(intCount)
        interactionOldToNewId[interaction.get("id")] = newId
        interaction.set("id", newId)
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
        intCount += 1
    for interaction in interactions:
        if interaction.get("siteOf") != None:
            interaction.set("siteOf", interactionOldToNewId[interaction.get("siteOf")])
Example No. 53
def orderTokens(token1, token2):
    offset1 = Range.charOffsetToSingleTuple(token1.get("charOffset"))
    offset2 = Range.charOffsetToSingleTuple(token2.get("charOffset"))
    return Range.order(offset1, offset2)
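orderTokens is a cmp-style comparator, so under Python 2 it can be passed
straight to sort(). A self-contained equivalent, assuming Range.order behaves
like the built-in cmp() on offset tuples:

# Python 2 sketch; cmp() and cmp-style sort arguments are gone in Python 3.
def orderByOffset(tok1, tok2):
    o1 = tuple(int(x) for x in tok1["charOffset"].split("-"))
    o2 = tuple(int(x) for x in tok2["charOffset"].split("-"))
    return cmp(o1, o2)

toks = [{"charOffset": "5-9"}, {"charOffset": "0-4"}]
toks.sort(orderByOffset)
assert toks[0]["charOffset"] == "0-4"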
Example No. 54
    def buildExample(self,
                     token1,
                     token2,
                     paths,
                     sentenceGraph,
                     categoryName,
                     entity1=None,
                     entity2=None,
                     structureAnalyzer=None,
                     isDirected=True):
        """
        Build a single directed example for the potential edge between token1 and token2
        """
        # define features
        features = {}
        if not self.styles["no_path"]:
            path = paths.getPaths(token1, token2)
            if len(path) > 0:
                path = path[0]
                pathExists = True
            else:
                path = [token1, token2]
                pathExists = False
        else:
            path = [token1, token2]
            pathExists = False

        if not self.styles["no_trigger_features"]:  # F 85.52 -> 85.55
            self.triggerFeatureBuilder.setFeatureVector(features)
            self.triggerFeatureBuilder.tag = "trg1_"
            self.triggerFeatureBuilder.buildFeatures(token1)
            self.triggerFeatureBuilder.tag = "trg2_"
            self.triggerFeatureBuilder.buildFeatures(token2)
            self.triggerFeatureBuilder.setFeatureVector(None)
        # REL features
        if self.styles["rel_features"] and not self.styles["no_task"]:
            self.relFeatureBuilder.setFeatureVector(features)
            self.relFeatureBuilder.tag = "rel1_"
            self.relFeatureBuilder.buildAllFeatures(
                sentenceGraph.tokens, sentenceGraph.tokens.index(token1))
            self.relFeatureBuilder.tag = "rel2_"
            self.relFeatureBuilder.buildAllFeatures(
                sentenceGraph.tokens, sentenceGraph.tokens.index(token2))
            self.relFeatureBuilder.setFeatureVector(None)
        if self.styles["bacteria_renaming"] and not self.styles["no_task"]:
            self.bacteriaRenamingFeatureBuilder.setFeatureVector(features)
            self.bacteriaRenamingFeatureBuilder.buildPairFeatures(
                entity1, entity2)
            #self.bacteriaRenamingFeatureBuilder.buildSubstringFeatures(entity1, entity2) # decreases perf. 74.76 -> 72.41
            self.bacteriaRenamingFeatureBuilder.setFeatureVector(None)
        if self.styles["co_features"] and not self.styles["no_task"]:
            e1Offset = Range.charOffsetToSingleTuple(entity1.get("charOffset"))
            e2Offset = Range.charOffsetToSingleTuple(entity2.get("charOffset"))
            if Range.contains(e1Offset, e2Offset):
                features[self.featureSet.getId("e1_contains_e2")] = 1
                if entity2.get("given") == "True":
                    features[self.featureSet.getId("e1_contains_e2name")] = 1
            if Range.contains(e2Offset, e1Offset):
                features[self.featureSet.getId("e2_contains_e1")] = 1
                if entity1.get("given") == "True":
                    features[self.featureSet.getId("e2_contains_e1name")] = 1
        if self.styles["ddi_features"]:
            self.drugFeatureBuilder.setFeatureVector(features)
            self.drugFeatureBuilder.tag = "ddi_"
            self.drugFeatureBuilder.buildPairFeatures(entity1, entity2)
            if self.styles["ddi_mtmx"]:
                self.drugFeatureBuilder.buildMTMXFeatures(entity1, entity2)
            self.drugFeatureBuilder.setFeatureVector(None)
        if self.styles["graph_kernel"]:
            self.graphKernelFeatureBuilder.setFeatureVector(
                features, entity1, entity2)
            self.graphKernelFeatureBuilder.buildGraphKernelFeatures(
                sentenceGraph, path)
            self.graphKernelFeatureBuilder.setFeatureVector(None)
        if self.styles["entity_type"]:
            e1Type = self.multiEdgeFeatureBuilder.getEntityType(entity1)
            e2Type = self.multiEdgeFeatureBuilder.getEntityType(entity2)
            features[self.featureSet.getId("e1_" + e1Type)] = 1
            features[self.featureSet.getId("e2_" + e2Type)] = 1
            features[self.featureSet.getId("distance_" + str(len(path)))] = 1
        if not self.styles["no_dependency"]:
            #print "Dep features"
            self.multiEdgeFeatureBuilder.setFeatureVector(
                features, entity1, entity2)
            #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast
            if not self.styles["disable_entity_features"]:
                self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
            self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
            if not self.styles["disable_terminus_features"]:
                self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(
                    path, sentenceGraph)  # remove for fast
            if not self.styles["disable_single_element_features"]:
                self.multiEdgeFeatureBuilder.buildSingleElementFeatures(
                    path, sentenceGraph)
            if not self.styles["disable_ngram_features"]:
                #print "NGrams"
                self.multiEdgeFeatureBuilder.buildPathGrams(
                    2, path, sentenceGraph)  # remove for fast
                self.multiEdgeFeatureBuilder.buildPathGrams(
                    3, path, sentenceGraph)  # remove for fast
                self.multiEdgeFeatureBuilder.buildPathGrams(
                    4, path, sentenceGraph)  # remove for fast
            #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast
            #if edges != None:
            #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast
            #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast
            if not self.styles["disable_path_edge_features"]:
                self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(
                    path, sentenceGraph)
            self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
            self.multiEdgeFeatureBuilder.setFeatureVector(None)
        if self.styles["nodalida"]:
            self.nodalidaFeatureBuilder.setFeatureVector(
                features, entity1, entity2)
            shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(
                sentenceGraph.dependencyGraph, path)
            print shortestPaths
            if len(shortestPaths) > 0:
                self.nodalidaFeatureBuilder.buildNGrams(
                    shortestPaths, sentenceGraph)
            self.nodalidaFeatureBuilder.setFeatureVector(None)
        if self.styles["linear_features"]:
            self.tokenFeatureBuilder.setFeatureVector(features)
            for i in range(len(sentenceGraph.tokens)):
                if sentenceGraph.tokens[i] == token1:
                    token1Index = i
                if sentenceGraph.tokens[i] == token2:
                    token2Index = i
            linearPreTag = "linfw_"
            if token1Index > token2Index:
                token1Index, token2Index = token2Index, token1Index
                linearPreTag = "linrv_"
            self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index,
                                                              sentenceGraph,
                                                              2,
                                                              2,
                                                              preTag="linTok1")
            self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index,
                                                              sentenceGraph,
                                                              2,
                                                              2,
                                                              preTag="linTok2")
            # Before, middle, after
            #                self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf")
            #                self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw")
            #                self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af")
            # before-middle, middle, middle-after
            #                    self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2)
            #                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2)
            #                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2)
            self.tokenFeatureBuilder.setFeatureVector(None)
        if self.styles["random"]:
            self.randomFeatureBuilder.setFeatureVector(features)
            self.randomFeatureBuilder.buildRandomFeatures(100, 0.01)
            self.randomFeatureBuilder.setFeatureVector(None)
        if self.styles["genia_features"] and not self.styles["no_task"]:
            e1Type = entity1.get("type")
            e2Type = entity2.get("type")
            assert (entity1.get("given") in (None, "False"))
            if entity2.get("given") == "True":
                features[self.featureSet.getId("GENIA_target_protein")] = 1
            else:
                features[self.featureSet.getId("GENIA_nested_event")] = 1
            if e1Type.find("egulation") != -1:  # leave the "r" out to avoid problems with capitalization
                if entity2.get("given") == "True":
                    features[self.featureSet.getId(
                        "GENIA_regulation_of_protein")] = 1
                else:
                    features[self.featureSet.getId(
                        "GENIA_regulation_of_event")] = 1
        if self.styles["bi_features"]:
            # Make features based on entity types
            e1Type = entity1.get("type")
            e2Type = entity2.get("type")
            e1SuperType = str(self.getBISuperType(e1Type))
            e2SuperType = str(self.getBISuperType(e2Type))
            features[self.featureSet.getId("BI_e1_" + e1Type)] = 1
            features[self.featureSet.getId("BI_e2_" + e2Type)] = 1
            features[self.featureSet.getId("BI_e1sup_" + e1SuperType)] = 1
            features[self.featureSet.getId("BI_e2sup_" + e2SuperType)] = 1
            features[self.featureSet.getId("BI_e1e2_" + e1Type + "_" +
                                           e2Type)] = 1
            features[self.featureSet.getId("BI_e1e2sup_" + e1SuperType + "_" +
                                           e2SuperType)] = 1
        if self.styles["evex"]:
            self.evexFeatureBuilder.setFeatureVector(features, entity1,
                                                     entity2)
            self.evexFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1,
                                                      token2, path,
                                                      sentenceGraph)
            self.evexFeatureBuilder.setFeatureVector(None)
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder.setFeatureVector(
                features, entity1, entity2)
            self.giulianoFeatureBuilder.buildEdgeFeatures(
                entity1, entity2, token1, token2, path, sentenceGraph)
            self.giulianoFeatureBuilder.setFeatureVector(None)

        # define extra attributes
        if int(path[0].get("charOffset").split("-")[0]) < int(
                path[-1].get("charOffset").split("-")[0]):
            extra = {
                "xtype": "edge",
                "type": "i",
                "t1": path[0].get("id"),
                "t2": path[-1].get("id")
            }
            extra["deprev"] = False
        else:
            extra = {
                "xtype": "edge",
                "type": "i",
                "t1": path[-1].get("id"),
                "t2": path[0].get("id")
            }
            extra["deprev"] = True
        if entity1 != None:
            extra["e1"] = entity1.get("id")
            if sentenceGraph.mergedEntityToDuplicates != None:
                extra["e1DuplicateIds"] = ",".join([
                    x.get("id")
                    for x in sentenceGraph.mergedEntityToDuplicates[entity1]
                ])
        if entity2 != None:
            extra["e2"] = entity2.get("id")
            if sentenceGraph.mergedEntityToDuplicates != None:
                extra["e2DuplicateIds"] = ",".join([
                    x.get("id")
                    for x in sentenceGraph.mergedEntityToDuplicates[entity2]
                ])
        extra["categoryName"] = categoryName
        if self.styles["bacteria_renaming"]:
            if entity1.get("text") != None and entity1.get("text") != "":
                extra["e1t"] = entity1.get("text").replace(" ", "---").replace(
                    ":", "-COL-")
            if entity2.get("text") != None and entity2.get("text") != "":
                extra["e2t"] = entity2.get("text").replace(" ", "---").replace(
                    ":", "-COL-")
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId != None:
            extra["SOID"] = sentenceOrigId
        extra["directed"] = str(isDirected)

        return (categoryName, features, extra)
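A standalone sketch of the "deprev" bookkeeping at the end of buildExample:
the token pair is always stored in left-to-right text order, and deprev
records whether that meant reversing the original direction. Offsets and ids
below are invented.

# Sketch of the left-to-right normalization; values are invented.
def orientPair(begin1, begin2, id1, id2):
    if begin1 < begin2:
        return {"t1": id1, "t2": id2, "deprev": False}
    return {"t1": id2, "t2": id1, "deprev": True}

assert orientPair(40, 12, "tA", "tB") == {"t1": "tB", "t2": "tA", "deprev": True}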
Example No. 55
def toSTFormat(
    input, output=None, outputTag="a2", useOrigIds=False, debug=False, task=2, validate=True, writeScores=False
):
    print >>sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >>sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    nonEntitySiteCount = 0
    documents = []
    for document in corpusRoot.findall("document"):
        stDoc = Document()
        stDoc.proteins = []
        stDoc.triggers = []
        stDoc.events = []
        stDoc.relations = []
        stDoc.id = document.get("pmid")
        if stDoc.id == None:
            stDoc.id = document.get("origId")
        stDoc.text = ""
        documents.append(stDoc)
        eMap = {}
        tMap = {}
        siteMap = {}
        siteScores = {}
        sites = []
        sentenceOffsets = {}
        for sentence in document.findall("sentence"):
            head = sentence.get("head")
            if head != None:
                stDoc.text += head
            stDoc.text += sentence.get("text")
            tail = sentence.get("tail")
            if tail != None:
                stDoc.text += tail
            sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
            sentenceOffsets[sentence.get("id")] = sentenceOffset
            if stDoc.id == None:
                stDoc.id = sentence.get("origId").rsplit(".", 1)[0]
        entityElementMap = {}  # for task 3
        for entity in document.getiterator("entity"):
            eType = entity.get("type")
            if eType == "neg":
                continue
            entityElementMap[entity.get("id")] = entity
            entityOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            ann = Annotation()
            ann.type = eType
            if useOrigIds:
                entityOrigId = entity.get("origId")
                if entityOrigId != None and entityOrigId.find(".") != -1:  # fix gluing of doc and ann id
                    entityOrigId = entityOrigId.rsplit(".", 1)[-1]
                if entityOrigId != None:
                    if entityOrigId[0] == "E":  # a special id denoting a numbered, but triggerless event
                        ann.eventId = entityOrigId
                        ann.id = None
                    else:
                        ann.id = entityOrigId
            ann.text = entity.get("text")
            assert entityOffset[1] - entityOffset[0] in [len(ann.text), len(ann.text) - 1], (ann.text, entityOffset)
            ann.charBegin = entityOffset[0]
            ann.charEnd = entityOffset[0] + len(ann.text)  # entityOffset[1] + 1
            idStem = entity.get("id").split(".e", 1)[0]
            if sentenceOffsets.has_key(idStem):
                sentenceOffset = sentenceOffsets[idStem]
                ann.charBegin += sentenceOffset[0]
                ann.charEnd += sentenceOffset[0]
            if entity.get("speculation") == "True":
                ann.speculation = True
            if entity.get("negation") == "True":
                ann.negation = True
            if entity.get("isName") == "True":
                # Remember to use original id for names!
                if entity.get("origId") != None:
                    ann.id = entity.get("origId").rsplit(".", 1)[-1]
                    assert ann.id[0].isupper(), ann.id
                    for c in ann.id[1:]:
                        assert c.isdigit(), ann.id
                stDoc.proteins.append(ann)
                # The part below is dangerous, and incompatibilities should be handled rather
                # by not converting to the shared task format when it cannot be done
                # if entity.get("origId") != None:
                #    # Attempt to process origId, assuming it corresponds to the BioNLP Shared Task format
                #    nonNamedEntityOrigId = entity.get("origId").rsplit(".", 1)[-1]
                #    if len(nonNamedEntityOrigId) > 1 and nonNamedEntityOrigId[0].isupper() and nonNamedEntityOrigId[1:].isdigit():
                #        ann.id = nonNamedEntityOrigId
                # stDoc.proteins.append(ann)
            else:
                found = False  # prevent duplicate triggers
                for trigger in stDoc.triggers:
                    if (
                        trigger.charBegin == ann.charBegin
                        and trigger.charEnd == ann.charEnd
                        and trigger.text == ann.text
                        and trigger.type == ann.type
                    ):
                        found = True
                        ann = trigger
                        break
                if not found:
                    stDoc.triggers.append(ann)
            assert entity.get("id") != None
            tMap[entity.get("id")] = ann
            if entity.get("type") == "Process":  # these can have 0 interactions
                event = Annotation()
                event.trigger = ann
                event.type = event.trigger.type
                eMap[entity.get("id")] = event
                if entityElementMap[entity.get("id")].get("speculation") == "True":
                    event.speculation = True
                if entityElementMap[entity.get("id")].get("negation") == "True":
                    event.negation = True
                stDoc.events.append(event)
            # Add confidence scores
            ann.triggerScores = entity.get("predictions")
            ann.unmergingScores = entity.get("umStrength")
            ann.speculationScores = entity.get("modPred")
            ann.negationScores = entity.get("modPred")
        # First map Coref proteins
        corefProtMap = {}
        for interaction in document.getiterator("interaction"):
            intType = interaction.get("type")
            if intType == "Target":
                e1 = interaction.get("e1")
                e2 = interaction.get("e2")
                if not tMap.has_key(e2):
                    print >>sys.stderr, "Warning, no trigger for Coref Protein Target"
                    continue
                e2 = tMap[e2]
                if not corefProtMap.has_key(e1):
                    corefProtMap[e1] = []
                if not e2 in corefProtMap[e1]:
                    corefProtMap[e1].append(e2)
        # Then process all interactions
        for interaction in document.getiterator("interaction"):
            intType = interaction.get("type")
            if intType == "neg" or intType == "Target":
                continue  # Targets have already been put into a dictionary
            # elif intType in ["Site", "Gene_expression", "Transcription", "Protein_catabolism", "Localization", "Binding", "Phosphorylation", "Positive_regulation", "Negative_regulation", "Regulation"]:
            # elif intType in ["Site", "Gene_expression", "Transcription", "Protein_catabolism", "Localization", "Binding", "Phosphorylation", "Positive_regulation", "Negative_regulation", "Regulation",
            #                 "InputAssociation", "InputProcess", "InputInhibitor", "OutputProcess"]:
            if "/" in intType and "(" in intType:  # BI-task
                eventType, argTypes = intType.split("(")
                arg1Type, arg2Type = argTypes[:-1].split("/")
                event = Annotation()
                event.trigger = None  # triggerless event (same as relation)
                event.type = eventType
                event.arguments.append([arg1Type, interaction.get("e1"), None])
                event.arguments.append([arg2Type, interaction.get("e2"), None])
                if event.arguments[0][0] == "SiteArg":  # convert back to actual sites
                    event.arguments[0][0] = "Site"
                if event.arguments[1][0] == "SiteArg":  # convert back to actual sites
                    event.arguments[1][0] = "Site"
                # event.speculation = entityElementMap[e1].get("speculation")
                # event.negation = entityElementMap[e1].get("negation")
                stDoc.events.append(event)
            elif intType not in [
                "Protein-Component",
                "Subunit-Complex",
                "Renaming",
                "Coref",
                "SR-subunitof",
                "SR-equivto",
                "SR-partof",
                "SR-memberof",
            ]:
                # if intType == "Site" and tMap[interaction.get("e1")].type == "Entity":
                if intType == "Site":
                    # These sites are real sites (i.e. task 2 sites).
                    # Other sites are just arguments called "site"
                    # sites.append(interaction)
                    siteMap[interaction.get("e2")] = tMap[interaction.get("e1")]
                    siteScores[interaction.get("e2")] = interaction.get("predictions")
                else:
                    e1 = interaction.get("e1")
                    if eMap.has_key(e1):  # event has already been created
                        event = eMap[e1]  # eMap lists events by their trigger ids
                    else:
                        eventType = tMap[interaction.get("e1")].type
                        if eventType != "Entity":  # "Entity"-type entities are never event roots
                            event = Annotation()
                            event.trigger = tMap[interaction.get("e1")]
                            event.type = event.trigger.type
                            if hasattr(event.trigger, "eventId"):
                                event.id = event.trigger.eventId
                            eMap[e1] = event
                            if entityElementMap[e1].get("speculation") == "True":
                                event.speculation = True
                            if entityElementMap[e1].get("negation") == "True":
                                event.negation = True
                            stDoc.events.append(event)
                        else:
                            event = None
                    if event != None:
                        arg = [interaction.get("type"), interaction.get("e2"), None, interaction.get("predictions")]
                        if arg[0] == "SiteArg":  # convert back to actual sites
                            arg[0] = "Site"
                            if arg[3] != None:  # Convert also prediction strengths
                                arg[3] = arg[3].replace("SiteArg", "Site")
                        event.arguments.append(arg)
            else:  # interaction is a relation
                rel = Annotation()
                rel.type = interaction.get("type")
                e1 = interaction.get("e1")
                e2 = interaction.get("e2")
                relScores = interaction.get("predictions")
                # assert rel.type == "Protein-Component" or rel.type == "Subunit-Complex" or rel.type == "Renaming", (rel.type, stDoc.id, interaction.get("id"))
                if rel.type == "Protein-Component" or rel.type == "Subunit-Complex":
                    rel.arguments.append(["Arg1", tMap[e1], None, relScores])
                    rel.arguments.append(["Arg2", tMap[e2], None, relScores])
                elif rel.type == "Renaming":
                    rel.arguments.append(["Former", tMap[e1], None, relScores])
                    rel.arguments.append(["New", tMap[e2], None, relScores])
                elif rel.type == "Coref":
                    rel.arguments.append(["Anaphora", tMap[e1], None, relScores])
                    rel.arguments.append(["Antecedent", tMap[e2], None, relScores])
                    # Add protein arguments
                    if corefProtMap.has_key(e2):
                        for prot in corefProtMap[e2]:
                            rel.arguments.append(["Target", prot, None])
                elif rel.type.startswith("SR-"):
                    rel.arguments.append(["Arg1", tMap[e1], None, relScores])
                    rel.arguments.append(["Arg2", tMap[e2], None, relScores])
                else:
                    assert False, (rel.type, stDoc.id, interaction.get("id"))
                stDoc.relations.append(rel)
        # Map argument targets
        for event in stDoc.events:
            for arg in event.arguments[:]:
                if arg[1] == None:
                    assert False
                    continue
                id = arg[1]
                if id in eMap:  # the argument target is an event
                    arg[1] = eMap[id]
                elif id in tMap:  # the argument target is a trigger or entity
                    arg[1] = tMap[id]
                    ## Remove Entity-type triggers if they are Regulation-arguments
                    # if "egulation" in event.type and tMap[id].type != "Protein":
                    #    event.arguments.remove(arg)
                # add sites
                if id in siteMap:
                    if siteMap[id].type == "Entity":
                        assert id not in eMap
                        assert id in tMap
                        arg[2] = siteMap[id]
                        if id in siteScores and siteScores[id] != None:
                            while len(arg) < 5:  # pad the argument so the site score lands in slot 4
                                arg += [None]
                            assert arg[4] == None
                            arg[4] = siteScores[id]
                    else:
                        nonEntitySiteCount += 1
                    # assert siteMap[id].type == "Entity", (stDoc.id, event.id, id, siteMap[id].id, siteMap[id].type)
    #        # Remove eventless triggers
    #        triggersToKeep = []
    #        for trigger in stDoc.triggers:
    #            if trigger.type == "Entity":
    #                triggersToKeep.append(trigger)
    #            else:
    #                for event in stDoc.events:
    #                    if event.trigger == trigger:
    #                        triggersToKeep.append(trigger)
    #                        break
    #        stDoc.triggers = triggersToKeep
    # Sort arguments
    # for eKey in sorted(eMap.keys()):
    #    event = eMap[eKey]
    #    event.arguments.sort(cmp=compareArguments)
    # Create STFormat ids
    # updateIds(stDoc.proteins)
    # updateIds(stDoc.triggers, getMaxId(stDoc.proteins) + 1)
    # updateIds(stDoc.events)
    # updateIds(stDoc.relations)

    if nonEntitySiteCount > 0:
        print >>sys.stderr, "Warning, discarded", nonEntitySiteCount, "non-entity sites"

    if output != None:
        print >>sys.stderr, "Writing output to", output
        writeSet(
            documents,
            output,
            resultFileTag=outputTag,
            debug=debug,
            task=task,
            validate=validate,
            writeScores=writeScores,
        )
    return documents
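
The argument-resolution loop above is a two-pass pattern: arguments are first recorded with raw target ids, and only once every event exists are the ids swapped for the actual objects, with events taking priority over plain triggers. A minimal, self-contained sketch of that pattern (the Annotation class and maps below are simplified stand-ins, not the real STFormat types):

# Simplified stand-ins, not the actual STFormat classes.
class Annotation(object):
    def __init__(self, annType):
        self.type = annType
        self.arguments = []  # [argType, target, site, scores]

eMap = {"T2": Annotation("Phosphorylation")}  # events, keyed by trigger id
tMap = {"T1": Annotation("Protein"), "T2": Annotation("Phosphorylation")}

event = eMap["T2"]
event.arguments.append(["Theme", "T1", None, None])  # raw id on the first pass

# Second pass: resolve ids, preferring events over triggers as above
for arg in event.arguments:
    targetId = arg[1]
    if targetId in eMap:
        arg[1] = eMap[targetId]
    elif targetId in tMap:
        arg[1] = tMap[targetId]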
Example No. 56
0
    def buildFeatures(self, sentenceGraph, entity1, entity2, token1, token2, path):
        features = {} 
        if not self.styles["no_trigger_features"]: # F 85.52 -> 85.55
            self.triggerFeatureBuilder.setFeatureVector(features)
            self.triggerFeatureBuilder.tag = "trg1_"
            self.triggerFeatureBuilder.buildFeatures(token1)
            self.triggerFeatureBuilder.tag = "trg2_"
            self.triggerFeatureBuilder.buildFeatures(token2)
            self.triggerFeatureBuilder.setFeatureVector(None)
        # REL features
        if self.styles["rel_features"] and not self.styles["no_task"]:
            self.relFeatureBuilder.setFeatureVector(features)
            self.relFeatureBuilder.tag = "rel1_"
            self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token1))
            self.relFeatureBuilder.tag = "rel2_"
            self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token2))
            self.relFeatureBuilder.setFeatureVector(None)
        if self.styles["bacteria_renaming"] and not self.styles["no_task"]:
            self.bacteriaRenamingFeatureBuilder.setFeatureVector(features)
            self.bacteriaRenamingFeatureBuilder.buildPairFeatures(entity1, entity2)
            #self.bacteriaRenamingFeatureBuilder.buildSubstringFeatures(entity1, entity2) # decreases perf. 74.76 -> 72.41
            self.bacteriaRenamingFeatureBuilder.setFeatureVector(None)
        if self.styles["co_features"] and not self.styles["no_task"]:
            e1Offset = Range.charOffsetToSingleTuple(entity1.get("charOffset"))
            e2Offset = Range.charOffsetToSingleTuple(entity2.get("charOffset"))
            if Range.contains(e1Offset, e2Offset):
                features[self.featureSet.getId("e1_contains_e2")] = 1
                if entity2.get("given") == "True":
                    features[self.featureSet.getId("e1_contains_e2name")] = 1
            if Range.contains(e2Offset, e1Offset):
                features[self.featureSet.getId("e2_contains_e1")] = 1
                if entity1.get("given") == "True":
                    features[self.featureSet.getId("e2_contains_e1name")] = 1
        if self.styles["drugbank_features"]:
            self.drugFeatureBuilder.setFeatureVector(features)
            self.drugFeatureBuilder.tag = "ddi_"
            self.drugFeatureBuilder.buildPairFeatures(entity1, entity2)  
            if self.styles["ddi_mtmx"]:
                self.drugFeatureBuilder.buildMTMXFeatures(entity1, entity2)
            self.drugFeatureBuilder.setFeatureVector(None)
        if self.styles["graph_kernel"]:
            self.graphKernelFeatureBuilder.setFeatureVector(features, entity1, entity2)
            self.graphKernelFeatureBuilder.buildGraphKernelFeatures(sentenceGraph, path)
            self.graphKernelFeatureBuilder.setFeatureVector(None)
        if self.styles["entity_type"]:
            e1Type = self.multiEdgeFeatureBuilder.getEntityType(entity1)
            e2Type = self.multiEdgeFeatureBuilder.getEntityType(entity2)
            features[self.featureSet.getId("e1_"+e1Type)] = 1
            features[self.featureSet.getId("e2_"+e2Type)] = 1
            features[self.featureSet.getId("distance_"+str(len(path)))] = 1
        if not self.styles["no_dependency"]:
            #print "Dep features"
            self.multiEdgeFeatureBuilder.setFeatureVector(features, entity1, entity2)
            #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast
            if not self.styles["disable_entity_features"]:
                self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
            self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
            if not self.styles["disable_terminus_features"]:
                self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast
            if not self.styles["disable_single_element_features"]:
                self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, sentenceGraph)
            if not self.styles["disable_ngram_features"]:
                #print "NGrams"
                self.multiEdgeFeatureBuilder.buildPathGrams(2, path, sentenceGraph) # remove for fast
                self.multiEdgeFeatureBuilder.buildPathGrams(3, path, sentenceGraph) # remove for fast
                self.multiEdgeFeatureBuilder.buildPathGrams(4, path, sentenceGraph) # remove for fast
            #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast
            #if edges != None:
            #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast
            #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast
            if not self.styles["disable_path_edge_features"]:
                self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, sentenceGraph)
            self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
            self.multiEdgeFeatureBuilder.setFeatureVector(None)
        if self.styles["nodalida"]:
            self.nodalidaFeatureBuilder.setFeatureVector(features, entity1, entity2)
            shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(sentenceGraph.dependencyGraph, path)
            #print shortestPaths
            if len(shortestPaths) > 0:
                self.nodalidaFeatureBuilder.buildNGrams(shortestPaths, sentenceGraph)
            self.nodalidaFeatureBuilder.setFeatureVector(None)
        if self.styles["linear_features"]:
            self.tokenFeatureBuilder.setFeatureVector(features)
            token1Index = sentenceGraph.tokens.index(token1)
            token2Index = sentenceGraph.tokens.index(token2)
            linearPreTag = "linfw_"
            if token1Index > token2Index: 
                token1Index, token2Index = token2Index, token1Index
                linearPreTag = "linrv_"
            self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1")
            self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2")
            # Before, middle, after
#                self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf")
#                self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw")
#                self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af")
            # before-middle, middle, middle-after
#                    self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2)
#                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2)
#                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2)
            self.tokenFeatureBuilder.setFeatureVector(None)
        if self.styles["random"]:
            self.randomFeatureBuilder.setFeatureVector(features)
            self.randomFeatureBuilder.buildRandomFeatures(100, 0.01)
            self.randomFeatureBuilder.setFeatureVector(None)
        if self.styles["genia_features"] and not self.styles["no_task"]:
            e1Type = entity1.get("type")
            e2Type = entity2.get("type")
            assert entity1.get("given") in (None, "False")
            if entity2.get("given") == "True":
                features[self.featureSet.getId("GENIA_target_protein")] = 1
            else:
                features[self.featureSet.getId("GENIA_nested_event")] = 1
            if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization
                if entity2.get("given") == "True":
                    features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1
                else:
                    features[self.featureSet.getId("GENIA_regulation_of_event")] = 1
        if self.styles["bi_features"]:
            # Make features based on entity types
            e1Type = entity1.get("type")
            e2Type = entity2.get("type")
            e1SuperType = str(self.getBISuperType(e1Type))
            e2SuperType = str(self.getBISuperType(e2Type))
            features[self.featureSet.getId("BI_e1_"+e1Type)] = 1
            features[self.featureSet.getId("BI_e2_"+e2Type)] = 1
            features[self.featureSet.getId("BI_e1sup_"+e1SuperType)] = 1
            features[self.featureSet.getId("BI_e2sup_"+e2SuperType)] = 1
            features[self.featureSet.getId("BI_e1e2_"+e1Type+"_"+e2Type)] = 1
            features[self.featureSet.getId("BI_e1e2sup_"+e1SuperType+"_"+e2SuperType)] = 1
        if self.styles["sdb_features"]:
            e1Type = entity1.get("type")
            e2Type = entity2.get("type")
            features[self.featureSet.getId("SDB_e1_"+e1Type)] = 1
            features[self.featureSet.getId("SDB_e2_"+e2Type)] = 1
            features[self.featureSet.getId("SDB_e1e2_"+e1Type+"_"+e2Type)] = 1
            if e1Type == e2Type:
                features[self.featureSet.getId("SDB_e1e2_equal")] = 1
                features[self.featureSet.getId("SDB_e1e2_equal_" + e1Type)] = 1
            # Keep the supertypes as a collection; wrapping the result in str()
            # would iterate over the characters of its string representation
            e1SuperTypes = [str(x) for x in self.getSeeDevSuperTypes(e1Type)]
            e2SuperTypes = [str(x) for x in self.getSeeDevSuperTypes(e2Type)]
            for e1SuperType in e1SuperTypes:
                for e2SuperType in e2SuperTypes:
                    features[self.featureSet.getId("SDB_e1sup_"+e1SuperType)] = 1
                    features[self.featureSet.getId("SDB_e2sup_"+e2SuperType)] = 1
                    features[self.featureSet.getId("SDB_e1e2sup_"+e1SuperType+"_"+e2SuperType)] = 1
                    if e1SuperType == e2SuperType:
                        features[self.featureSet.getId("SDB_e1e2sup_equal")] = 1
                        features[self.featureSet.getId("SDB_e1e2sup_equal_" + e1SuperType)] = 1
        if self.styles["ontobiotope_features"]:
            self.ontobiotopeFeatureBuilder.setFeatureVector(features)
            self.ontobiotopeFeatureBuilder.buildOBOFeaturesForEntityPair(entity1, entity2)
            self.ontobiotopeFeatureBuilder.setFeatureVector(None)
        if self.styles["full_entities"]:
            e1Text = entity1.get("text").lower()
            e2Text = entity2.get("text").lower()
            features[self.featureSet.getId("FULL_e1_"+e1Text)] = 1
            features[self.featureSet.getId("FULL_e2_"+e2Text)] = 1
            for ep1 in e1Text.split():
                for ep2 in e2Text.split():
                    features[self.featureSet.getId("FULL_e1_"+ep1)] = 1
                    features[self.featureSet.getId("FULL_e2_"+ep2)] = 1
                    features[self.featureSet.getId("FULL_e1e2_"+ep1+"_"+ep2)] = 1
        if self.styles["evex"]:
            self.evexFeatureBuilder.setFeatureVector(features, entity1, entity2)
            self.evexFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph)
            self.evexFeatureBuilder.setFeatureVector(None)
        if self.styles["wordnet"]:
            self.wordNetFeatureBuilder.setFeatureVector(features, entity1, entity2)
            self.wordNetFeatureBuilder.buildFeaturesForEntityPair(token1, token2)
            self.wordNetFeatureBuilder.buildLinearFeatures(token1, sentenceGraph.tokens, tag="t1_")
            self.wordNetFeatureBuilder.buildLinearFeatures(token2, sentenceGraph.tokens, tag="t2_")
            self.wordNetFeatureBuilder.buildPathFeatures(path)
            self.wordNetFeatureBuilder.setFeatureVector(None)
        if self.styles["wordvector"]:
            self.wordVectorFeatureBuilder.setFeatureVector(features, entity1, entity2)
            self.wordVectorFeatureBuilder.buildFeatures(token1, "t1_")
            self.wordVectorFeatureBuilder.buildFeatures(token2, "t2_")
            self.wordVectorFeatureBuilder.buildLinearFeatures(token1, sentenceGraph.tokens, tag="t1_")
            self.wordVectorFeatureBuilder.buildLinearFeatures(token2, sentenceGraph.tokens, tag="t2_")
            self.wordVectorFeatureBuilder.buildPathFeatures(path)
            self.wordVectorFeatureBuilder.buildFBAFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token1), sentenceGraph.tokens.index(token2))
            self.wordVectorFeatureBuilder.setFeatureVector(None)
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder.setFeatureVector(features, entity1, entity2)
            self.giulianoFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph)
            self.giulianoFeatureBuilder.setFeatureVector(None)
        
        return features
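
All of the builders above write into a single features dict keyed by integer ids obtained from self.featureSet.getId(name). A minimal sketch of such a name-to-id registry (behavior inferred from the call sites; the real feature-id class may differ):

# Sketch of a feature-name registry as used via featureSet.getId() above;
# assumed behavior: a new integer id is assigned the first time a name is seen.
class FeatureIdSet(object):
    def __init__(self):
        self._ids = {}
    def getId(self, name):
        if name not in self._ids:
            self._ids[name] = len(self._ids) + 1
        return self._ids[name]

featureSet = FeatureIdSet()
features = {}
features[featureSet.getId("e1_Protein")] = 1
features[featureSet.getId("distance_3")] = 1
assert featureSet.getId("e1_Protein") == 1  # same name, same id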
Example No. 57
0
def exportChemProtPredictions(xml,
                              outPath,
                              fileTypes="predictions",
                              setNames=None):
    if fileTypes == "all":
        fileTypes = ["predictions", "abstracts", "entities", "relations"]
    elif isinstance(fileTypes, basestring):
        fileTypes = fileTypes.split(",")
    for fileType in fileTypes:
        if fileType not in [
                "predictions", "abstracts", "entities", "relations"
        ]:
            raise Exception("Unknown ChemProt file type '" + str(fileType) +
                            "'")
    xml = ETUtils.ETFromObj(xml)
    #with open(outPath, "wt") as f
    outFiles = {}
    openFiles = {}
    for document in xml.getiterator("document"):
        docId = document.get("origId")
        setName = document.get("set")
        if setNames != None:
            setName = setNames.get(setName, setName)
        if setName not in outFiles:
            outFiles[setName] = {}
        outFile = openOutFile(setName, outPath, "abstracts", fileTypes,
                              outFiles, openFiles)
        if outFile != None:
            docText = document.get("text")
            #assert docText.count("\t") == 1, (docText.count("\t"), document.attrib)
            #title, abstract = docText.split("\t")
            #titleLength = document.get("titleLength")
            titleOffset = Range.charOffsetToSingleTuple(
                document.get("titleOffset"))
            assert titleOffset[0] == 0
            outFile.write("\t".join([
                docId, docText[:titleOffset[1]], docText[titleOffset[1] + 1:]
            ]) + "\n")
        entityById = {}
        for entity in document.getiterator("entity"):
            outFile = openOutFile(setName, outPath, "entities", fileTypes,
                                  outFiles, openFiles)
            if outFile != None:
                eType = entity.get("type")
                if entity.get("normalized") != None and eType == "GENE":
                    eType += "-Y" if entity.get("normalized") == "True" else "-N"
                offset = Range.charOffsetToSingleTuple(
                    entity.get("charOffset"))
                outFile.write("\t".join([
                    docId,
                    entity.get("origId"), eType,
                    str(offset[0]),
                    str(offset[1]),
                    entity.get("text")
                ]) + "\n")
            assert entity.get("id") not in entityById
            entityById[entity.get("id")] = entity
        for interaction in document.getiterator("interaction"):
            e1 = entityById[interaction.get("e1")]
            e2 = entityById[interaction.get("e2")]
            outFile = openOutFile(setName, outPath, "relations", fileTypes,
                                  outFiles, openFiles)
            if outFile != None:
                evaluated = "X"
                if interaction.get("evaluated") != None:
                    evaluated = "Y " if interaction.get(
                        "evaluated") == "True" else "N "
                outFile.write("\t".join([
                    docId,
                    interaction.get("type"), evaluated,
                    interaction.get("relType"), "Arg1:" +
                    e1.get("origId"), "Arg2:" + e2.get("origId")
                ]) + "\n")
            outFile = openOutFile(setName, outPath, "predictions", fileTypes,
                                  outFiles, openFiles)
            if outFile != None:
                outFile.write("\t".join([
                    docId,
                    interaction.get("type"), "Arg1:" +
                    e1.get("origId"), "Arg2:" + e2.get("origId")
                ]) + "\n")
    print >> sys.stderr, "Closing output files"
    for f in openFiles.values():
        f.close()
    return xml
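
openOutFile is defined elsewhere in the module; judging from the call sites it returns None for file types that were not requested, and otherwise opens (and caches) one output file per set and file type. A hedged sketch of what such a helper could look like (the file-naming scheme here is an assumption):

import os

# Hedged sketch of the openOutFile helper used above; the real implementation
# and its ChemProt file-naming scheme may differ.
def openOutFile(setName, outPath, fileType, fileTypes, outFiles, openFiles):
    if fileType not in fileTypes:  # this file type was not requested
        return None
    if fileType not in outFiles[setName]:
        filePath = os.path.join(outPath, setName + "_" + fileType + ".tsv")  # assumed naming
        f = open(filePath, "wt")
        outFiles[setName][fileType] = f
        openFiles[filePath] = f  # tracked so all files can be closed at the end
    return outFiles[setName][fileType]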
Example No. 58
0
def orderTokens(token1, token2):
    # cmp-style comparator: orders two token elements by their character offsets
    offset1 = Range.charOffsetToSingleTuple(token1.get("charOffset"))
    offset2 = Range.charOffsetToSingleTuple(token2.get("charOffset"))
    return Range.order(offset1, offset2)
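
Since orderTokens is a Python 2 cmp-style comparator, it can be passed directly to sort(). A small usage sketch (assuming TEES-style token elements with charOffset attributes and the Range helpers on the path):

import xml.etree.ElementTree as ET

t1 = ET.Element("token", {"charOffset": "10-14"})
t2 = ET.Element("token", {"charOffset": "0-4"})
tokens = [t1, t2]
tokens.sort(cmp=orderTokens)  # t2 now sorts before t1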
Example No. 59
0
def mergeSentences(input, output, verbose=False):
    print >> sys.stderr, "Merging sentences into documents"
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    counts = defaultdict(int)
    for document in corpusRoot.findall("document"):
        counts["documents"] += 1
        # Check that the document has only sentence elements as children
        children = [x for x in document]
        docChildTypes = sorted(set([x.tag for x in children]))
        if len(docChildTypes) == 0:
            counts["documents-with-no-sentences"] += 1
            continue
        elif len(docChildTypes) > 1 or docChildTypes[0] != "sentence":
            raise Exception("Document '" + str(document.get("id")) +
                            "' has non-sentence children: " +
                            str(docChildTypes))
        # Process all the child sentence elements
        docId = document.get("id")
        interactions = []
        entities = []
        entityById = {}
        interactionById = {}
        combinedText = ""
        calculatedOffset = (0, 0)
        for sentence in children:
            document.remove(sentence)
            sentenceText = sentence.get("head", "") + sentence.get(
                "text", "") + sentence.get("tail", "")
            sentOffset = sentence.get("charOffset")
            if sentence == children[0]:
                noDefinedOffsets = sentOffset == None
            elif (sentOffset == None) != noDefinedOffsets:
                raise Exception("Only some sentences in document '" + docId +
                                "' have defined offsets")
            if sentOffset == None:
                if sentence != children[-1]:
                    sentenceText = sentenceText + " "
                calculatedOffset = (calculatedOffset[1],
                                    calculatedOffset[1] + len(sentenceText))
                sentOffset = calculatedOffset
            else:
                sentOffset = Range.charOffsetToSingleTuple(sentOffset)
            combinedText += sentenceText
            # Collect and update the entity elements
            for entity in sentence.findall("entity"):
                # Map sentence-level entity offsets to document level
                for offsetKey in ("charOffset", "headOffset"):
                    if entity.get(offsetKey) != None:
                        offset = Range.charOffsetToTuples(
                            entity.get(offsetKey))
                        for i in range(len(offset)):
                            offset[i] = (offset[i][0] + sentOffset[0],
                                         offset[i][1] + sentOffset[0])
                        entity.set(offsetKey, Range.tuplesToCharOffset(offset))
                # Compare mapped offsets to origOffset, if available
                if entity.get("origOffset") != None:
                    if entity.get("charOffset") != entity.get("origOffset"):
                        raise Exception(
                            "Document '" + str(document.get("id")) +
                            "' entity '" + str(entity.get("id")) +
                            "' new charOffset differs from origOffset: " +
                            str([
                                entity.get("charOffset"),
                                entity.get("origOffset")
                            ]))
                    counts["checked-origOffsets"] += 1
                    del entity.attrib["origOffset"]
                assert entity.get("id") not in entityById
                # For re-mapping the interaction 'e1' and 'e2' attributes
                entityById[entity.get("id")] = entity
                entities.append(entity)
                counts["moved-entities"] += 1
            # Collect and update the interaction elements
            for interaction in sentence.findall("interaction"):
                assert interaction.get("id") not in interactionById
                # For re-mapping the interaction 'siteOf' attributes
                interactionById[interaction.get("id")] = interaction
                interactions.append(interaction)
                counts["moved-interactions"] += 1
        # Check that the combined sentence text matches the document text, if available
        if document.get("text") != None and document.get(
                "text") != combinedText:
            if combinedText == document.get(
                    "text")[0:len(combinedText)] and document.get(
                        "text")[len(combinedText):].strip() == "":
                if verbose:
                    print >> sys.stderr, "Warning, document '" + document.get(
                        "id"
                    ) + "' text has trailing whitespace not included in the combined sentence text"
                combinedText = document.get("text")
                counts["missing-trailing-whitespace"] += 1
            else:
                raise Exception(
                    "Document '" + str(document.get("id")) +
                    "' text differs from combined sentence text: " +
                    str([document.get("text"), combinedText]))
            counts["checked-document-texts"] += 1
        # Check that the entities' texts match the document text
        for entity in entities:
            offset = Range.charOffsetToTuples(entity.get("charOffset"))
            if len(offset) == 1:  # Compare only continuous entities
                if not Range.contains((0, len(combinedText)), offset[0]):
                    raise Exception(
                        "Document '" + str(document.get("id")) + "' entity '" +
                        str(entity.get("id")) +
                        "' offset is not contained in combined sentence text: "
                        + str([
                            entity.attrib, offset, [0, len(combinedText)],
                            combinedText
                        ]))
                combTextSpan = combinedText[offset[0][0]:offset[0][1]]
                if entity.get("text") != combTextSpan:
                    raise Exception(
                        "Document '" + str(document.get("id")) + "' entity '" +
                        str(entity.get("id")) +
                        "' text does not match combined sentence text: " +
                        str([entity.get("text"), combTextSpan]))
                counts["checked-charOffsets"] += 1
        # Set the combined text as the document text
        document.set("text", combinedText)
        # Update entity and interaction ids (not done earlier so that possible error messages will refer to original ids, also because of siteOf-remapping)
        for i in range(len(entities)):
            entities[i].set("id", docId + ".e" +
                            str(i))  # Update the id for the document level
        for i in range(len(interactions)):
            interactions[i].set("id", docId + ".i" +
                                str(i))  # Update the id for the document level
        # Update interaction e1 and e2 ids (cannot be done earlier because interactions may refer to entities from multiple sentences)
        for i in range(len(interactions)):
            interaction = interactions[i]
            for entKey in ("e1", "e2"):
                interaction.set(entKey,
                                entityById[interaction.get(entKey)].get("id"))
            if interaction.get("siteOf") != None:
                interaction.set(
                    "siteOf",
                    interactionById[interaction.get("siteOf")].get("id"))
        # Add the entity and interaction elements to the document
        document.extend(entities)
        document.extend(interactions)
    print >> sys.stderr, "Counts:", dict(counts)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
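
The heart of the merge is the offset shift: a sentence-level character offset becomes document-level by adding the sentence's start position. A short usage sketch (the file names are hypothetical):

# Merge sentence elements back into whole-document elements and write the result.
mergedTree = mergeSentences("corpus-sentences.xml", "corpus-merged.xml", verbose=True)

# The remapping applied to each entity offset above: an entity at 5-9 inside a
# sentence starting at document character 100 ends up at 105-109.
sentOffset = (100, 180)
entityOffset = (5, 9)
docOffset = (entityOffset[0] + sentOffset[0], entityOffset[1] + sentOffset[0])
assert docOffset == (105, 109)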