Example #1
def makeEntityElement(ann, idCount, docEl):
    entEl = ET.Element("entity")
    entEl.set("type", ann.type)
    entEl.set("text", ann.text)
    # identifiers
    protId = docEl.get("id") + ".e" + str(idCount)
    entEl.set("id", protId)
    if ann.id != None:
        entEl.set("origId", docEl.get("origId") + "." + str(ann.id))
    # offsets
    entEl.set("charOffset", Range.tuplesToCharOffset(ann.charOffsets))
    if len(ann.alternativeOffsets) > 0:
        altOffs = []
        for alternativeOffset in ann.alternativeOffsets:
            altOffs.append( str(alternativeOffset[0]) + "-" + str(alternativeOffset[1]-1) ) 
        entEl.set("altOffset", ",".join(altOffs))
    if ann.normalization != None:
        entEl.set("normalization", ann.normalization)
    addExtraToElement(entEl, ann.extra)
    # determine if given data
    assert ann.fileType in ["a1", "a2", "rel"], ann.fileType
    if ann.fileType == "a1": #protein.isName():
        entEl.set("given", "True")
    #else:
    #    entEl.set("given", "False")
    return entEl
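
A note on the helpers: all of these snippets lean on TEES's Range module to convert between offset tuples and the "begin-end" strings stored in Interaction XML attributes. Judging from the way makeEntityElement assembles altOffset by hand, minimal stand-ins could look like the following sketch (inferred from usage, not the actual TEES implementation):

def tuplesToCharOffset(offsets, rangeSep=","):
    # Accept a single (begin, end) pair or a sequence of pairs
    if len(offsets) == 2 and isinstance(offsets[0], int):
        offsets = [offsets]
    return rangeSep.join(str(begin) + "-" + str(end) for begin, end in offsets)

def charOffsetToTuples(offsetString, rangeSep=","):
    # "10-15,20-25" -> [(10, 15), (20, 25)]
    return [tuple(int(x) for x in part.split("-")) for part in offsetString.split(rangeSep)]

def charOffsetToSingleTuple(offsetString):
    # Convenience accessor for attributes known to hold exactly one range
    return charOffsetToTuples(offsetString)[0]
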
Example #2
def fixAltOffsets(input, output=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    
    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence ("+sentence.get("id")+"): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i] 
                altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1
        
    print >> sys.stderr, "Fixed", fixCount, "altOffsets"
        
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
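
The rebasing itself is plain arithmetic: subtract the sentence's document-level start from both ends of each offset. A worked example (values invented for illustration):

sentOffset = (100, 180)    # sentence span in document coordinates
altOffsets = [(120, 127)]  # entity span in document coordinates
rebased = [(begin - sentOffset[0], end - sentOffset[0]) for begin, end in altOffsets]
print(rebased)             # [(20, 27)] -> sentence-relative
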
Example #3
def addParseElements(doc, docEl):
    if docEl.tag != "sentence":
        return
    sentAnalysesEl = ET.SubElement(docEl, "analyses")
    #parsesEl = ET.SubElement(sentAnalysesEl, "parses")
    parseEl = ET.SubElement(sentAnalysesEl, "parse")
    #tokenizationsEl = ET.SubElement(sentAnalysesEl, "tokenizations")
    tokenizationEl = ET.SubElement(sentAnalysesEl, "tokenization")
    parseEl.set("parser", "gold")
    parseEl.set("tokenizer", "gold")
    tokenizationEl.set("tokenizer", "gold")
    tokenMap = {}
    for word in doc.words:
        tokEl = ET.SubElement(tokenizationEl, "token")
        tokEl.set("id", word.id)
        tokEl.set("text", word.text)
        tokEl.set("POS", "None")
        assert len(word.charOffsets) == 1, (word, word.charOffsets)
        tokEl.set("charOffset", Range.tuplesToCharOffset(word.charOffsets))
        tokenMap[word.id] = tokEl
    for dep in doc.dependencies:
        depEl = ET.SubElement(parseEl, "dependency")
        depEl.set("id", dep.id)
        depEl.set("type", dep.type)
        assert len(dep.arguments) == 2
        depEl.set("t1", dep.arguments[0].target.id)
        depEl.set("t2", dep.arguments[1].target.id)
        if dep.type.find(":") != -1:
            word1Type, word2Type = dep.type.split("(")[0].split(":")[-1].split("-")
            tokenMap[dep.arguments[0].target.id].set("POS", word1Type)
            tokenMap[dep.arguments[1].target.id].set("POS", word2Type)
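
The closing lines recover POS tags that the gold annotation embeds in the dependency label itself. Assuming a label of the form the parsing expression implies, e.g. "conj:NN-NNS":

depType = "conj:NN-NNS"  # hypothetical gold label carrying the two tokens' POS tags
word1Type, word2Type = depType.split("(")[0].split(":")[-1].split("-")
print(word1Type + " " + word2Type)  # NN NNS
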
Example #4
def moveElements(document):
    entMap = {}
    entSentence = {}
    entSentenceIndex = {}
    sentences = document.findall("sentence")
    sentenceCount = 0
    for sentence in sentences:
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            overlaps = False
            for entityOffset in entityOffsets:
                if Range.overlap(sentenceOffset, entityOffset):
                    overlaps = True
                    break
            if overlaps:
                document.remove(entity)
                sentence.append(entity)
                entityId = entity.get("id")
                entityIdLastPart = entityId.rsplit(".", 1)[-1]
                if entityIdLastPart.startswith("e"):
                    entity.set("id", sentence.get("id") + "." + entityIdLastPart)
                    entMap[entityId] = sentence.get("id") + "." + entityIdLastPart
                else:
                    entity.set("docId", entityId)
                    entity.set("id", sentence.get("id") + ".e" + str(entCount))
                    entMap[entityId] = sentence.get("id") + ".e" + str(entCount)
                entSentence[entityId] = sentence
                entSentenceIndex[entityId] = sentenceCount
                #newEntityOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                newEntityOffsets = []
                for entityOffset in entityOffsets:
                    newEntityOffsets.append( (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0]) )
                entity.set("origOffset", entity.get("charOffset"))
                #entity.set("charOffset", str(newEntityOffset[0]) + "-" + str(newEntityOffset[1]))
                entity.set("charOffset", Range.tuplesToCharOffset(newEntityOffsets)) 
                entCount += 1
        sentenceCount += 1
    # Move interactions
    intCount = 0
    for interaction in document.findall("interaction"):
        #if entSentenceIndex[interaction.get("e1")] < entSentenceIndex[interaction.get("e2")]:
        #    targetSentence = entSentence[interaction.get("e1")]
        #else:
        #    targetSentence = entSentence[interaction.get("e2")]
        
        # An interaction is always assigned to the sentence of its e1 entity, as that is the event it is an argument of.
        # For an intersentence interaction that is a relation, this choice shouldn't matter.
        targetSentence = entSentence[interaction.get("e1")]  
        document.remove(interaction)
        targetSentence.append(interaction)
        interaction.set("id", targetSentence.get("id") + ".i" + str(intCount))
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
        intCount += 1
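
The id rewriting keeps an entity's own suffix when the last id component already looks like an entity id ("e<N>"), and otherwise assigns a fresh per-sentence counter. For example (ids invented):

entityId = "GE09.d1.e7"                 # document-level id
lastPart = entityId.rsplit(".", 1)[-1]  # 'e7'
sentenceId = "GE09.d1.s3"
print(sentenceId + "." + lastPart)      # GE09.d1.s3.e7
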
Example #5
def updateXML(root, removeAnalyses=True):
    counts = defaultdict(int)
    for document in root.findall("document"):
        sentencePos = 0
        counts["documents"] += 1
        for sentence in document.findall("sentence"):
            counts["sentences"] += 1
            # Remove the original parses
            analyses = sentence.find("sentenceanalyses")
            if analyses != None:
                counts["analyses"] += 1
                if removeAnalyses:
                    counts["removed-analyses"] += 1
                    sentence.remove(analyses)
            # Add an artificial sentence offset so that sentences can be exported as a single document
            sentenceText = sentence.get("text")
            sentence.set("charOffset", Range.tuplesToCharOffset((sentencePos, sentencePos + len(sentenceText))))
            # Update the character offsets of all entities from the old format (begin,end) to the new one (begin,end+1)
            for entity in sentence.findall("entity"):
                counts["entities"] += 1
                offsets = [(x[0], x[1] + 1) for x in Range.charOffsetToTuples(entity.get("charOffset"))]
                entityText = entity.get("text")
                for offset, entitySpan in zip(offsets, [sentenceText[x[0]:x[1]] for x in offsets]):
                    counts["entity-offsets"] += 1
                    lenOffset = offset[1] - offset[0]
                    offsetText, entityText = entityText[:lenOffset].strip(), entityText[lenOffset:].strip()
                    assert offsetText == entitySpan, (offsets, (entity.get("text"), entitySpan), (offsetText, entityText), sentenceText)
                entity.set("charOffset", Range.tuplesToCharOffset(offsets))
            # Convert positive pairs into interaction elements
            numInteractions = 0
            for pair in sentence.findall("pair"):
                counts["pairs"] += 1
                sentence.remove(pair)
                if pair.get("interaction") == "True":
                    del pair.attrib["interaction"]
                    pair.set("id", pair.get("id").rsplit(".", 1)[0] + ".i" + str(numInteractions))
                    pair.set("type", "PPI")
                    ET.SubElement(sentence, "interaction", pair.attrib)
                    numInteractions += 1
                    counts["interactions"] += 1
            sentencePos += len(sentenceText) + 1
    print >> sys.stderr, "Updated Interaction XML format:", dict(counts)
    return root
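
The offset update is the recurring end-inclusive to end-exclusive conversion; under the new convention a span can be taken directly as a Python slice:

sentenceText = "BMP receptors"
oldOffsets = [(0, 2)]                             # old format: inclusive end
newOffsets = [(b, e + 1) for b, e in oldOffsets]  # new format: exclusive end
print(sentenceText[newOffsets[0][0]:newOffsets[0][1]])  # BMP
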
Example #6
def processElements(xml):
    for ddi in xml.getiterator("ddi"):
        ddi.tag = "interaction"
    for entity in xml.getiterator("entity"):
        entity.set("given", "True")
        # Reformat disjoint character offsets and update character range format for TEES 2.0+
        charOffsets = Range.charOffsetToTuples(entity.get("charOffset"), rangeSep=";")
        updatedCharOffsets = []
        for charOffset in charOffsets:
            updatedCharOffsets.append( (charOffset[0], charOffset[1]+1) )
        entity.set("charOffset", Range.tuplesToCharOffset(updatedCharOffsets))
Example #7
def fixEntities(xml):
    counts = defaultdict(int)
    for sentence in xml.getiterator("sentence"):
        sText = sentence.get("text")
        for entity in sentence.findall("entity"):
            charOffset = entity.get("charOffset")
            if charOffset == "-":
                assert False, str(entity)
                sentence.remove(entity)
                counts["removed-invalid"] += 1
            else:
                charOffset = Range.charOffsetToSingleTuple(charOffset)
                # fix length
                realLength = len(entity.get("text"))
                lenDiff = (charOffset[1] - charOffset[0] + 1) - realLength
                if lenDiff != 0:
                    counts["incorrect-ent-offset"] += 1
                    counts["incorrect-ent-offset-diff"+str(lenDiff)] += 1
                    if abs(lenDiff) > 2:
                        print "Warning, lenDiff:", (lenDiff, charOffset, sText, entity.get("text"), entity.get("id"))
                charOffset = (charOffset[0], charOffset[0] + realLength)
                # find starting position
                entIndex = sText.find(entity.get("text"), charOffset[0])
                if entIndex == -1:
                    for i in [-1,-2,-3]:
                        entIndex = sText.find(entity.get("text"), charOffset[0]+i)
                        if entIndex != -1:
                            break
                if entIndex != 0: # could be lowercase
                    sTextLower = sText.lower()
                    for i in [0,-1,-2,-3]:
                        lowerEntIndex = sTextLower.find(entity.get("text"), charOffset[0]+i)
                        if lowerEntIndex != -1:
                            break
                    if lowerEntIndex != -1 and abs(lowerEntIndex - charOffset[0]) < abs(entIndex - charOffset[0]):
                        entIndex = lowerEntIndex
                assert entIndex != -1, (charOffset, sText, entity.get("text"), entity.get("id"))
                indexDiff = entIndex - charOffset[0]
                if indexDiff != 0:
                    counts["incorrect-ent-index"] += 1
                    counts["incorrect-ent-index-diff"+str(indexDiff)] += 1
                    print "Warning, indexDiff:", (indexDiff, charOffset, sText, entity.get("text"), entity.get("id"))
                # move offset       
                charOffset = (charOffset[0]+indexDiff, charOffset[1]+indexDiff)
                # validate new offset
                sEntity = sText[charOffset[0]:charOffset[1]]
                assert sEntity == entity.get("text") or sEntity.lower() == entity.get("text"), (charOffset, sText, entity.get("text"), entity.get("id"))
                entity.set("charOffset", Range.tuplesToCharOffset( (charOffset[0], charOffset[1])))
                entity.set("given", "True")
        for interaction in sentence.findall("interaction"):
            interaction.set("type", "DDI")
    print "Fix counts:", counts
Example #8
def makeDDI13SubmissionFile(input, output, mode="interactions", idfilter=None):
    xml = ETUtils.ETFromObj(input)
    outFile = open(output, "wt")
    for sentence in xml.getiterator("sentence"):
        sentenceId = sentence.get("id")
        if idfilter != None and idfilter not in sentenceId:
            continue
        # Output entities
        if mode == "entities":
            for entity in sentence.findall("entity"):
                if entity.get("type") != "neg":
                    outFile.write(sentenceId)
                    offsets = Range.charOffsetToTuples(entity.get("charOffset"))
                    for i in range(len(offsets)):
                        offsets[i] = (offsets[i][0], offsets[i][1]-1)
                    outFile.write("|" + Range.tuplesToCharOffset(offsets, rangeSep=";"))
                    outFile.write("|" + entity.get("text"))
                    outFile.write("|" + entity.get("type"))
                    outFile.write("\n")    
        if mode == "interactions":
            # First determine which pairs interact
            intMap = defaultdict(lambda:defaultdict(lambda:None))
            for interaction in sentence.findall("interaction"):
                # Map both directions so that edge directionality is ignored. This isn't strictly needed,
                # since MultiEdgeExampleBuilder builds entity pairs in the same order as this function,
                # but it does no harm and makes the lookup work regardless of pair direction.
                if interaction.get("type") != "neg" and interaction.get("given") != "True":
                    intMap[interaction.get("e1")][interaction.get("e2")] = interaction
                    intMap[interaction.get("e2")][interaction.get("e1")] = interaction
            # Then write all pairs to the output file
            entities = sentence.findall("entity")
            for i in range(0, len(entities)-1):
                for j in range(i+1, len(entities)):
                    eIId = entities[i].get("id")
                    eJId = entities[j].get("id")
                    outFile.write(sentenceId + "|" + eIId + "|" + eJId + "|")
                    if intMap[eIId][eJId] != None:
                        interaction = intMap[eIId][eJId]
                        assert interaction.get("type") != "neg"
                        outFile.write("1|" + interaction.get("type") + "\n")
                    else:
                        outFile.write("0|null\n")
    outFile.close()
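
In "interactions" mode each output line is pipe-separated: sentence id, the two entity ids, a 0/1 interaction flag, and the interaction type or "null". Two illustrative lines (ids invented):

DDI-DrugBank.d10.s0|DDI-DrugBank.d10.s0.e0|DDI-DrugBank.d10.s0.e1|1|effect
DDI-DrugBank.d10.s0|DDI-DrugBank.d10.s0.e0|DDI-DrugBank.d10.s0.e2|0|null
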
Example #9
def convert(metamapEl, sentenceEl):
    """
    Convert MetaMap XML into phrase-elements
    """
    newMetamapEl = ET.Element("metamap") # make a new metamap element
    utteranceCount = 0
    for utterance in metamapEl.getiterator("Utterance"): # process all utterances (sentences)
        utteranceCount += 1
        #print "UT:", utterance.find("UttText").text
        uttOffsetBegin = int(utterance.find("UttStartPos").text)
        for phrase in utterance.getiterator("Phrase"): # process all phrases for each utterance
            #print "Phrase:", phrase.find("PhraseText").text
            phraseEl = ET.Element("phrase")
            phraseOffset = [int(phrase.find("PhraseStartPos").text), int(phrase.find("PhraseStartPos").text) + int(phrase.find("PhraseLength").text)]
            phraseOffset = [phraseOffset[0] - uttOffsetBegin, phraseOffset[1] - uttOffsetBegin]
            phraseEl.set("charOffset", Range.tuplesToCharOffset(phraseOffset))
            phraseEl.set("text", phrase.find("PhraseText").text)
            for candidate in phrase.getiterator("Candidate"): # process first candidate of each phrase
                phraseEl.set("score", candidate.find("CandidateScore").text)
                phraseEl.set("cui", candidate.find("CandidateCUI").text)
                phraseEl.set("matched", candidate.find("CandidateMatched").text)
                phraseEl.set("preferred", candidate.find("CandidatePreferred").text)
                semTypes = set()
                for semType in candidate.getiterator("SemType"):
                    semTypes.add(semType.text)
                phraseEl.set("semTypes", ",".join(sorted(list(semTypes))))
                sources = set()
                for source in candidate.getiterator("Source"):
                    sources.add(source.text)
                phraseEl.set("sources", ",".join(sorted(list(sources))))
                break
            if phraseEl.get("matched") != None: # include only matched phrases as new elements
                newMetamapEl.append(phraseEl)
            #print ET.tostring(phraseEl, "utf-8")
    
    if utteranceCount > 1:
        print >> sys.stderr, "Warning, sentence", sentenceEl.get("id"), "has", utteranceCount, "utterances"
    return newMetamapEl
Example #10
def mergeSentences(input, output, verbose=False):
    print >> sys.stderr, "Merging sentences into documents"
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    counts = defaultdict(int)
    for document in corpusRoot.findall("document"):
        counts["documents"] += 1
        # Check that the document has only sentence elements as children
        children = [x for x in document]
        docChildTypes = sorted(set([x.tag for x in children]))
        if len(docChildTypes) == 0:
            counts["documents-with-no-sentences"] += 1
            continue
        elif len(docChildTypes) > 1 or docChildTypes[0] != "sentence":
            raise Exception("Document '" + str(document.get("id")) +
                            "' has non-sentence children: " +
                            str(docChildTypes))
        # Process all the child sentence elements
        docId = document.get("id")
        interactions = []
        entities = []
        entityById = {}
        interactionById = {}
        combinedText = ""
        calculatedOffset = (0, 0)
        for sentence in children:
            document.remove(sentence)
            sentenceText = sentence.get("head", "") + sentence.get(
                "text", "") + sentence.get("tail", "")
            sentOffset = sentence.get("charOffset")
            if sentence == children[0]:
                noDefinedOffsets = sentOffset == None
            elif (sentOffset == None) != noDefinedOffsets:
                raise Exception("Only some sentences in document '" + docId +
                                "' have defined offsets")
            if sentOffset == None:
                if sentence != children[-1]:
                    sentenceText = sentenceText + " "
                calculatedOffset = (calculatedOffset[1],
                                    calculatedOffset[1] + len(sentenceText))
                sentOffset = calculatedOffset
            else:
                sentOffset = Range.charOffsetToSingleTuple(sentOffset)
            combinedText += sentenceText
            # Collect and update the entity elements
            for entity in sentence.findall("entity"):
                # Map sentence-level entity offsets to document level
                for offsetKey in ("charOffset", "headOffset"):
                    if entity.get(offsetKey) != None:
                        offset = Range.charOffsetToTuples(
                            entity.get(offsetKey))
                        for i in range(len(offset)):
                            offset[i] = (offset[i][0] + sentOffset[0],
                                         offset[i][1] + sentOffset[0])
                        entity.set(offsetKey, Range.tuplesToCharOffset(offset))
                # Compare mapped offsets to origOffset, if available
                if entity.get("origOffset") != None:
                    if entity.get("charOffset") != entity.get("origOffset"):
                        raise Exception(
                            "Document '" + str(document.get("id")) +
                            "' entity '" + str(entity.get("id")) +
                            "' new charOffset differs from origOffset: " +
                            str([
                                entity.get("charOffset"),
                                entity.get("origOffset")
                            ]))
                    counts["checked-origOffsets"] += 1
                    del entity.attrib["origOffset"]
                assert entity.get("id") not in entityById
                entityById[entity.get("id")] = entity  # For re-mapping the interaction 'e1' and 'e2' attributes
                entities.append(entity)
                counts["moved-entities"] += 1
            # Collect and update the interaction elements
            for interaction in sentence.findall("interaction"):
                assert interaction.get("id") not in interactionById
                interactionById[interaction.get("id")] = interaction  # For re-mapping the interaction 'siteOf' attributes
                interactions.append(interaction)
                counts["moved-interactions"] += 1
        # Check that the combined sentence text matches the document text, if available
        if document.get("text") != None and document.get(
                "text") != combinedText:
            if combinedText == document.get(
                    "text")[0:len(combinedText)] and document.get(
                        "text")[len(combinedText):].strip() == "":
                if verbose:
                    print >> sys.stderr, "Warning, document '" + document.get(
                        "id"
                    ) + "' text has trailing whitespace not included in the combined sentence text"
                combinedText = document.get("text")
                counts["missing-trailing-whitespace"] += 1
            else:
                raise Exception(
                    "Document '" + str(document.get("id")) +
                    "' text differs from combined sentence text: " +
                    str([document.get("text"), combinedText]))
            counts["checked-document-texts"] += 1
        # Check that the entities' texts match the document text
        for entity in entities:
            offset = Range.charOffsetToTuples(entity.get("charOffset"))
            if len(offset) == 1:  # Compare only continuous entities
                if not Range.contains((0, len(combinedText)), offset[0]):
                    raise Exception(
                        "Document '" + str(document.get("id")) + "' entity '" +
                        str(entity.get("id")) +
                        "' offset is not contained in combined sentence text: "
                        + str([
                            entity.attrib, offset, [0, len(combinedText)],
                            combinedText
                        ]))
                combTextSpan = combinedText[offset[0][0]:offset[0][1]]
                if entity.get("text") != combTextSpan:
                    raise Exception(
                        "Document '" + str(document.get("id")) + "' entity '" +
                        str(entity.get("id")) +
                        "' text does not match combined sentence text: " +
                        str([entity.get("text"), combTextSpan]))
                counts["checked-charOffsets"] += 1
        # Set the combined text as the document text
        document.set("text", combinedText)
        # Update entity and interaction ids (not done earlier so that possible error messages will refer to original ids, also because of siteOf-remapping)
        for i in range(len(entities)):
            entities[i].set("id", docId + ".e" + str(i))  # Update the id for the document level
        for i in range(len(interactions)):
            interactions[i].set("id", docId + ".i" + str(i))  # Update the id for the document level
        # Update interaction e1 and e2 ids (cannot be done earlier because interactions may refer to entities from multiple sentences)
        for i in range(len(interactions)):
            interaction = interactions[i]
            for entKey in ("e1", "e2"):
                interaction.set(entKey,
                                entityById[interaction.get(entKey)].get("id"))
            if interaction.get("siteOf") != None:
                interaction.set(
                    "siteOf",
                    interactionById[interaction.get("siteOf")].get("id"))
        # Add the entity and interaction elements to the document
        document.extend(entities)
        document.extend(interactions)
    print >> sys.stderr, "Counts:", dict(counts)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example #11
def moveElements(document):
    entMap = {}
    entSentence = {}
    entSentenceIndex = {}
    sentences = document.findall("sentence")
    sentenceCount = 0
    for sentence in sentences:
        sentenceOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            overlaps = False
            for entityOffset in entityOffsets:
                if Range.overlap(sentenceOffset, entityOffset):
                    overlaps = True
                    break
            if overlaps:
                document.remove(entity)
                sentence.append(entity)
                entityId = entity.get("id")
                entityIdLastPart = entityId.rsplit(".", 1)[-1]
                if entityIdLastPart.startswith("e"):
                    entity.set("id", sentence.get("id") + "." + entityIdLastPart)
                    entMap[entityId] = sentence.get("id") + "." + entityIdLastPart
                else:
                    entity.set("docId", entityId)
                    entity.set("id", sentence.get("id") + ".e" + str(entCount))
                    entMap[entityId] = sentence.get("id") + ".e" + str(entCount)
                entSentence[entityId] = sentence
                entSentenceIndex[entityId] = sentenceCount
                #newEntityOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                newEntityOffsets = []
                for entityOffset in entityOffsets:
                    newOffset = (entityOffset[0] - sentenceOffset[0], entityOffset[1] - sentenceOffset[0])
                    newOffset = (max(0, newOffset[0]), max(0, newOffset[1]))
                    if newOffset != (0, 0):
                        assert newOffset[1] > newOffset[0], (entity.attrib, entityOffsets, sentenceOffset)
                        newEntityOffsets.append(newOffset)
                assert len(newEntityOffsets) > 0, (entity.attrib, entityOffsets, sentenceOffset)
                entity.set("origOffset", entity.get("charOffset"))
                #entity.set("charOffset", str(newEntityOffset[0]) + "-" + str(newEntityOffset[1]))
                entity.set("charOffset", Range.tuplesToCharOffset(newEntityOffsets)) 
                entCount += 1
        sentenceCount += 1
    if len([x for x in document.findall("entity")]) != 0:
        raise Exception("Sentence splitting does not cover the entire document")
    # Move interactions
    intCount = 0
    interactions = []
    interactionOldToNewId = {}
    for interaction in document.findall("interaction"):
        interactions.append(interaction)
        #if entSentenceIndex[interaction.get("e1")] < entSentenceIndex[interaction.get("e2")]:
        #    targetSentence = entSentence[interaction.get("e1")]
        #else:
        #    targetSentence = entSentence[interaction.get("e2")]
        
        # An interaction is always assigned to the sentence of its e1 entity, as that is the event it is an argument of.
        # For an intersentence interaction that is a relation, this choice shouldn't matter.
        targetSentence = entSentence[interaction.get("e1")]  
        document.remove(interaction)
        targetSentence.append(interaction)
        newId = targetSentence.get("id") + ".i" + str(intCount)
        interactionOldToNewId[interaction.get("id")] = newId
        interaction.set("id", newId)
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
        intCount += 1
    for interaction in interactions:
        if interaction.get("siteOf") != None:
            interaction.set("siteOf", interactionOldToNewId[interaction.get("siteOf")])
Example #12
def extend(input, output=None, entityTypes=["Bacterium"], verbose=False):
    if not (ET.iselement(input) and input.tag == "sentence"):
        print >> sys.stderr, "Loading corpus file", input
        corpusTree = ETUtils.ETFromObj(input)
        corpusRoot = corpusTree.getroot()
    
    bacteriaTokens = ExampleBuilders.PhraseTriggerExampleBuilder.getBacteriaTokens()
    
    if not (ET.iselement(input) and input.tag == "sentence"):
        sentences = corpusRoot.getiterator("sentence")
    else:
        sentences = [input]
    counts = defaultdict(int)
    for sentence in sentences:
        incorrectCount = 0
        sentenceText = sentence.get("text")
        tokens = tokenize(sentenceText)
        for entity in sentence.findall("entity"):
            counts["all-entities"] += 1
            if entity.get("type") not in entityTypes:
                continue
            headOffset = entity.get("headOffset")
            if headOffset == None:
                if verbose: print "WARNING, no head offset for entity", entity.get("id")
                headOffset = entity.get("charOffset")
            headOffset = Range.charOffsetToTuples(headOffset)[0]
            charOffset = entity.get("charOffset")
            assert charOffset != None, "WARNING, no charOffset for entity " + str(entity.get("id"))
            charOffset = Range.charOffsetToTuples(charOffset)[0]
            tokPos = [0,0]
            tokIndex = None
            # find main token
            for i in range(len(tokens)):
                token = tokens[i]
                tokPos[1] = tokPos[0] + len(token) # - 1
                if Range.overlap(headOffset, tokPos):
                    tokIndex = i
                    break
                tokPos[0] += len(token)
            assert tokIndex != None, (entity.get("id"), entity.get("text"), tokens)
            skip = False
            if tokPos[0] < headOffset[0]:
                tokPos = headOffset
                skip = True
            if not skip:
                # Extend before
                beginIndex = tokIndex
                for i in range(tokIndex-1, -1, -1):
                    token = tokens[i]
                    if token.isspace():
                        continue
                    if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
                        beginIndex = i + 1
                        break
                    if i == 0:
                        beginIndex = i
                while tokens[beginIndex].isspace() or isExtraWord(tokens[beginIndex], toLower=False):
                    beginIndex += 1
                    if beginIndex >= tokIndex:
                        beginIndex = tokIndex
                        break
                # Extend after
                endIndex = tokIndex
                if tokens[tokIndex][-1] != ",":
                    endIndex = tokIndex
                    for i in range(tokIndex+1, len(tokens)):
                        token = tokens[i]
                        if token.isspace():
                            continue
                        if not isBacteriaToken(token, bacteriaTokens, i - tokIndex):
                            endIndex = i - 1
                            break
                        if i == len(tokens) - 1:
                            endIndex = i
                    while tokens[endIndex].isspace():
                        endIndex -= 1
                # Modify range
                if tokIndex > beginIndex:
                    for token in reversed(tokens[beginIndex:tokIndex]):
                        tokPos[0] -= len(token)
                if tokIndex < endIndex:
                    for token in tokens[tokIndex+1:endIndex+1]:
                        tokPos[1] += len(token)
                # Attempt to remove trailing periods and commas
                while not sentenceText[tokPos[1] - 1].isalnum():
                    tokPos[1] -= 1
                    if tokPos[1] < tokPos[0] + 1:
                        tokPos[1] = tokPos[0] + 1
                        break
                while not sentenceText[tokPos[0]].isalnum():
                    tokPos[0] += 1
                    if tokPos[0] >= tokPos[1]:
                        tokPos[0] = tokPos[1] - 1
                        break
                # Split merged names
                #newPos = [tokPos[0], tokPos[1]]
                #for split in sentenceText[tokPos[0]:tokPos[1]+1].split("/"):
                #    newPos[0] += len(split)
                #    if                 
            # Insert changed charOffset
            counts["entities"] += 1
            newOffset = tuple(tokPos)
            newOffsetString = Range.tuplesToCharOffset([newOffset])
            if verbose:
                print "Entity", entity.get("id"), 
                #print [entity.get("text"), sentenceText[headOffset[0]:headOffset[1]+1], sentenceText[newOffset[0]:newOffset[1]+1]],
                print [entity.get("text"), sentenceText[headOffset[0]:headOffset[1]], sentenceText[newOffset[0]:newOffset[1]]], 
                print [entity.get("charOffset"), entity.get("headOffset"), newOffsetString], "Sent:", len(sentence.get("text")),
            if newOffset != headOffset:
                counts["extended"] += 1
                if verbose: print "EXTENDED",
            if newOffset == charOffset:
                counts["correct"] += 1
                if verbose: print "CORRECT"
            else:
                counts["incorrect"] += 1
                incorrectCount += 1
                if verbose: print "INCORRECT"
            entity.set("charOffset", newOffsetString)
            #entity.set("text", sentenceText[newOffset[0]:newOffset[1]+1])
            entity.set("text", sentenceText[newOffset[0]:newOffset[1]])
        if incorrectCount > 0 and verbose:
            print "TOKENS:", "|".join(tokens)
            print "--------------------------------"
    if verbose:
        print counts
    
    if not (ET.iselement(input) and input.tag == "sentence"):
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusRoot, output)
        return corpusTree                    
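
The "find main token" loop slides a (begin, end) character window across the concatenated tokens until it overlaps the entity's head offset. A condensed illustration, with Range.overlap assumed to test half-open interval overlap:

tokens = ["Escherichia", " ", "coli", " ", "cells"]
headOffset = (12, 16)  # 'coli'
tokPos = [0, 0]
for i, token in enumerate(tokens):
    tokPos[1] = tokPos[0] + len(token)
    if tokPos[0] < headOffset[1] and headOffset[0] < tokPos[1]:  # sketch of Range.overlap
        print(str(i) + ": " + token)  # 2: coli
        break
    tokPos[0] += len(token)
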
Example #13
def convertChemProt(inDirs=None, setNames=None, outPath=None, goldTestSet=True, downloadDir=None, extractDir=None, redownload=False, debug=False):
    tempDir = None
    if inDirs == None:
        print >> sys.stderr, "---------------", "Downloading ChemProt files", "---------------"
        if extractDir == None:
            tempDir = tempfile.mkdtemp()
        inDirs = []
        for setName in ("TRAIN", "DEVEL", "TEST"):
            if goldTestSet and setName == "TEST":
                setName = "TEST_GOLD"
            if Settings.URL["CP17_" + setName] != None:
                currentExtractDir = extractDir if extractDir else tempDir
                currentExtractDir = os.path.join(currentExtractDir, setName.lower())
                inDirs.append(downloadFile(Settings.URL["CP17_" + setName], downloadDir, currentExtractDir, redownload))
    print >> sys.stderr, "Reading ChemProt corpus from input", inDirs, "using dataset mapping", setNames
    dataSets = OrderedDict()
    for inDir in inDirs:
        print >> sys.stderr, "Reading input directory", inDir
        filenames = os.listdir(inDir)
        filetypes = ["_abstracts", "_entities", "_relations"]
        # Collect the file paths for the data types
        dirDataSets = set()
        for filename in filenames:
            if not (filename.endswith(".tsv") and any([x in filename for x in filetypes])):
                continue
            dataSetId, dataType = filename.replace("_gs", "").rsplit("_", 1)
            if setNames != None:
                dataSetId = setNames.get(dataSetId, dataSetId)
            dirDataSets.add(dataSetId)
            dataType = dataType.split(".")[0]
            if dataSetId not in dataSets:
                dataSets[dataSetId] = {}
            assert dataType not in dataSets[dataSetId]
            dataSets[dataSetId][dataType] = os.path.join(inDir, filename)
        print >> sys.stderr, "Found ChemProt datasets", list(dirDataSets), "at", inDir
    print >> sys.stderr, "Read datasets:", dataSets.keys()
    # Build the Interaction XML
    print >> sys.stderr, "Converting to Interaction XML"
    corpusName = "CP17"
    corpus = ET.Element("corpus", {"source":corpusName})
    counts = defaultdict(int)
    docById = {}
    entityById = {}
    entitiesByDoc = {}
    docsWithErrors = set()
    for dataSetId in sorted(dataSets.keys()):
        prevCounts = copy.copy(counts)
        print >> sys.stderr, "---", "Building elements for dataset", dataSetId, "---"
        dataSet = dataSets[dataSetId]
        counts["sets"] += 1
        with open(dataSet["abstracts"], "rt") as f:
            print >> sys.stderr, "Adding document elements for dataset", dataSetId
            for row in UnicodeDictReader(f, delimiter="\t", fieldnames=["id", "title", "abstract"], quoting=csv.QUOTE_NONE):
                document = ET.Element("document", {"id":corpusName + ".d" + str(counts["documents"]), "origId":row["id"], "set":dataSetId})
                document.set("text", row["title"] + " " + row["abstract"])
                document.set("titleOffset", Range.tuplesToCharOffset((0, len(row["title"]))))
                if document.get("origId") in docById:
                    assert document.get("text") == docById[document.get("origId")].get("text")
                    assert document.get("titleOffset") == docById[document.get("origId")].get("titleOffset")
                    counts["duplicate-documents"] += 1
                else:
                    corpus.append(document)
                    docById[document.get("origId")] = document
                    counts["documents"] += 1
        with open(dataSet["entities"], "rt") as f:
            print >> sys.stderr, "Adding entity elements for dataset", dataSetId
            for row in UnicodeDictReader(f, delimiter="\t", fieldnames=["docId", "id", "type", "begin", "end", "text"], quoting=csv.QUOTE_NONE):
                document = docById[row["docId"]]
                assert row["type"] in ("CHEMICAL", "GENE-Y", "GENE-N")
                # Check for duplicate entities
                if row["docId"] not in entitiesByDoc:
                    entitiesByDoc[row["docId"]] = set()
                assert row["id"] not in entitiesByDoc[row["docId"]]
                entitiesByDoc[row["docId"]].add(row["id"])
                # Determine the offset
                offset = (int(row["begin"]), int(row["end"]))
                docSpan = document.get("text")[offset[0]:offset[1]]
                if docSpan == row["text"]:
                    entity = ET.SubElement(document, "entity", {"id":document.get("id") + ".e" + str(len([x for x in document.findall("entity")]))})
                    entity.set("given", "True")
                    entity.set("origId", row["id"])
                    entity.set("type", row["type"].split("-")[0])
                    entity.set("normalized", "True" if row["type"].endswith("-Y") else "False")
                    entity.set("charOffset", Range.tuplesToCharOffset((offset[0], offset[1])))
                    entity.set("text", row["text"])
                    if row["docId"] not in entityById:
                        entityById[row["docId"]] = {}
                    assert entity.get("origId") not in entityById[row["docId"]]
                    entityById[row["docId"]][entity.get("origId")] = entity
                    counts["entities"] += 1
                else:
                    print >> sys.stderr, "Alignment error in document", row["docId"], (offset, docSpan, row)
                    counts["entities-error"] += 1
                    docsWithErrors.add(row["docId"])
        if "relations" in dataSet:
            print >> sys.stderr, "Adding relation elements for dataset", dataSetId
            with open(dataSet["relations"], "rt") as f:
                for row in UnicodeDictReader(f, delimiter="\t", fieldnames=["docId", "group", "groupEval", "type", "arg1", "arg2"], quoting=csv.QUOTE_NONE):
                    for argId in ("1", "2"):
                        assert row["arg" + argId].startswith("Arg" + argId + ":")
                        row["arg" + argId] = row["arg" + argId][5:]
                    document = docById[row["docId"]]
                    e1 = entityById[row["docId"]].get(row["arg1"])
                    e2 = entityById[row["docId"]].get(row["arg2"])
                    if e1 != None and e2 != None:
                        interaction = ET.SubElement(document, "interaction", {"id":document.get("id") + ".i" + str(len([x for x in document.findall("interaction")]))})
                        interaction.set("directed", "True")
                        interaction.set("type", row["group"])
                        interaction.set("relType", row["type"])
                        row["groupEval"] = row["groupEval"].strip()
                        assert row["groupEval"] in ("Y", "N")
                        interaction.set("evaluated", "True" if row["groupEval"] == "Y" else "False")
                        interaction.set("e1", e1.get("id"))
                        interaction.set("e2", e2.get("id"))
                        counts["interactions"] += 1
                    else:
                        counts["interaction-error"] += 1
                        docsWithErrors.add(row["docId"])
        else:
            print >> sys.stderr, "No relations for dataset", dataSetId
        print >> sys.stderr, "dataset", dataSetId, {x:counts[x] - prevCounts.get(x, 0) for x in counts if counts[x] - prevCounts.get(x, 0) > 0}
    if len(docsWithErrors) > 0:
        counts["documents-with-errors"] = len(docsWithErrors)
    print >> sys.stderr, "---", "All Datasets Done", "---"
    print >> sys.stderr, "ChemProt conversion:", dict(counts)
    if tempDir != None and not debug:
        print >> sys.stderr, "Removing temporary directory", tempDir
        shutil.rmtree(tempDir)
    if outPath != None:
        ETUtils.write(corpus, outPath)
    return ET.ElementTree(corpus)
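
The relation rows name their arguments as "Arg1:<entity id>" and "Arg2:<entity id>"; stripping the five-character prefix leaves the per-document entity origId used for the lookup (id invented):

arg1 = "Arg1:T12"
assert arg1.startswith("Arg1:")
print(arg1[5:])  # T12
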
Exemplo n.º 19
0
def extend(input, output=None, entityTypes=["Bacterium"], verbose=False):
    if not (ET.iselement(input) and input.tag == "sentence"):
        print >> sys.stderr, "Loading corpus file", input
        corpusTree = ETUtils.ETFromObj(input)
        corpusRoot = corpusTree.getroot()

    bacteriaTokens = ExampleBuilders.PhraseTriggerExampleBuilder.getBacteriaTokens(
    )

    if not (ET.iselement(input) and input.tag == "sentence"):
        sentences = corpusRoot.getiterator("sentence")
    else:
        sentences = [input]
    counts = defaultdict(int)
    for sentence in sentences:
        incorrectCount = 0
        sentenceText = sentence.get("text")
        tokens = tokenize(sentenceText)
        for entity in sentence.findall("entity"):
            counts["all-entities"] += 1
            if entity.get("type") not in entityTypes:
                continue
            headOffset = entity.get("headOffset")
            if headOffset == None:
                if verbose:
                    print "WARNING, no head offset for entity", entity.get(
                        "id")
                headOffset = entity.get("charOffset")
            headOffset = Range.charOffsetToTuples(headOffset)[0]
            charOffset = entity.get("charOffset")
            assert charOffset != None, "WARNING, no head offset for entity " + str(
                entity.get("id"))
            charOffset = Range.charOffsetToTuples(charOffset)[0]
            tokPos = [0, 0]
            tokIndex = None
            # find main token
            for i in range(len(tokens)):
                token = tokens[i]
                tokPos[1] = tokPos[0] + len(token)  # - 1
                if Range.overlap(headOffset, tokPos):
                    tokIndex = i
                    break
                tokPos[0] += len(token)
            assert tokIndex != None, (entity.get("id"), entity.get("text"),
                                      tokens)
            skip = False
            if tokPos[0] < headOffset[0]:
                tokPos = headOffset
                skip = True
            if not skip:
                # Extend before
                beginIndex = tokIndex
                for i in range(tokIndex - 1, -1, -1):
                    token = tokens[i]
                    if token.isspace():
                        continue
                    if not isBacteriaToken(token, bacteriaTokens,
                                           i - tokIndex):
                        beginIndex = i + 1
                        break
                    if i == 0:
                        beginIndex = i
                while tokens[beginIndex].isspace() or isExtraWord(
                        tokens[beginIndex], toLower=False):
                    beginIndex += 1
                    if beginIndex >= tokIndex:
                        beginIndex = tokIndex
                        break
                # Extend after
                endIndex = tokIndex
                if tokens[tokIndex][-1] != ",":
                    endIndex = tokIndex
                    for i in range(tokIndex + 1, len(tokens)):
                        token = tokens[i]
                        if token.isspace():
                            continue
                        if not isBacteriaToken(token, bacteriaTokens,
                                               i - tokIndex):
                            endIndex = i - 1
                            break
                        if i == len(tokens) - 1:
                            endIndex = i
                    while tokens[endIndex].isspace():
                        endIndex -= 1
                # Modify range
                if tokIndex > beginIndex:
                    for token in reversed(tokens[beginIndex:tokIndex]):
                        tokPos[0] -= len(token)
                if tokIndex < endIndex:
                    for token in tokens[tokIndex + 1:endIndex + 1]:
                        tokPos[1] += len(token)
                # Attempt to remove trailing periods and commas
                while not sentenceText[tokPos[1] - 1].isalnum():
                    tokPos[1] -= 1
                    if tokPos[1] < tokPos[0] + 1:
                        tokPos[1] = tokPos[0] + 1
                        break
                while not sentenceText[tokPos[0]].isalnum():
                    tokPos[0] += 1
                    if tokPos[0] >= tokPos[1]:
                        tokPos[0] = tokPos[1] - 1
                        break
                # Split merged names
                #newPos = [tokPos[0], tokPos[1]]
                #for split in sentenceText[tokPos[0]:tokPos[1]+1].split("/"):
                #    newPos[0] += len(split)
                #    if
            # Insert changed charOffset
            counts["entities"] += 1
            newOffset = tuple(tokPos)
            newOffsetString = Range.tuplesToCharOffset([newOffset])
            if verbose:
                print "Entity", entity.get("id"),
                #print [entity.get("text"), sentenceText[headOffset[0]:headOffset[1]+1], sentenceText[newOffset[0]:newOffset[1]+1]],
                print[
                    entity.get("text"),
                    sentenceText[headOffset[0]:headOffset[1]],
                    sentenceText[newOffset[0]:newOffset[1]]
                ],
                print[
                    entity.get("charOffset"),
                    entity.get("headOffset"), newOffsetString
                ], "Sent:", len(sentence.get("text")),
            if newOffset != headOffset:
                counts["extended"] += 1
                if verbose: print "EXTENDED",
            if newOffset == charOffset:
                counts["correct"] += 1
                if verbose: print "CORRECT"
            else:
                counts["incorrect"] += 1
                incorrectCount += 1
                if verbose: print "INCORRECT"
            entity.set("charOffset", newOffsetString)
            entity.set("text", sentenceText[newOffset[0]:newOffset[1]])
        if incorrectCount > 0 and verbose:
            print "TOKENS:", "|".join(tokens)
            print "--------------------------------"
    if verbose:
        print counts

    if not (ET.iselement(input) and input.tag == "sentence"):
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusRoot, output)
        return corpusTree
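
# A minimal standalone sketch (not part of the original example) showing the
# punctuation-trimming loops used above on a concrete string; the sentence
# and the initial span are illustrative values only.
sentenceText = "Infection by B. subtilis, was confirmed."
tokPos = [13, 25]  # initially covers "B. subtilis," including the comma
# Drop non-alphanumeric characters from the end of the span.
while not sentenceText[tokPos[1] - 1].isalnum():
    tokPos[1] -= 1
    if tokPos[1] < tokPos[0] + 1:
        tokPos[1] = tokPos[0] + 1
        break
# Drop non-alphanumeric characters from the start of the span.
while not sentenceText[tokPos[0]].isalnum():
    tokPos[0] += 1
    if tokPos[0] >= tokPos[1]:
        tokPos[0] = tokPos[1] - 1
        break
print sentenceText[tokPos[0]:tokPos[1]]  # prints "B. subtilis"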
Example No. 20
def fixEntities(xml):
    counts = defaultdict(int)
    for sentence in xml.getiterator("sentence"):
        sText = sentence.get("text")
        for entity in sentence.findall("entity"):
            charOffset = entity.get("charOffset")
            if charOffset == "-":
                # Invalid offsets are not expected here; fail loudly. The
                # removal code below is unreachable and kept for reference.
                assert False, str(entity)
                sentence.remove(entity)
                counts["removed-invalid"] += 1
            else:
                charOffset = Range.charOffsetToSingleTuple(charOffset)
                # fix length
                realLength = len(entity.get("text"))
                lenDiff = (charOffset[1] - charOffset[0] + 1) - realLength
                if lenDiff != 0:
                    counts["incorrect-ent-offset"] += 1
                    counts["incorrect-ent-offset-diff" + str(lenDiff)] += 1
                    if abs(lenDiff) > 2:
                        print "Warning, lenDiff:", (lenDiff, charOffset, sText,
                                                    entity.get("text"),
                                                    entity.get("id"))
                charOffset = (charOffset[0], charOffset[0] + realLength)
                # find starting position
                entIndex = sText.find(entity.get("text"), charOffset[0])
                if entIndex == -1:
                    for i in [-1, -2, -3]:
                        entIndex = sText.find(entity.get("text"),
                                              charOffset[0] + i)
                        if entIndex != -1:
                            break
                if entIndex != 0:  # could be lowercase
                    sTextLower = sText.lower()
                    for i in [0, -1, -2, -3]:
                        lowerEntIndex = sTextLower.find(
                            entity.get("text"), charOffset[0] + i)
                        if lowerEntIndex != -1:
                            break
                    if lowerEntIndex != -1 and abs(lowerEntIndex - charOffset[0]) < abs(entIndex - charOffset[0]):
                        entIndex = lowerEntIndex
                assert entIndex != -1, (charOffset, sText, entity.get("text"),
                                        entity.get("id"))
                indexDiff = entIndex - charOffset[0]
                if indexDiff != 0:
                    counts["incorrect-ent-index"] += 1
                    counts["incorrect-ent-index-diff" + str(indexDiff)] += 1
                    print "Warning, indexDiff:", (indexDiff, charOffset, sText,
                                                  entity.get("text"),
                                                  entity.get("id"))
                # move offset
                charOffset = (charOffset[0] + indexDiff,
                              charOffset[1] + indexDiff)
                # validate new offset
                sEntity = sText[charOffset[0]:charOffset[1]]
                assert sEntity == entity.get("text") or sEntity.lower() == entity.get("text"), \
                    (charOffset, sText, entity.get("text"), entity.get("id"))
                entity.set(
                    "charOffset",
                    Range.tuplesToCharOffset((charOffset[0], charOffset[1])))
                entity.set("given", "True")
        for interaction in sentence.findall("interaction"):
            interaction.set("type", "DDI")
    print "Fix counts:", counts
Example No. 21
def moveElements(document):
    entMap = {}
    entSentence = {}
    entSentenceIndex = {}
    sentences = document.findall("sentence")
    sentenceCount = 0
    for sentence in sentences:
        sentenceOffset = Range.charOffsetToSingleTuple(
            sentence.get("charOffset"))
        # Move entities
        entCount = 0
        for entity in document.findall("entity"):
            entityOffsets = Range.charOffsetToTuples(entity.get("charOffset"))
            overlaps = False
            for entityOffset in entityOffsets:
                if Range.overlap(sentenceOffset, entityOffset):
                    overlaps = True
                    break
            if overlaps:
                document.remove(entity)
                sentence.append(entity)
                entityId = entity.get("id")
                entityIdLastPart = entityId.rsplit(".", 1)[-1]
                if entityIdLastPart.startswith("e"):
                    entity.set("id",
                               sentence.get("id") + "." + entityIdLastPart)
                    entMap[entityId] = sentence.get(
                        "id") + "." + entityIdLastPart
                else:
                    entity.set("docId", entityId)
                    entity.set("id", sentence.get("id") + ".e" + str(entCount))
                    entMap[entityId] = sentence.get("id") + ".e" + str(
                        entCount)
                entSentence[entityId] = sentence
                entSentenceIndex[entityId] = sentenceCount
                newEntityOffsets = []
                for entityOffset in entityOffsets:
                    newOffset = (entityOffset[0] - sentenceOffset[0],
                                 entityOffset[1] - sentenceOffset[0])
                    newOffset = (max(0, newOffset[0]), max(0, newOffset[1]))
                    if newOffset != (0, 0):
                        assert newOffset[1] > newOffset[0], (entity.attrib,
                                                             entityOffsets,
                                                             sentenceOffset)
                        newEntityOffsets.append(
                            (entityOffset[0] - sentenceOffset[0],
                             entityOffset[1] - sentenceOffset[0]))
                assert len(newEntityOffsets) > 0, (entity.attrib,
                                                   entityOffsets,
                                                   sentenceOffset)
                entity.set("origOffset", entity.get("charOffset"))
                entity.set("charOffset",
                           Range.tuplesToCharOffset(newEntityOffsets))
                entCount += 1
        sentenceCount += 1
    if len(document.findall("entity")) != 0:
        raise Exception(
            "Sentence splitting does not cover the entire document")
    # Move interactions
    intCount = 0
    interactions = []
    interactionOldToNewId = {}
    for interaction in document.findall("interaction"):
        interactions.append(interaction)
        # An interaction always goes to the sentence of its e1 entity, as that
        # is the event it is an argument of. For an intersentence relation the
        # choice of sentence should not matter.
        targetSentence = entSentence[interaction.get("e1")]
        document.remove(interaction)
        targetSentence.append(interaction)
        newId = targetSentence.get("id") + ".i" + str(intCount)
        interactionOldToNewId[interaction.get("id")] = newId
        interaction.set("id", newId)
        interaction.set("e1", entMap[interaction.get("e1")])
        interaction.set("e2", entMap[interaction.get("e2")])
        intCount += 1
    for interaction in interactions:
        if interaction.get("siteOf") != None:
            interaction.set("siteOf",
                            interactionOldToNewId[interaction.get("siteOf")])
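
# A minimal usage sketch for moveElements (not part of the original example);
# the ids, texts and offsets are illustrative, and the offsets assume the
# convention expected by Utils.Range in these examples.
document = ET.Element("document", {"id": "CP17.d0"})
sentence = ET.SubElement(document, "sentence",
                         {"id": "CP17.d0.s0", "charOffset": "0-10",
                          "text": "A binds B."})
# Document-level entities and an interaction, to be moved under the sentence.
ET.SubElement(document, "entity",
              {"id": "CP17.d0.e0", "charOffset": "2-7", "text": "binds"})
ET.SubElement(document, "entity",
              {"id": "CP17.d0.e1", "charOffset": "8-9", "text": "B"})
ET.SubElement(document, "interaction",
              {"id": "CP17.d0.i0", "e1": "CP17.d0.e0", "e2": "CP17.d0.e1"})
moveElements(document)
for element in sentence:
    print element.tag, element.get("id"), element.get("charOffset")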
Example No. 22
def convertChemProt(inDirs=None,
                    setNames=None,
                    outPath=None,
                    goldTestSet=True,
                    downloadDir=None,
                    extractDir=None,
                    redownload=False,
                    debug=False):
    tempDir = None
    if inDirs == None:
        print >> sys.stderr, "---------------", "Downloading ChemProt files", "---------------"
        if extractDir == None:
            tempDir = tempfile.mkdtemp()
        inDirs = []
        for setName in ("TRAIN", "DEVEL", "TEST"):
            if goldTestSet and setName == "TEST":
                setName = "TEST_GOLD"
            if Settings.URL["CP17_" + setName] != None:
                currentExtractDir = extractDir if extractDir else tempDir
                currentExtractDir = os.path.join(currentExtractDir,
                                                 setName.lower())
                inDirs.append(
                    downloadFile(Settings.URL["CP17_" + setName], downloadDir,
                                 currentExtractDir, redownload))
    print >> sys.stderr, "Reading ChemProt corpus from input", inDirs, "using dataset mapping", setNames
    dataSets = OrderedDict()
    for inDir in inDirs:
        print >> sys.stderr, "Reading input directory", inDir
        filenames = os.listdir(inDir)
        filetypes = ["_abstracts", "_entities", "_relations"]
        # Collect the file paths for the data types
        dirDataSets = set()
        for filename in filenames:
            if not (filename.endswith(".tsv")
                    and any([x in filename for x in filetypes])):
                continue
            dataSetId, dataType = filename.replace("_gs", "").rsplit("_", 1)
            if setNames != None:
                dataSetId = setNames.get(dataSetId, dataSetId)
            dirDataSets.add(dataSetId)
            dataType = dataType.split(".")[0]
            if dataSetId not in dataSets:
                dataSets[dataSetId] = {}
            assert dataType not in dataSets[dataSetId]
            dataSets[dataSetId][dataType] = os.path.join(inDir, filename)
        print >> sys.stderr, "Found ChemProt datasets", list(
            dirDataSets), "at", inDir
    print >> sys.stderr, "Read datasets:", dataSets.keys()
    # Build the Interaction XML
    print >> sys.stderr, "Converting to Interaction XML"
    corpusName = "CP17"
    corpus = ET.Element("corpus", {"source": corpusName})
    counts = defaultdict(int)
    docById = {}
    entityById = {}
    entitiesByDoc = {}
    docsWithErrors = set()
    for dataSetId in sorted(dataSets.keys()):
        prevCounts = copy.copy(counts)
        print >> sys.stderr, "---", "Building elements for dataset", dataSetId, "---"
        dataSet = dataSets[dataSetId]
        counts["sets"] += 1
        with open(dataSet["abstracts"], "rt") as f:
            print >> sys.stderr, "Adding document elements for dataset", dataSetId
            for row in UnicodeDictReader(
                    f,
                    delimiter="\t",
                    fieldnames=["id", "title", "abstract"],
                    quoting=csv.QUOTE_NONE):
                document = ET.Element(
                    "document", {
                        "id": corpusName + ".d" + str(counts["documents"]),
                        "origId": row["id"],
                        "set": dataSetId
                    })
                document.set("text", row["title"] + " " + row["abstract"])
                document.set("titleOffset",
                             Range.tuplesToCharOffset((0, len(row["title"]))))
                if document.get("origId") in docById:
                    prevDoc = docById[document.get("origId")]
                    assert document.get("text") == prevDoc.get("text")
                    assert document.get("titleOffset") == prevDoc.get("titleOffset")
                    counts["duplicate-documents"] += 1
                else:
                    corpus.append(document)
                    docById[document.get("origId")] = document
                    counts["documents"] += 1
        with open(dataSet["entities"], "rt") as f:
            print >> sys.stderr, "Adding entity elements for dataset", dataSetId
            for row in UnicodeDictReader(
                    f,
                    delimiter="\t",
                    fieldnames=["docId", "id", "type", "begin", "end", "text"],
                    quoting=csv.QUOTE_NONE):
                document = docById[row["docId"]]
                assert row["type"] in ("CHEMICAL", "GENE-Y", "GENE-N")
                # Check for duplicate entities
                if row["docId"] not in entitiesByDoc:
                    entitiesByDoc[row["docId"]] = set()
                assert row["id"] not in entitiesByDoc[row["docId"]]
                entitiesByDoc[row["docId"]].add(row["id"])
                # Determine the offset
                offset = (int(row["begin"]), int(row["end"]))
                docSpan = document.get("text")[offset[0]:offset[1]]
                if docSpan == row["text"]:
                    entity = ET.SubElement(document, "entity", {
                        "id": document.get("id") + ".e" +
                              str(len(document.findall("entity")))
                    })
                    entity.set("given", "True")
                    entity.set("origId", row["id"])
                    entity.set("type", row["type"].split("-")[0])
                    entity.set(
                        "normalized",
                        "True" if row["type"].endswith("-Y") else "False")
                    entity.set(
                        "charOffset",
                        Range.tuplesToCharOffset((offset[0], offset[1])))
                    entity.set("text", row["text"])
                    if row["docId"] not in entityById:
                        entityById[row["docId"]] = {}
                    assert entity.get("origId") not in entityById[row["docId"]]
                    entityById[row["docId"]][entity.get("origId")] = entity
                    counts["entities"] += 1
                else:
                    print >> sys.stderr, "Alignment error in document", row[
                        "docId"], (offset, docSpan, row)
                    counts["entities-error"] += 1
                    docsWithErrors.add(row["docId"])
        if "relations" in dataSet:
            print >> sys.stderr, "Adding relation elements for dataset", dataSetId
            with open(dataSet["relations"], "rt") as f:
                for row in UnicodeDictReader(f,
                                             delimiter="\t",
                                             fieldnames=[
                                                 "docId", "group", "groupEval",
                                                 "type", "arg1", "arg2"
                                             ],
                                             quoting=csv.QUOTE_NONE):
                    for argId in ("1", "2"):
                        # Strip the "Arg1:"/"Arg2:" prefix (5 characters).
                        assert row["arg" + argId].startswith("Arg" + argId + ":")
                        row["arg" + argId] = row["arg" + argId][5:]
                    document = docById[row["docId"]]
                    e1 = entityById[row["docId"]].get(row["arg1"])
                    e2 = entityById[row["docId"]].get(row["arg2"])
                    if e1 != None and e2 != None:
                        interaction = ET.SubElement(document, "interaction", {
                            "id": document.get("id") + ".i" +
                                  str(len(document.findall("interaction")))
                        })
                        interaction.set("directed", "True")
                        interaction.set("type", row["group"])
                        interaction.set("relType", row["type"])
                        row["groupEval"] = row["groupEval"].strip()
                        assert row["groupEval"] in ("Y", "N")
                        interaction.set(
                            "evaluated",
                            "True" if row["groupEval"] == "Y" else "False")
                        interaction.set("e1", e1.get("id"))
                        interaction.set("e2", e2.get("id"))
                        counts["interactions"] += 1
                    else:
                        counts["interaction-error"] += 1
                        docsWithErrors.add(row["docId"])
        else:
            print >> sys.stderr, "No relations for dataset", dataSetId
        print >> sys.stderr, "dataset", dataSetId, {
            x: counts[x] - prevCounts.get(x, 0)
            for x in counts if counts[x] - prevCounts.get(x, 0) > 0
        }
    if len(docsWithErrors) > 0:
        counts["documents-with-errors"] = len(docsWithErrors)
    print >> sys.stderr, "---", "All Datasets Done", "---"
    print >> sys.stderr, "ChemProt conversion:", dict(counts)
    if tempDir != None and not debug:
        print >> sys.stderr, "Removing temporary directory", tempDir
        shutil.rmtree(tempDir)
    if outPath != None:
        ETUtils.write(corpus, outPath)
    return ET.ElementTree(corpus)
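
# A typical invocation of convertChemProt (not part of the original example);
# the directory paths and the set-name mapping are illustrative placeholders
# for locally extracted ChemProt .tsv files. With inDirs=None the function
# instead downloads the distribution using the Settings.URL entries above.
xml = convertChemProt(inDirs=["/data/chemprot/train", "/data/chemprot/devel"],
                      setNames={"chemprot_training": "train",
                                "chemprot_development": "devel"},
                      outPath="CP17-corpus.xml")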