Example #1
def addMTMX(input, mtmxDir, output):
    from collections import defaultdict
    # read interaction XML
    print "Reading interaction XML"
    counts = defaultdict(int)
    xml = ETUtils.ETFromObj(input).getroot()
    docById = {}
    for document in xml.getiterator("document"):
        docId = document.get("origId")
        assert docId not in docById
        docById[docId] = document
        counts["document"] += 1
    for entity in xml.getiterator("entity"):
        counts["entity"] += 1
    
    # read MTMX files
    print "Processing MTMX"
    for filename in sorted(os.listdir(mtmxDir)):
        if filename.endswith(".xml"):
            print filename,
            fileId = filename.split("_")[0]
            if fileId not in docById:
                print "skipped"
                continue
            else:
                print "processing"
            doc = docById[fileId]
            entityByOrigId = {}
            for entity in doc.getiterator("entity"):
                assert entity.get("origId") not in entityByOrigId, entity.get("origId")
                entityByOrigId[entity.get("origId")] = entity
            mtmx = ETUtils.ETFromObj(os.path.join(mtmxDir, filename)).getroot()
            for phrase in mtmx.getiterator("PHRASE"):
                if phrase.get("ID") in entityByOrigId:
                    entity = entityByOrigId[phrase.get("ID")]
                    mapCount = 0
                    for map in phrase.getiterator("MAP"):
                        entityText = entity.get("text").lower()
                        if map.get("NAME").lower() == entityText or map.get("NAME_SHORT").lower() == entityText:
                            if entity.get("mtmxProb") != None:
                                if int(entity.get("mtmxProb")) > int(map.get("PROB")):
                                    break
                                else:
                                    counts["mapped-multi"] += 1
                                    counts["mapped-multi-"+str(mapCount)] += 1
                                    #print filename, phrase.get("ID")
                            else:
                                counts["mapped-at-least-once"] += 1
                            entity.set("mtmxProb", str(map.get("PROB")))
                            entity.set("mtmxCui", str(map.get("CUI")))
                            entity.set("mtmxName", str(map.get("NAME")))
                            entity.set("mtmxNameShort", str(map.get("NAME_SHORT")))
                            entity.set("mtmxSemTypes", str(map.get("SEMTYPES")))
                            counts["mappings"] += 1
                            mapCount += 1
    print counts
    ETUtils.write(xml, output)
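addMTMX pairs each MTMX file with a corpus document through the filename prefix before the first "_". A minimal usage sketch; the three paths are placeholders, not from the original project:

    # Hypothetical call: enrich an interaction XML corpus with MTMX concept mappings.
    addMTMX("corpus.xml", "mtmx-files/", "corpus-mtmx.xml")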
Example #2
def loadEventXML(path, verbose=False):
    xml = ETUtils.ETFromObj(path)
    sentDict = {}
    for sentence in xml.getiterator("sentence"):
        sentenceText = getText(sentence).strip()
        if not sentDict.has_key(sentenceText):
            sentDict[sentenceText] = []

    for event in xml.getiterator("event"):
        sentenceText = getText(event).strip()
        if not sentDict.has_key(sentenceText):
            sentDict[sentenceText] = []
        events = sentDict[sentenceText]

        clue = event.find("clue")
        clueTuple = getClue(clue)
        eventType = event.find("type").get("class")
        if eventType == "Protein_amino_acid_phosphorylation":
            eventType = "Phosphorylation"
        if type(clueTuple) == types.StringType:
            if verbose:
                print "Event", eventType, "clue with no clueType:", ETUtils.toStr(clue)
        else:
            clueText = sentenceText[clueTuple[1]:clueTuple[2] + 1]
            assert clueText == clueTuple[0], (sentenceText, clueText, clueTuple)
            event = (clueTuple[1], clueTuple[2], eventType, clueTuple[0])
            if event not in events:
                events.append(event)
    return sentDict
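loadEventXML keys events by raw sentence text; each value is a list of (start, end, type, clueText) tuples with inclusive character offsets into that text. A sketch of the resulting shape, with an invented sentence:

    # Hypothetical entry: offsets 5-18 cover "phosphorylates" inclusively.
    sentDict = {"MAPK phosphorylates p53.": [(5, 18, "Phosphorylation", "phosphorylates")]}
    start, end, eventType, clueText = sentDict["MAPK phosphorylates p53."][0]
    assert "MAPK phosphorylates p53."[start:end + 1] == clueText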
Example #3
def convert(input, output=None, outputRoot=None):
    print >> sys.stderr, "##### Convert PMC to Interaction XML #####"

    print >> sys.stderr, "Loading corpus", input
    pmcTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    pmcRoot = pmcTree.getroot()

    includeElements = [
        "front", "article-meta", "title-group", "article-title", "abstract",
        "body", "sec", "p", "title"
    ]
    collapseElements = ["front", "article-meta", "title-group", "p"]

    if outputRoot == None:
        outputRoot = ET.Element("corpus")
        outputRoot.set("source", "PMC")

    outputRoot.append(addElements(pmcRoot, includeElements, collapseElements))

    outputTree = ET.ElementTree(outputRoot)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(outputTree, output)
    return outputTree
Example #4
 def run(cls, inFile, multiplier=1.0, outFile=None, targetLabel="neg", binary=False):
     """inFile can be a string with file name (.xml or .xml.gz) or an ElementTree or an Element or an open input stream
     multiplier adjusts the level of boosting the non-negative predictions, it is a real number (0,inf)
     multiplier 1.0 does nothing, <1.0 decreases negative class confidence, >1.0 increases negative class confidence
     the root of the modified tree is returned and, if outFile is a string, written out to outFile as well"""
     print >> sys.stderr, "##### Recall adjust with multiplier " + str(multiplier)[:5] + " #####"
     tree=ETUtils.ETFromObj(inFile)
     if not ET.iselement(tree):
         assert isinstance(tree,ET.ElementTree)
         root=tree.getroot()
     else:
         root = tree
     
     if multiplier != -1:
         if binary:
             print >> sys.stderr, "Recall binary mode"
             classRanges = getClassRanges(root.getiterator("entity"))
             assert len(classRanges.keys()) in [0,2]
             if len(classRanges.keys()) == 0:
                 print >> sys.stderr, "Warning, recall adjustment skipped because no prediction weights found"
         else:
             print >> sys.stderr, "Recall multiclass mode"
             classRanges = None
         for entityNode in root.getiterator("entity"):
             adjustEntity(entityNode,targetLabel,multiplier,classRanges)
     if outFile:
         ETUtils.write(root,outFile)
     return tree
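The docstring defines the multiplier as a scaling of the "neg" class confidence. adjustEntity is not shown here, so the following is an assumed reading of that contract, with made-up class weights:

    # Sketch: scale the negative class weight, then re-pick the best label.
    weights = {"neg": 1.2, "Phosphorylation": 1.0}  # hypothetical confidences
    multiplier = 0.5                                # < 1.0 lowers "neg" confidence
    weights["neg"] *= multiplier
    print max(weights, key=weights.get)             # now "Phosphorylation"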
Example #5
 def convert(self, input, dataSetNames=None, corpusName=None, output=None):
     if os.path.isdir(input) or input.endswith(".tar.gz") or "," in input:
         print >> sys.stderr, "Converting ST-format to Interaction XML"
         dataSetDirs = input
         documents = []
         if type(dataSetDirs) in types.StringTypes:
             dataSetDirs = dataSetDirs.split(",")
         if dataSetNames == None:
             dataSetNames = []
         elif type(dataSetNames) in types.StringTypes:
             dataSetNames = dataSetNames.split(",")
         for dataSetDir, dataSetName in itertools.izip_longest(
                 dataSetDirs, dataSetNames, fillvalue=None):
             print >> sys.stderr, "Reading", dataSetDir, "set,",
             docs = STFormat.STTools.loadSet(dataSetDir, dataSetName)
             print >> sys.stderr, len(docs), "documents"
             documents.extend(docs)
         print >> sys.stderr, "Resolving equivalences"
         STFormat.Equiv.process(documents)
         self.xml = STFormat.ConvertXML.toInteractionXML(
             documents, self.intermediateFileTag, output)
     else:
         print >> sys.stderr, "Processing source as interaction XML"
         self.xml = ETUtils.ETFromObj(input)
     return self.xml
Example #6
def processCorpus(input, attrs=["text"]):
    print attrs
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
    interactors = {}
    for document in documents:
        entDict = {}
        for entity in document.getiterator("entity"):
            entDict[entity.get("id")] = entity
        for interaction in document.getiterator("interaction"):
            e1 = entDict[interaction.get("e1")]
            e2 = entDict[interaction.get("e2")]
            # form identifier tuples
            e1Tuple = []
            for attr in attrs:
                e1Tuple.append(e1.get(attr))
            e1Tuple = tuple(e1Tuple)
            e2Tuple = []
            for attr in attrs:
                e2Tuple.append(e2.get(attr))
            e2Tuple = tuple(e2Tuple)
            interactors = [e1Tuple, e2Tuple]
            #interactors.sort()
            print interactors
Example #7
def removeUnconnectedEntities(input, output=None):
    input = ETUtils.ETFromObj(input)
    root = input.getroot()
    removed = 0
    preserved = 0
    for document in root.findall("document"):
        sentMap = {}  # allow for intersentence interactions
        for sentence in document.findall("sentence"):
            sentMap[sentence.get("id")] = sentence
        connected = set()
        for interaction in document.getiterator("interaction"):
            connected.add(interaction.get("e1"))
            connected.add(interaction.get("e2"))
        entities = []
        for entity in document.getiterator("entity"):
            entities.append(entity)
        for entity in entities:
            if entity.get("isName") == "True":  # never remove named entities
                continue
            eId = entity.get("id")
            if eId not in connected:
                if eId.find(".s") != -1:  # sentence level entity
                    sentMap[eId.rsplit(".", 1)[0]].remove(entity)
                else:  # document level entity
                    document.remove(entity)
                removed += 1
            else:
                preserved += 1

    print >> sys.stderr, "Removed", removed, "entities, preserved", preserved, "entities"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(root, output)
    return input
Example #8
File: BIFull.py  Project: thiagoki/Tdevel
def makeSubset(filename, output, ratio, seed):
    if ratio == 1.0:
        return filename
    totalFolds = 100
    selectedFolds = int(ratio * 100.0)
    print >> sys.stderr, "====== Making subset ======"
    print >> sys.stderr, "Subset for file", filename, "ratio", ratio, "seed", seed
    import cElementTreeUtils as ETUtils
    import Core.Split
    xml = ETUtils.ETFromObj(filename).getroot()
    count = 0
    sentCount = 0
    for document in xml.findall("document"):
        sentCount += len(document.findall("sentence"))
        count += 1
    division = Core.Split.getFolds(count, totalFolds, seed)
    #print division, selectedFolds - 1
    index = 0
    removeCount = 0
    sentRemoveCount = 0
    for document in xml.findall("document"):
        if division[index] > selectedFolds - 1:
            xml.remove(document)
            sentRemoveCount += len(document.findall("sentence"))
            removeCount += 1
        index += 1
    print "Subset", "doc:", count, "removed:", removeCount, "sent:", sentCount, "sentremoved:", sentRemoveCount
    ETUtils.write(xml, output)
    return output
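A document is kept when its fold index is at most selectedFolds - 1, so ratio 0.1 keeps roughly folds 0-9 of 100. The same selection rule in isolation, with Core.Split.getFolds replaced by a seeded random fold assignment purely for illustration:

    import random
    random.seed(0)
    totalFolds, selectedFolds = 100, int(0.1 * 100.0)
    division = [random.randint(0, totalFolds - 1) for i in range(1000)]
    kept = [i for i, fold in enumerate(division) if fold <= selectedFolds - 1]
    print len(kept), "of", len(division), "documents kept"  # roughly 10%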
Example #9
def fixAltOffsets(input, output=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    
    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence ("+sentence.get("id")+"): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i] 
                altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1
        
    print >> sys.stderr, "Fixed", fixCount, "altOffsets"
        
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
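The fix subtracts the sentence's own start offset, turning document-relative altOffsets into sentence-relative ones. The arithmetic in isolation, with invented offsets:

    sentOffset = (100, 150)  # sentence span within the document
    altOffset = (112, 118)   # entity span, document-relative
    print (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])  # (12, 18)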
Example #10
    def run(cls,
            fileIn,
            fileOut=None,
            tokenization="split-Charniak-Lease",
            entityOffsetKey="charOffset",
            includeNeg=False,
            stem=False):
        """Builds the master gazzeteer.
        fileIn: a string (ending with .xml or .xml.gz), an open input stream, an ElementTree or an Element
        fileOut: a string or None. If given, the resulting gazzetteer will be written out
        tokenization: name of the tokenization to be used

        Produces a dictionary with...
        """

        print >> sys.stderr, "Building gazetteer"

        gztr = {}  #key: token value: dictionary (key: className, value count)
        root = ETUtils.ETFromObj(fileIn)
        if not ET.iselement(root):
            assert isinstance(root, ET.ElementTree)
            root = root.getroot()
        sentences = []
        for sNode in root.getiterator("sentence"):
            sentences.append(sNode)
        counter = ProgressCounter(len(sentences), "Build gazetteer")
        for sNode in sentences:
            counter.update(
                1, "Adding to gazetteer sentence " + sNode.get("id") + ", ")
            for tokenizationNode in sNode.getiterator("tokenization"):
                if tokenizationNode.get("tokenizer") == tokenization:
                    break
            else:
                assert False, "Did not find %s tokenization" % tokenization
            tClasses = tokClasses(tokenizationNode, sNode, entityOffsetKey)
            assert len(tClasses) == len(tokenizationNode)
            for tokIdx, tokNode in enumerate(tokenizationNode):
                gsClass = tClasses[tokIdx]
                b, e = charOffStr2tuple(tokNode.get("charOffset"))
                tokNodeTxt = tokTxt(b, e, sNode, stem).lower()
                tokDict = gztr.setdefault(tokNodeTxt, {})
                tokDict[gsClass] = tokDict.get(gsClass, 0) + 1
                # for multi-part texts, add collapsed and last token versions
                if tokNodeTxt.find("-") != -1:
                    # collapsed
                    text = tokNodeTxt.replace("-", "")
                    if text != "":
                        tokDict = gztr.setdefault(text, {})
                        tokDict[gsClass] = tokDict.get(gsClass, 0) + 1
                    # last part
                    text = tokNodeTxt.rsplit("-", 1)[-1]
                    if text != "":
                        tokDict = gztr.setdefault(text, {})
                        tokDict[gsClass] = tokDict.get(gsClass, 0) + 1
        if fileOut:
            Gazetteer.saveGztr(gztr, fileOut, includeNeg)
        return gztr
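The gazetteer maps each lower-cased (optionally stemmed) token string to a dictionary of class counts, and hyphenated tokens are also indexed under their collapsed and final parts. A sketch of the resulting shape, with an invented token and class:

    # Hypothetical gztr contents after seeing "p53-dependent" three times as Protein:
    gztr = {
        "p53-dependent": {"Protein": 3},
        "p53dependent": {"Protein": 3},   # collapsed form
        "dependent": {"Protein": 3},      # last hyphen-separated part
    }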
Example #11
def removeHeads(corpus):
    print >> sys.stderr, "Removing existing head offsets"
    removeCount = 0
    xml = ETUtils.ETFromObj(corpus)
    for d in xml.getroot().findall("document"):
        for s in d.findall("sentence"):
            for e in s.findall("entity"):
                if e.get("headOffset") != None:
                    removeCount += 1
                    del e.attrib["headOffset"]
    print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
    return [0, removeCount]
Example #12
File: prune.py  Project: thiagoki/Tdevel
def interface(optionArgs=sys.argv[1:]):
    """
    The function to handle the command-line interface.
    """
    from optparse import OptionParser

    op = OptionParser(usage="%prog [options]\nGenia shared task specific pruning of invalid nodes and edges.")
    op.add_option("-i", "--infile",
                  dest="infile",
                  help="Input file (gifxml)",
                  metavar="FILE")
    op.add_option("-o", "--outfile",
                  dest="outfile",
                  help="Output file (gifxml)",
                  metavar="FILE")
    op.add_option("-c", "--cycles",
                  dest="cycles",
                  help="Remove cycles (requires the presence of 'predictions' attribute in 'interaction' elements)",
                  default=False,
                  action="store_true")
    (options, args) = op.parse_args(optionArgs)

    quit = False
    if not options.infile:
        print "Please specify the input file."
        quit = True
#    if not options.outfile:
#        print "Please specify the output file."
#        quit = True
    if quit:
        op.print_help()
        return False

    corpus = ETUtils.ETFromObj(options.infile)
    cycleBrokenCount = 0
    skipCount = 0
    for document in corpus.getroot().findall('document'):
        for sentence in document.findall("sentence"):
            #sys.stderr.write("Pruning document %s\n"%document.attrib['id'])
            pruner = Pruner(sentence)
            pruner.analyse()
            if options.cycles:
                cycleBrokenCount += pruner.analyseCycles()
            pruner.prune()
    sys.stderr.write("File pruned, broke " + str(cycleBrokenCount) + " cycles\n" )
    if skipCount > 0:
        sys.stderr.write("Pruning skipped " + str(skipCount) + " sentences\n" )
    if options.outfile:
        ETUtils.write(corpus, options.outfile)
    return corpus
Example #13
def negateEvents(input, output=None, verbose=False):
    # "input" can be a corpus (file name, tree or root element) or a single sentence element
    isSentenceElement = ET.iselement(input) and input.tag == "sentence"
    if isSentenceElement:
        sentences = [input]
    else:
        print >> sys.stderr, "Loading corpus file", input
        corpusTree = ETUtils.ETFromObj(input)
        corpusRoot = corpusTree.getroot()
        sentences = corpusRoot.getiterator("sentence")
    counts = defaultdict(int)
    for sentence in sentences:
        for entity in sentence.findall("entity"):
            counts["all-entities"] += 1
            eType = entity.get("type")
            if not isNegatableEPITrigger(eType):
                counts["out-of-scope"] += 1
                continue
            eBaseType = getEPIBaseType(eType)
            eText = entity.get("text").lower()
            eNewType = determineNewType(eType, eText)

            # Insert changed charOffset
            counts["entities"] += 1
            if verbose:
                print "Entity", entity.get("id"), [entity.get("text")], [eType, eBaseType, eNewType],
            if eNewType != eBaseType:
                counts["negated"] += 1
                if verbose: print "NEGATED",
            if eNewType == eType:
                counts["correct"] += 1
                if verbose: print "CORRECT"
            else:
                counts["incorrect"] += 1
                if eNewType == eBaseType:
                    counts["incorrect-pos"] += 1
                else:
                    counts["incorrect-neg"] += 1
                if verbose: print "INCORRECT"
            entity.set("type", eNewType)
    if verbose:
        print counts

    if not isSentenceElement:
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusRoot, output)
        return corpusTree
Example #14
def getEmptyCorpus(xml, deletionRules=None):
    """
    A convenience function for getting an empty corpus, useful for testing for information leaks
    in the event extraction process.
    """
    if type(xml) in types.StringTypes:
        # XML is read from disk, so it's a new copy and can be safely modified
        xml = ETUtils.ETFromObj(xml)
    else:
        # XML is already an object in memory. To prevent problems with other users of it, a copy
        # is created before deleting elements.
        xml = copy.deepcopy(xml)
    if deletionRules == None:  # use default rules for BioNLP Shared Task
        # We remove all interactions, and all entities that are not named entities. This leaves only
        # the gold standard protein/gene names
        deletionRules = {"interaction": {}, "entity": {"isName": "False"}}
    # Remove elements and return the emptied XML
    return processCorpus(xml, None, deletionRules)
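deletionRules maps an element tag to an attribute filter, and an empty filter matches every element with that tag. Another rule set of the same shape (hypothetical, assuming processCorpus treats the inner dictionary as a match condition, as the default rules suggest):

    # Hypothetical: also remove named entities, leaving no annotation at all.
    deletionRules = {"interaction": {}, "entity": {}}
    emptied = getEmptyCorpus("corpus.xml", deletionRules)  # placeholder path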
Example #15
def getTriggers(corpus):
    """
    Returns a dictionary of "entity type"->"entity text"->"count"
    """
    corpus = ETUtils.ETFromObj(corpus)
    trigDict = {}
    for entity in corpus.getroot().getiterator("entity"):
        if entity.get("isName") == "True":
            continue
        eType = entity.get("type")
        if not trigDict.has_key(eType):
            trigDict[eType] = {}
        eText = entity.get("text")
        eText = PorterStemmer.stem(eText)
        if not trigDict[eType].has_key(eText):
            trigDict[eType][eText] = 0
        trigDict[eType][eText] += 1
    return trigDict
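The result is a two-level dictionary: entity type -> stemmed trigger text -> count. Reading it back, with a placeholder corpus path:

    trigDict = getTriggers("corpus.xml")
    for eType in sorted(trigDict.keys()):
        for eText in sorted(trigDict[eType].keys()):
            print eType, eText, trigDict[eType][eText]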
Example #16
def loadCorpus(filename,
               parse=None,
               tokenization=None,
               removeIntersentenceInteractions=True,
               removeNameInfo=False):
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import cElementTree as ET
    import sys, gzip

    if type(filename) == types.StringType:
        print >> sys.stderr, "Loading corpus file", filename
    corpusTree = ETUtils.ETFromObj(filename)
    corpusRoot = corpusTree.getroot()
    return CorpusElements(corpusRoot, parse, tokenization,
                          removeIntersentenceInteractions, corpusTree,
                          removeNameInfo)
Example #17
def convert(srFiles, xmlFileName, outdir, corpusName, idByNorText=False):
    print >> sys.stderr, "Loading Static Relations"
    events = {}
    for srFile in srFiles:
        readEventsFromSR(srFile[0], srFile[1], events, idByNorText=idByNorText)
    
    if xmlFileName != None:
        xmlEvents = {}
        dataSets = {}
        srTexts = {} # original, unnormalized sentence texts from the SR corpus
        eventsToXML(events, xmlEvents, dataSets, srTexts)
        
        print >> sys.stderr, "Loading XML"
        xml = ETUtils.ETFromObj(xmlFileName)
        print >> sys.stderr, "Inserting XML events"
        insertEvents(xmlEvents, dataSets, srTexts, xml, corpusName)
        ETUtils.write(xml, outdir+corpusName+"-srevents.xml")
        # update pre-existing parses
        print >> sys.stderr, "Head Detection"
        xml = FindHeads.findHeads(xml, "split-mccc-preparsed", tokenization=None, output=outdir+corpusName+"-heads.xml", removeExisting=True)
        print >> sys.stderr, "Dividing into sets"
        InteractionXML.DivideSets.processCorpus(xml, outdir, corpusName + "-", ".xml", [("devel", "train")])
        print >> sys.stderr, "Converting back"
        STConvert.toSTFormat(outdir+corpusName + "-devel.xml", outdir + corpusName + "-stformat-devel", outputTag="rel", task=2, debug=True, validate=False)
        STConvert.toSTFormat(outdir+corpusName + "-train.xml", outdir + corpusName + "-stformat-train", outputTag="rel", task=2, debug=True, validate=False)
    else:
        xml = eventsToNewXML(events)
        xmlTree = ET.ElementTree(xml)
        ETUtils.write(xml, outdir+corpusName+"-srevents.xml")
        xml = xmlTree
        # Parse
        bigfileName = outdir+corpusName
        print >> sys.stderr, "Parsing"
        Tools.CharniakJohnsonParser.parse(xml, bigfileName+"-parsed.xml", tokenizationName="PARSED_TEXT", parseName="McClosky", requireEntities=True, timeout=60)
        print >> sys.stderr, "Stanford Conversion"
        Tools.StanfordParser.convertXML("McClosky", xml, bigfileName+"-stanford.xml")
        print >> sys.stderr, "Protein Name Splitting"
        splitTarget = "McClosky"
        xml = ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
        print >> sys.stderr, "Head Detection"
        xml = FindHeads.findHeads(xml, "split-McClosky", tokenization=None, output=bigfileName+".xml", removeExisting=True)
        print >> sys.stderr, "Dividing into sets"
        InteractionXML.DivideSets.processCorpus(xml, outdir, "SRNE-", ".xml")
Example #18
def parseStats(input):
    print >> sys.stderr, "Loading input file", input
    inputTree = ETUtils.ETFromObj(input)
    inputRoot = inputTree.getroot()
    counts = defaultdict(int)
    for sentence in inputRoot.getiterator("sentence"):
        counts["sentence"] += 1
        analysesElement = sentence.find("sentenceanalyses")
        if analysesElement == None:
            counts["sentence-no-analyses"] += 1
            continue
        # Create parses element (if needed)
        parsesElement = analysesElement.find("parses")
        if parsesElement == None:
            counts["sentence-no-parses"] += 1
            continue
        # Loop through parses
        for parseElement in parsesElement:
            parserName = parseElement.get("parser")
            counts["parse:" + parserName] += 1
            if parseElement.get("pennstring") in ["", None]:
                counts["parse:" + parserName + "(no penn)"] += 1
            if len(parseElement.findall("dependency")) == 0:
                counts["parse:" + parserName + "(no dependencies)"] += 1
            if len(parseElement.findall("phrase")) == 0:
                counts["parse:" + parserName + "(no phrases)"] += 1
        # Tokenizations
        tokenizationsElement = analysesElement.find("tokenizations")
        if tokenizationsElement == None:
            counts["sentence-no-tokenizations"] += 1
            continue
        # Loop through tokenizations
        for tokenizationElement in tokenizationsElement:
            tokenizerName = tokenizationElement.get("tokenizer")
            counts["tokenization:" + tokenizerName] += 1
            if len(tokenizationElement.findall("token")) == 0:
                counts["tokenization:" + tokenizerName + "(no tokens)"] += 1

    print >> sys.stderr, "Parse statistics for", input
    for key in sorted(counts.keys()):
        print >> sys.stderr, " ", key + ":", counts[key]
Example #19
def findHeads(corpus, stringsFrom, methods, parse, tokenization):
    for m in methods:
        assert m in ["REMOVE", "SYNTAX", "DICT"]
    corpus = ETUtils.ETFromObj(corpus)
    counts = {}
    for method in methods:
        print >> sys.stderr, method, "pass"
        if method == "REMOVE":
            counts[method] = removeHeads(corpus)
        elif method == "DICT":
            counts[method] = findHeadsDictionary(corpus, stringsFrom, parse,
                                                 tokenization)
        elif method == "SYNTAX":
            counts[method] = findHeadsSyntactic(corpus, parse, tokenization)
        print >> sys.stderr, method, "pass added", counts[method][0], "and removed", counts[method][1], "heads"

    print >> sys.stderr, "Summary (pass/added/removed):"
    for method in methods:
        print >> sys.stderr, " ", method, "/", counts[method][0], "/", counts[method][1]
Example #20
def mixSets(input, output, docOrigIds, sourceSet, targetSet):
    print >> sys.stderr, "Mixing Sets", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    if docOrigIds != None:
        for document in corpusRoot.getiterator("document"):
            if document.get("pmid") in docOrigIds:
                assert document.get("set") == sourceSet
                document.set("set", targetSet)
                docOrigIds.remove(document.get("pmid"))
        assert len(docOrigIds) == 0, docOrigIds

    sentenceIds = None  # NOTE: sentence-level mixing is disabled; no ids are ever given
    if sentenceIds != None:
        for document in corpusRoot.getiterator("document"):
            removed = []
            for sentence in document.findall("sentence"):
                assert document.get("set") == sourceSet
                sentenceId = sentence.get("id")
                if sentenceId in sentenceIds:
                    document.remove(sentence)
                    removed.append(sentence)
                    sentenceIds.remove(sentenceId)
            if len(removed) > 0:
                newDoc = ET.Element("document")
                for attr in document.attrib:
                    newDoc.set(attr, document.get(attr))
                newDoc.set("id", None)  # placeholder; ids are recalculated below
                newDoc.set("set", targetSet)
                for sentence in removed:
                    newDoc.append(sentence)
                corpusRoot.append(newDoc)
        assert len(sentenceIds) == 0, sentenceIds

        RecalculateIds.recalculateIds(corpusTree, onlyWithinSentence=False)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example #21
def processEquivs(input, output):
    tree = ETUtils.ETFromObj(input)
    root = tree.getroot()
    entitiesById = getElementsById("entity", root)
    sentencesById = getElementsById("sentence", root)
    interactions = list(root.getiterator("interaction")) + list(
        root.getiterator("pair"))

    # map entities to their incoming interactions
    interactionsByEntity = {}
    for interaction in interactions:
        # Equivs are not interactions that we want to duplicate
        if interaction.get("type") == "Equiv":
            continue
        # outgoing
        e1 = interaction.get("e1")
        if not interactionsByEntity.has_key(e1):
            interactionsByEntity[e1] = []
        interactionsByEntity[e1].append(interaction)
        # incoming
        e2 = interaction.get("e2")
        if not interactionsByEntity.has_key(e2):
            interactionsByEntity[e2] = []
        interactionsByEntity[e2].append(interaction)

    count = 0
    for interaction in interactions:
        print >> sys.stderr, "Processing interaction", count, "out of", len(interactions)
        if interaction.get("type") == "Equiv":
            e1 = entitiesById[interaction.get("e1")]
            e2 = entitiesById[interaction.get("e2")]
            duplicateFlat(e2, e1, entitiesById, sentencesById,
                          interactionsByEntity)
            # remove equiv
            sentenceId = interaction.get("id").rsplit(".", 1)[0]
            sentencesById[sentenceId].remove(interaction)
        count += 1
    ETUtils.writeUTF8(root, output)
Example #22
def splitMergedElements(inputFilename, outputFilename=None):
    print >> sys.stderr, "##### Split elements with merged types #####"
    print >> sys.stderr, "Loading corpus", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {"entity": [0, 0], "interaction": [0, 0], "pair": [0, 0]}
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, countsByType)
    print >> sys.stderr, "Results"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ": removed", countsByType[k][0], "created", countsByType[k][1]

    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Example #23
def loadDocs(url, outDir, tempDir):
    inDir = Utils.Download.downloadAndExtract(url, tempDir, outDir + "/DDI11-original")[0]
    inDir = os.path.join(tempDir, inDir)
            
    print "Loading documents from", inDir
    sentences = {"positive":[], "negative":[]}
    docCounts = {}
    docById = {}
    documents = []
    for filename in sorted(os.listdir(inDir)):
        if filename.endswith(".xml"):
            print "Reading", filename,
            xml = ETUtils.ETFromObj(os.path.join(inDir, filename))
            for document in xml.getiterator("document"):
                counts = [0,0]          
                for sentence in document.findall("sentence"):
                    #sentence.set("origId", document.get("origId") + "." + sentence.get("origId"))
                    truePairs = False
                    for pair in sentence.findall("pair"):
                        if pair.get("interaction") == "true":
                            truePairs = True
                            break
                    if truePairs:
                        counts[0] += 1
                        sentences["positive"].append(sentence)
                    else:
                        counts[1] += 1
                        sentences["negative"].append(sentence)
                assert document.get("id") not in docCounts
                docCounts[document.get("id")] = counts
                docById[document.get("id")] = document
                documents.append(document)
                print counts,
                #print ETUtils.toStr(document)
            print
    print "Positive sentences:", len(sentences["positive"])
    print "Negative sentences:", len(sentences["negative"])
    return documents, docById, docCounts
Example #24
def loadDrugBank(filename, preTag="{http://drugbank.ca}", verbose=False):
    data = defaultdict(lambda : defaultdict(list))
    print "Loading DrugBank XML"
    xml = ETUtils.ETFromObj(filename)
    print "Processing DrugBank XML"
    root = xml.getroot()
    assert root.tag == preTag+"drugs", root.tag
    for drug in root.findall(preTag+"drug"):
        id = drug.find(preTag+"drugbank-id").text
        name = drug.find(preTag+"name").text
        if verbose: print id, name
        assert id not in data
        data[id]["name"] = name
        # TODO: Enzymes & targets
        # TODO: hydrophobicity
        getNestedItems(drug, "synonym", data[id], preTag)
        getNestedItems(drug, "brand", data[id], preTag)
        getNestedItems(drug, "group", data[id], preTag)
        getNestedItems(drug, "category", data[id], preTag, "categories")
        interactions = drug.find(preTag+"drug-interactions").findall(preTag+"drug-interaction")
        for interaction in interactions:
            data[id]["interaction"].append( [interaction.find(preTag+"drug").text, None, interaction.find(preTag+"description").text,] )
    return data
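The returned structure is keyed by DrugBank id; "name" holds a single string while the other fields collect lists. A sketch of one entry, with an invented id:

    data = loadDrugBank("drugbank.xml")  # placeholder path
    entry = data["DB00001"]              # hypothetical id
    # entry["name"]        -> drug name string
    # entry["synonym"], entry["brand"], entry["group"], entry["category"] -> lists of strings
    # entry["interaction"] -> list of [partner drug text, None, description] triples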
Example #25
def getHeads(corpus):
    corpus = ETUtils.ETFromObj(corpus)
    headDict = {}
    headDict["None"] = {}
    for sentence in corpus.getiterator("sentence"):
        headOffsetStrings = set()
        for entity in sentence.findall("entity"):
            eType = entity.get("type")
            if not headDict.has_key(eType):
                headDict[eType] = {}
            eText = entity.get("text")
            headOffset = entity.get("headOffset")
            headOffsetStrings.add(headOffset)
            headOffset = Range.charOffsetToSingleTuple(headOffset)
            charOffset = Range.charOffsetToSingleTuple(
                entity.get("charOffset"))
            if headOffset == charOffset:
                if not headDict[eType].has_key(eText):
                    headDict[eType][eText] = 0
                headDict[eType][eText] += 1
            else:
                headText = eText[headOffset[0] - charOffset[0]:headOffset[1] - charOffset[0] + 1]
                if not headDict[eType].has_key(headText):
                    headDict[eType][headText] = 0
                headDict[eType][headText] += 1
        # Assumption: the tokens are the "token" descendants of the sentence element
        for token in sentence.getiterator("token"):
            if token.get("charOffset") not in headOffsetStrings:  # token is not the head of any entity
                headText = token.get("text")
                if not headDict["None"].has_key(headText):
                    headDict["None"][headText] = 0
                headDict["None"][headText] += 1

    return headDict
Example #26
def makeDDISubmissionFile(input, output):
    xml = ETUtils.ETFromObj(input)
    outFile = open(output, "wt")
    for sentence in xml.getiterator("sentence"):
        # First determine which pairs interact
        intMap = defaultdict(lambda:defaultdict(lambda:None))
        for interaction in sentence.findall("interaction"):
            # Make mapping both ways to discard edge directionality. This isn't actually needed,
            # since MultiEdgeExampleBuilder builds entity pairs in the same order as this function,
            # but shouldn't harm to include it and now it works regardless of pair direction.
            if interaction.get("type") != "neg":
                intMap[interaction.get("e1")][interaction.get("e2")] = interaction
                intMap[interaction.get("e2")][interaction.get("e1")] = interaction
        # Then write all pairs to the output file
        entities = sentence.findall("entity")
        for i in range(0, len(entities)-1):
            for j in range(i+1, len(entities)):
                eIId = entities[i].get("id")
                eJId = entities[j].get("id")
                outFile.write(eIId + "\t" + eJId + "\t")
                if intMap[eIId][eJId] != None:
                    outFile.write("1\n")
                else:
                    outFile.write("0\n")
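The nested index loop visits every unordered entity pair exactly once; itertools.combinations expresses the same traversal, shown here with invented entity ids:

    import itertools
    entityIds = ["d0.s0.e0", "d0.s0.e1", "d0.s0.e2"]  # hypothetical ids
    for eIId, eJId in itertools.combinations(entityIds, 2):
        print eIId + "\t" + eJId  # one candidate pair per line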
Example #27
def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()

    for eType in rules.keys():
        for attrRule in rules[eType].keys():
            rules[eType][attrRule] = rules[eType][attrRule].split("|")

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = defaultdict(int)
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Removed"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, "  " + k + ":", countsByType[k]

    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
Example #28
 def process(self,
             input,
             outDir,
             fromStep=None,
             toStep=None,
             omitSteps=None):
     self.initVariables(source=input, xml=input, outDir=outDir)
     self.enterState(self.STATE_TOOLCHAIN, [x[0] for x in self.steps],
                     fromStep, toStep, omitSteps)
     # Run the tools
     savedOutput = None  # Output from a previous step if "fromStep" is used
     for step in self.steps:
         if self.checkStep(step[0]):
             if savedOutput != None:  # A previous run of the program saved an intermediate file
                 self.xml = ETUtils.ETFromObj(savedOutput)
                 savedOutput = None
             stepArgs = copy.copy(step[2])  # make a copy of the arguments, to which i/o can be added
             stepArgs[step[4]["input"]] = self.xml  # the input
             if self.getIntermediateFilePath(step) != None:  # this step should save an intermediate file
                 stepArgs[step[4]["output"]] = self.getIntermediateFilePath(step)
             step[1](**stepArgs)  # call the tool
         elif self.getStepStatus(step[0]) == "BEFORE":  # this step was run earlier
             savedOutput = self.getIntermediateFilePath(step)
     # End state and return
     xml = self.xml  # state-specific member variable self.xml will be removed when exiting state
     self.exitState()
     if self.state == None:  # if the whole toolchain has finished, return the final product
         return xml
     else:
         return None
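The loop indexes each step as step[0] = name, step[1] = tool callable, step[2] = fixed keyword arguments and step[4] = a map naming the tool's input/output parameters. The layout below is inferred from that indexing, not taken from a definition shown here:

    def someTool(input=None, output=None, verbose=False):  # hypothetical tool
        return input
    # Assumed step tuple layout; step[3] is unused by the code above.
    step = ("EXAMPLE", someTool, {"verbose": True}, None, {"input": "input", "output": "output"})
    stepArgs = dict(step[2])
    stepArgs[step[4]["input"]] = "<xml object>"  # wire in the current XML
    step[1](**stepArgs)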
Example #29
def tokenize(input,
             output=None,
             tokenizationName="GeniaTagger-3.0.1",
             extraFields=[]):  #["base", "chunk", "NE"]):
    global geniaTaggerDir

    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    # Write text to input file
    workdir = tempfile.mkdtemp()
    infile = codecs.open(os.path.join(workdir, "tagger-input.txt"), "wt",
                         "utf-8")
    numCorpusSentences = 0
    for sentence in corpusRoot.getiterator("sentence"):
        infile.write(sentence.get("text") + "\n")
        numCorpusSentences += 1
    infile.close()

    # Run tagger
    cwd = os.getcwd()
    os.chdir(geniaTaggerDir)
    args = [geniaTaggerDir + "/geniatagger"]
    #args += [ "<", os.path.join(workdir, "tagger-input.txt")]
    #args += [ ">", os.path.join(workdir, "tagger-output.txt")]
    #subprocess.call(args,
    process = subprocess.Popen(
        args,
        stdin=codecs.open(os.path.join(workdir, "tagger-input.txt"), "rt",
                          "utf-8"),
        stdout=codecs.open(os.path.join(workdir, "tagger-output.txt"), "wt",
                           "utf-8"))
    waitForProcess(process, numCorpusSentences, True,
                   os.path.join(workdir, "tagger-output.txt"), "GeniaTagger",
                   "Tokenizing Sentences")
    os.chdir(cwd)

    # Read tokenization
    outfile = codecs.open(os.path.join(workdir, "tagger-output.txt"), "rt",
                          "utf-8")
    # Add output to sentences
    for sentence in corpusRoot.getiterator("sentence"):
        # Find or create container elements
        sentenceAnalyses = sentence.find("sentenceAnalyses")
        if sentenceAnalyses == None:
            sentenceAnalyses = ET.Element("sentenceAnalyses")
            sentence.append(sentenceAnalyses)
        tokenizations = sentenceAnalyses.find("tokenizations")
        if tokenizations == None:
            tokenizations = ET.Element("tokenizations")
            sentenceAnalyses.append(tokenizations)
        prevTokenizationIndex = 0
        for prevTokenization in tokenizations.findall("tokenization"):
            assert prevTokenization.get("tokenizer") != tokenizationName
            prevTokenizationIndex += 1
        tokenization = ET.Element("tokenization")
        tokenization.set("tokenizer", tokenizationName)
        tokenizations.insert(prevTokenizationIndex, tokenization)

        sText = sentence.get("text")
        start = 0
        tokenCount = 0
        line = outfile.readline()
        while line.strip() != "":
            # Add tokens
            splits = line.strip().split("\t")
            # Determine offsets
            cStart = sText.find(splits[0], start)
            if cStart == -1:
                if splits[0] == "``":
                    splits[0] = "\""
                if splits[0] == "''":
                    splits[0] = "\""
                cStart = sText.find(splits[0], start)
            assert cStart != -1, (sentence.get("id"), sText, line, tokenCount)
            cEnd = cStart + len(splits[0])
            start = cStart + len(splits[0])
            # Make element
            token = ET.Element("token")
            token.set("id", "gt_" + str(tokenCount + 1))
            token.set("text", splits[0])
            if "base" in extraFields:
                token.set("base", splits[1])
            token.set("POS", splits[2])
            if "chunk" in extraFields:
                token.set("chunk", splits[3])
            if "NE" in extraFields:
                token.set("NE", splits[4])
            token.set("charOffset",
                      str(cStart) + "-" + str(cEnd - 1))  # NOTE: check
            tokenization.append(token)
            tokenCount += 1
            line = outfile.readline()

    outfile.close()
    # Remove work directory
    shutil.rmtree(workdir)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
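Each tagger output line is read as tab-separated columns in the order word, base form, POS, chunk and NE, matching the splits[0..4] indexing above. Parsing one such line in isolation, with invented content:

    line = "phosphorylates\tphosphorylate\tVBZ\tB-VP\tO"  # hypothetical tagger output
    splits = line.strip().split("\t")
    print splits[0], splits[2]  # token text and POS tag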
Example #30
def interface(optionArgs=sys.argv[1:]):
    """
    The function to handle the command-line interface.
    """
    from optparse import OptionParser

    op = OptionParser(
        usage="%prog [options]\nGenia shared task specific unflattening.")
    op.add_option("-i",
                  "--infile",
                  dest="infile",
                  help="Input file (gifxml)",
                  metavar="FILE")
    op.add_option("-o",
                  "--outfile",
                  dest="outfile",
                  help="Output file (gifxml)",
                  metavar="FILE")
    op.add_option(
        "-p",
        "--perfect",
        dest="perfect",
        help="Process only those event which can be perfectly solved",
        action="store_true",
        default=False)
    op.add_option("-a",
                  "--parse",
                  dest="parse",
                  help="Parse to be used",
                  metavar="PARSE")
    op.add_option("-t",
                  "--tokens",
                  dest="tokens",
                  help="Tokens to be used",
                  metavar="TOKENS")
    (options, args) = op.parse_args(optionArgs)

    quit = False
    if not options.infile:
        print "Please specify the input file."
        quit = True
#    if not options.outfile:
#        print "Please specify the output file."
#        quit = True
    if not options.parse:
        print "Please specify the parse."
        quit = True
    if not options.tokens:
        print "Please specify the tokenisation."
        quit = True
    if quit:
        op.print_help()
        return False

    corpus = ETUtils.ETFromObj(options.infile)
    documents = corpus.getroot().findall('document')
    counter = ProgressCounter(len(documents), "Unflatten")
    for document in documents:
        counter.update(1, "Unflattening (" + document.get("id") + "): ")
        #sys.stderr.write("Unflattening document %s\n"%document.attrib['id'])
        unflattener = Unflattener(document, options.perfect, options.tokens,
                                  options.parse)
        #if len(unflattener.tokens) == 0:
        #    continue
        unflattener.analyse()
        unflattener.unflatten()
    #indent(corpus.getroot())
    if options.outfile:
        ETUtils.write(corpus, options.outfile)
    return corpus