def addMTMX(input, mtmxDir, output):
    from collections import defaultdict
    # read interaction XML
    print "Reading interaction XML"
    counts = defaultdict(int)
    xml = ETUtils.ETFromObj(input).getroot()
    docById = {}
    for document in xml.getiterator("document"):
        docId = document.get("origId")
        assert docId not in docById
        docById[docId] = document
        counts["document"] += 1
    for entity in xml.getiterator("entity"):
        counts["entity"] += 1
    # read MTMX files
    print "Processing MTMX"
    for filename in sorted(os.listdir(mtmxDir)):
        if filename.endswith(".xml"):
            print filename,
            fileId = filename.split("_")[0]
            if fileId not in docById:
                print "skipped"
                continue
            else:
                print "processing"
            doc = docById[fileId]
            entityByOrigId = {}
            for entity in doc.getiterator("entity"):
                assert entity.get("origId") not in entityByOrigId, entity.get("origId")
                entityByOrigId[entity.get("origId")] = entity
            mtmx = ETUtils.ETFromObj(os.path.join(mtmxDir, filename)).getroot()
            for phrase in mtmx.getiterator("PHRASE"):
                if phrase.get("ID") in entityByOrigId:
                    entity = entityByOrigId[phrase.get("ID")]
                    mapCount = 0
                    for map in phrase.getiterator("MAP"):
                        if (map.get("NAME").lower() == entity.get("text").lower()) or (map.get("NAME_SHORT").lower() == entity.get("text").lower()):
                            if entity.get("mtmxProb") != None: # the entity already has a mapping
                                if int(entity.get("mtmxProb")) > int(map.get("PROB")):
                                    break # keep the existing, higher-probability mapping
                                else:
                                    counts["mapped-multi"] += 1
                                    counts["mapped-multi-" + str(mapCount)] += 1
                                    #print filename, phrase.get("ID")
                            else:
                                counts["mapped-at-least-once"] += 1
                            entity.set("mtmxProb", str(map.get("PROB")))
                            entity.set("mtmxCui", str(map.get("CUI")))
                            entity.set("mtmxName", str(map.get("NAME")))
                            entity.set("mtmxNameShort", str(map.get("NAME_SHORT")))
                            entity.set("mtmxSemTypes", str(map.get("SEMTYPES")))
                            counts["mappings"] += 1
                            mapCount += 1
    print counts
    ETUtils.write(xml, output)
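# Usage sketch for addMTMX (the paths are hypothetical): annotate an
# interaction-XML corpus with MTMX concept mappings read from a directory of
# per-document "<origId>_*.xml" phrase files, then write the result out.
#
#     addMTMX("corpus.xml", "mtmx-phrases/", "corpus-with-mtmx.xml")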
def loadEventXML(path, verbose=False):
    xml = ETUtils.ETFromObj(path)
    sentDict = {}
    for sentence in xml.getiterator("sentence"):
        sentenceText = getText(sentence).strip()
        if not sentDict.has_key(sentenceText):
            sentDict[sentenceText] = []
    for event in xml.getiterator("event"):
        sentenceText = getText(event).strip()
        if not sentDict.has_key(sentenceText):
            sentDict[sentenceText] = []
        events = sentDict[sentenceText]
        clue = event.find("clue")
        clueTuple = getClue(clue)
        eventType = event.find("type").get("class")
        if eventType == "Protein_amino_acid_phosphorylation":
            eventType = "Phosphorylation"
        if type(clueTuple) == types.StringType:
            if verbose:
                print "Event", eventType, "clue with no clueType:", ETUtils.toStr(clue)
        else:
            assert sentenceText[clueTuple[1]:clueTuple[2] + 1] == clueTuple[0], (sentenceText, sentenceText[clueTuple[1]:clueTuple[2] + 1], clueTuple)
            event = (clueTuple[1], clueTuple[2], eventType, clueTuple[0])
            if event not in events:
                events.append(event)
    return sentDict
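# loadEventXML returns a dictionary keyed by sentence text, where each value is
# a list of (clueStart, clueEnd, eventType, clueText) tuples. A sketch of the
# expected shape (the sentence and offsets are illustrative only):
#
#     {"MAPK1 phosphorylates ERK.": [(6, 19, "Phosphorylation", "phosphorylates")]}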
def convert(input, output=None, outputRoot=None):
    print >> sys.stderr, "##### Convert PMC to Interaction XML #####"
    print >> sys.stderr, "Loading corpus", input
    pmcTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    pmcRoot = pmcTree.getroot()

    includeElements = ["front", "article-meta", "title-group", "article-title", "abstract", "body", "sec", "p", "title"]
    collapseElements = ["front", "article-meta", "title-group", "p"]

    if outputRoot == None:
        outputRoot = ET.Element("corpus")
        outputRoot.set("source", "PMC")
    outputRoot.append(addElements(pmcRoot, includeElements, collapseElements))

    outputTree = ET.ElementTree(outputRoot)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(outputTree, output)
    return outputTree
def run(cls, inFile, multiplier=1.0, outFile=None, targetLabel="neg", binary=False):
    """inFile can be a string with a file name (.xml or .xml.gz), an ElementTree,
    an Element, or an open input stream.
    multiplier adjusts the level of boosting of the non-negative predictions;
    it is a real number in (0, inf). A multiplier of 1.0 does nothing, < 1.0
    decreases negative class confidence, and > 1.0 increases negative class
    confidence.
    The root of the modified tree is returned and, if outFile is a string,
    also written out to outFile."""
    print >> sys.stderr, "##### Recall adjust with multiplier " + str(multiplier)[:5] + " #####"
    tree = ETUtils.ETFromObj(inFile)
    if not ET.iselement(tree):
        assert isinstance(tree, ET.ElementTree)
        root = tree.getroot()
    else:
        root = tree

    if multiplier != -1:
        if binary:
            print >> sys.stderr, "Recall binary mode"
            classRanges = getClassRanges(root.getiterator("entity"))
            assert len(classRanges.keys()) in [0, 2]
            if len(classRanges.keys()) == 0:
                print >> sys.stderr, "Warning, recall adjustment skipped because no prediction weights found"
        else:
            print >> sys.stderr, "Recall multiclass mode"
            classRanges = None
        for entityNode in root.getiterator("entity"):
            adjustEntity(entityNode, targetLabel, multiplier, classRanges)
    if outFile:
        ETUtils.write(root, outFile)
    return tree
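# Usage sketch (hypothetical file names; the enclosing class name is assumed):
# scale the "neg" class confidence by 0.8 to boost recall of the non-negative
# classes, writing the adjusted predictions back out.
#
#     RecallAdjust.run("classified.xml", multiplier=0.8, outFile="adjusted.xml")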
def convert(self, input, dataSetNames=None, corpusName=None, output=None):
    if os.path.isdir(input) or input.endswith(".tar.gz") or "," in input:
        print >> sys.stderr, "Converting ST-format to Interaction XML"
        dataSetDirs = input
        documents = []
        if type(dataSetDirs) in types.StringTypes:
            dataSetDirs = dataSetDirs.split(",")
        if dataSetNames == None:
            dataSetNames = []
        elif type(dataSetNames) in types.StringTypes:
            dataSetNames = dataSetNames.split(",")
        for dataSetDir, dataSetName in itertools.izip_longest(dataSetDirs, dataSetNames, fillvalue=None):
            print >> sys.stderr, "Reading", dataSetDir, "set,",
            docs = STFormat.STTools.loadSet(dataSetDir, dataSetName)
            print >> sys.stderr, len(docs), "documents"
            documents.extend(docs)
        print >> sys.stderr, "Resolving equivalences"
        STFormat.Equiv.process(documents)
        self.xml = STFormat.ConvertXML.toInteractionXML(documents, self.intermediateFileTag, output)
    else:
        print >> sys.stderr, "Processing source as interaction XML"
        self.xml = ETUtils.ETFromObj(input)
    return self.xml
def processCorpus(input, attrs=["text"]):
    print attrs
    print >> sys.stderr, "Loading corpus file", input
    corpusRoot = ETUtils.ETFromObj(input).getroot()
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {}
    interactors = {}
    for document in documents:
        entDict = {}
        for entity in document.getiterator("entity"):
            entDict[entity.get("id")] = entity
        for interaction in document.getiterator("interaction"):
            e1 = entDict[interaction.get("e1")]
            e2 = entDict[interaction.get("e2")]
            # form identifier tuples
            e1Tuple = []
            for attr in attrs:
                e1Tuple.append(e1.get(attr))
            e1Tuple = tuple(e1Tuple)
            e2Tuple = []
            for attr in attrs:
                e2Tuple.append(e2.get(attr))
            e2Tuple = tuple(e2Tuple)
            interactors = [e1Tuple, e2Tuple]
            #interactors.sort()
            print interactors
def removeUnconnectedEntities(input, output=None):
    input = ETUtils.ETFromObj(input)
    root = input.getroot()
    removed = 0
    preserved = 0
    for document in root.findall("document"):
        sentMap = {} # allow for intersentence interactions
        for sentence in document.findall("sentence"):
            sentMap[sentence.get("id")] = sentence
        connected = set()
        for interaction in document.getiterator("interaction"):
            connected.add(interaction.get("e1"))
            connected.add(interaction.get("e2"))
        entities = []
        for entity in document.getiterator("entity"):
            entities.append(entity)
        for entity in entities:
            if entity.get("isName") == "True": # never remove named entities
                continue
            eId = entity.get("id")
            if eId not in connected:
                if eId.find(".s") != -1: # sentence level entity
                    sentMap[eId.rsplit(".", 1)[0]].remove(entity)
                else: # document level entity
                    document.remove(entity)
                removed += 1
            else:
                preserved += 1
    print >> sys.stderr, "Removed", removed, "entities, preserved", preserved, "entities"
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(root, output)
    return input
def makeSubset(filename, output, ratio, seed):
    if ratio == 1.0:
        return filename
    totalFolds = 100
    selectedFolds = int(ratio * 100.0)
    print >> sys.stderr, "====== Making subset ======"
    print >> sys.stderr, "Subset for file", filename, "ratio", ratio, "seed", seed
    import cElementTreeUtils as ETUtils
    import Core.Split
    xml = ETUtils.ETFromObj(filename).getroot()
    count = 0
    sentCount = 0
    for document in xml.findall("document"):
        sentCount += len(document.findall("sentence"))
        count += 1
    division = Core.Split.getFolds(count, totalFolds, seed)
    #print division, selectedFolds - 1
    index = 0
    removeCount = 0
    sentRemoveCount = 0
    for document in xml.findall("document"):
        if division[index] > selectedFolds - 1:
            xml.remove(document)
            sentRemoveCount += len(document.findall("sentence"))
            removeCount += 1
        index += 1
    print "Subset", "doc:", count, "removed:", removeCount, "sent:", sentCount, "sentremoved:", sentRemoveCount
    ETUtils.write(xml, output)
    return output
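# The subset arithmetic above in brief: ratio 0.25 keeps 25 of the 100
# pseudo-random folds produced by Core.Split.getFolds, so every document whose
# fold index is 25 or higher is removed. A hypothetical call:
#
#     makeSubset("train.xml", "train-quarter.xml", 0.25, seed=1)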
def fixAltOffsets(input, output=None):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    docCount = 0
    sentencesCreated = 0
    sentences = [x for x in corpusRoot.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "FixAltOffsets")
    fixCount = 0
    # fix spans
    for sentence in sentences:
        counter.update(1, "Fixing AltOffsets for sentence (" + sentence.get("id") + "): ")
        sentOffset = Range.charOffsetToSingleTuple(sentence.get("charOffset"))
        for entity in sentence.findall("entity"):
            altOffsetString = entity.get("altOffset")
            if altOffsetString == None:
                continue
            #print altOffsetString
            altOffsets = Range.charOffsetToTuples(altOffsetString)
            assert len(altOffsets) == 1
            for i in range(len(altOffsets)):
                altOffset = altOffsets[i]
                altOffsets[i] = (altOffset[0] - sentOffset[0], altOffset[1] - sentOffset[0])
            entity.set("altOffset", Range.tuplesToCharOffset(altOffsets))
            fixCount += 1
    print >> sys.stderr, "Fixed", fixCount, "altOffsets"

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def run(cls, fileIn, fileOut=None, tokenization="split-Charniak-Lease", entityOffsetKey="charOffset", includeNeg=False, stem=False):
    """Builds the master gazetteer.
    fileIn: a string (ending with .xml or .xml.gz), an open input stream, an ElementTree or an Element
    fileOut: a string or None. If given, the resulting gazetteer will be written out
    tokenization: name of the tokenization to be used

    Produces a dictionary with...
    """
    print >> sys.stderr, "Building gazetteer"
    gztr = {} # key: token, value: dictionary (key: className, value: count)
    root = ETUtils.ETFromObj(fileIn)
    if not ET.iselement(root):
        assert isinstance(root, ET.ElementTree)
        root = root.getroot()
    sentences = []
    for sNode in root.getiterator("sentence"):
        sentences.append(sNode)
    counter = ProgressCounter(len(sentences), "Build gazetteer")
    for sNode in sentences:
        counter.update(1, "Adding to gazetteer sentence " + sNode.get("id") + ", ")
        for tokenizationNode in sNode.getiterator("tokenization"):
            if tokenizationNode.get("tokenizer") == tokenization:
                break
        else:
            assert False, "Did not find %s tokenization" % tokenization
        tClasses = tokClasses(tokenizationNode, sNode, entityOffsetKey)
        assert len(tClasses) == len(tokenizationNode)
        for tokIdx, tokNode in enumerate(tokenizationNode):
            gsClass = tClasses[tokIdx]
            b, e = charOffStr2tuple(tokNode.get("charOffset"))
            tokNodeTxt = tokTxt(b, e, sNode, stem).lower()
            tokDict = gztr.setdefault(tokNodeTxt, {})
            tokDict[gsClass] = tokDict.get(gsClass, 0) + 1
            # for multi-part texts, add collapsed and last token versions
            if tokNodeTxt.find("-") != -1:
                # collapsed
                text = tokNodeTxt.replace("-", "")
                if text != "":
                    tokDict = gztr.setdefault(text, {})
                    tokDict[gsClass] = tokDict.get(gsClass, 0) + 1
                # last part
                text = tokNodeTxt.rsplit("-", 1)[-1]
                if text != "":
                    tokDict = gztr.setdefault(text, {})
                    tokDict[gsClass] = tokDict.get(gsClass, 0) + 1
    if fileOut:
        Gazetteer.saveGztr(gztr, fileOut, includeNeg)
    return gztr
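# Usage sketch (hypothetical names; the enclosing class name is assumed): build
# a gazetteer from an annotated corpus and save it. The returned dictionary
# maps each lowercased token to its class counts, e.g. (counts illustrative)
# {"kinase": {"Protein": 12, "neg": 3}}.
#
#     gztr = GazetteerBuilder.run("train.xml", fileOut="gazetteer.txt")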
def removeHeads(corpus):
    print >> sys.stderr, "Removing existing head offsets"
    removeCount = 0
    xml = ETUtils.ETFromObj(corpus)
    for d in xml.getroot().findall("document"):
        for s in d.findall("sentence"):
            for e in s.findall("entity"):
                if e.get("headOffset") != None:
                    removeCount += 1
                    del e.attrib["headOffset"]
    print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
    return [0, removeCount]
def interface(optionArgs=sys.argv[1:]):
    """
    The function to handle the command-line interface.
    """
    from optparse import OptionParser

    op = OptionParser(usage="%prog [options]\nGenia shared task specific pruning of invalid nodes and edges.")
    op.add_option("-i", "--infile",
                  dest="infile",
                  help="Input file (gifxml)",
                  metavar="FILE")
    op.add_option("-o", "--outfile",
                  dest="outfile",
                  help="Output file (gifxml)",
                  metavar="FILE")
    op.add_option("-c", "--cycles",
                  dest="cycles",
                  help="Remove cycles (requires the presence of 'predictions' attribute in 'interaction' elements)",
                  default=False,
                  action="store_true")
    (options, args) = op.parse_args(optionArgs)

    quit = False
    if not options.infile:
        print "Please specify the input file."
        quit = True
#    if not options.outfile:
#        print "Please specify the output file."
#        quit = True
    if quit:
        op.print_help()
        return False

    corpus = ETUtils.ETFromObj(options.infile)
    cycleBrokenCount = 0
    skipCount = 0
    for document in corpus.getroot().findall('document'):
        for sentence in document.findall("sentence"):
            #sys.stderr.write("Pruning document %s\n" % document.attrib['id'])
            pruner = Pruner(sentence)
            pruner.analyse()
            if options.cycles:
                cycleBrokenCount += pruner.analyseCycles()
            pruner.prune()
    sys.stderr.write("File pruned, broke " + str(cycleBrokenCount) + " cycles\n")
    if skipCount > 0:
        sys.stderr.write("Pruning skipped " + str(skipCount) + " sentences\n")
    if options.outfile:
        ETUtils.write(corpus, options.outfile)
    return corpus
def negateEvents(input, output=None, verbose=False):
    if not (ET.iselement(input) and input.tag == "sentence"):
        print >> sys.stderr, "Loading corpus file", input
        corpusTree = ETUtils.ETFromObj(input)
        corpusRoot = corpusTree.getroot()

    if not (ET.iselement(input) and input.tag == "sentence"):
        sentences = corpusRoot.getiterator("sentence")
    else:
        sentences = [input]
    counts = defaultdict(int)
    for sentence in sentences:
        for entity in sentence.findall("entity"):
            counts["all-entities"] += 1
            eType = entity.get("type")
            if not isNegatableEPITrigger(eType):
                counts["out-of-scope"] += 1
                continue
            eBaseType = getEPIBaseType(eType)
            eText = entity.get("text").lower()
            eNewType = determineNewType(eType, eText)
            # Insert changed charOffset
            counts["entities"] += 1
            if verbose:
                print "Entity", entity.get("id"), [entity.get("text")], [eType, eBaseType, eNewType],
            if eNewType != eBaseType:
                counts["negated"] += 1
                if verbose:
                    print "NEGATED",
            if eNewType == eType:
                counts["correct"] += 1
                if verbose:
                    print "CORRECT"
            else:
                counts["incorrect"] += 1
                if eNewType == eBaseType:
                    counts["incorrect-pos"] += 1
                else:
                    counts["incorrect-neg"] += 1
                if verbose:
                    print "INCORRECT"
            entity.set("type", eNewType)
    if verbose:
        print counts

    if not (ET.iselement(input) and input.tag == "sentence"):
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusRoot, output)
        return corpusTree
def getEmptyCorpus(xml, deletionRules=None):
    """
    A convenience function for getting an empty corpus, useful for testing
    for information leaks in the event extraction process.
    """
    if type(xml) in types.StringTypes:
        # XML is read from disk, so it's a new copy and can be safely modified
        xml = ETUtils.ETFromObj(xml)
    else:
        # XML is already an object in memory. To prevent problems with other users of it, a copy
        # is created before deleting elements.
        xml = copy.deepcopy(xml)
    if deletionRules == None: # use default rules for BioNLP Shared Task
        # We remove all interactions, and all entities that are not named entities. This leaves only
        # the gold standard protein/gene names
        deletionRules = {"interaction": {}, "entity": {"isName": "False"}}
    # Remove elements and return the emptied XML
    return processCorpus(xml, None, deletionRules)
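# Custom deletion rules follow the same shape as the defaults above: element
# tag -> {attribute: required value}. For example, a hypothetical variant that
# drops only interactions and keeps every entity:
#
#     emptied = getEmptyCorpus(xml, deletionRules={"interaction": {}})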
def getTriggers(corpus):
    """
    Returns a dictionary of "entity type" -> "entity text" -> "count"
    """
    corpus = ETUtils.ETFromObj(corpus)
    trigDict = {}
    for entity in corpus.getroot().getiterator("entity"):
        if entity.get("isName") == "True":
            continue
        eType = entity.get("type")
        if not trigDict.has_key(eType):
            trigDict[eType] = {}
        eText = entity.get("text")
        eText = PorterStemmer.stem(eText)
        if not trigDict[eType].has_key(eText):
            trigDict[eType][eText] = 0
        trigDict[eType][eText] += 1
    return trigDict
def loadCorpus(filename, parse=None, tokenization=None, removeIntersentenceInteractions=True, removeNameInfo=False):
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import cElementTree as ET
    import sys, gzip

    if type(filename) == types.StringType:
        print >> sys.stderr, "Loading corpus file", filename
    corpusTree = ETUtils.ETFromObj(filename)
    corpusRoot = corpusTree.getroot()
    return CorpusElements(corpusRoot, parse, tokenization, removeIntersentenceInteractions, corpusTree, removeNameInfo)
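# Usage sketch (hypothetical parse and tokenization names): load a corpus file
# into a CorpusElements wrapper for a specific analysis pair.
#
#     corpus = loadCorpus("devel.xml", parse="McClosky", tokenization="split-McClosky")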
def convert(srFiles, xmlFileName, outdir, corpusName, idByNorText=False):
    print >> sys.stderr, "Loading Static Relations"
    events = {}
    for srFile in srFiles:
        readEventsFromSR(srFile[0], srFile[1], events, idByNorText=idByNorText)

    if xmlFileName != None:
        xmlEvents = {}
        dataSets = {}
        srTexts = {} # original, unnormalized sentence texts from the SR corpus
        eventsToXML(events, xmlEvents, dataSets, srTexts)

        print >> sys.stderr, "Loading XML"
        xml = ETUtils.ETFromObj(xmlFileName)
        print >> sys.stderr, "Inserting XML events"
        insertEvents(xmlEvents, dataSets, srTexts, xml, corpusName)
        ETUtils.write(xml, outdir + corpusName + "-srevents.xml")
        # update pre-existing parses
        print >> sys.stderr, "Head Detection"
        xml = FindHeads.findHeads(xml, "split-mccc-preparsed", tokenization=None, output=outdir + corpusName + "-heads.xml", removeExisting=True)
        print >> sys.stderr, "Dividing into sets"
        InteractionXML.DivideSets.processCorpus(xml, outdir, corpusName + "-", ".xml", [("devel", "train")])
        print >> sys.stderr, "Converting back"
        STConvert.toSTFormat(outdir + corpusName + "-devel.xml", outdir + corpusName + "-stformat-devel", outputTag="rel", task=2, debug=True, validate=False)
        STConvert.toSTFormat(outdir + corpusName + "-train.xml", outdir + corpusName + "-stformat-train", outputTag="rel", task=2, debug=True, validate=False)
    else:
        xml = eventsToNewXML(events)
        xmlTree = ET.ElementTree(xml)
        ETUtils.write(xml, outdir + corpusName + "-srevents.xml")
        xml = xmlTree
        # Parse
        bigfileName = outdir + corpusName
        print >> sys.stderr, "Parsing"
        Tools.CharniakJohnsonParser.parse(xml, bigfileName + "-parsed.xml", tokenizationName="PARSED_TEXT", parseName="McClosky", requireEntities=True, timeout=60)
        print >> sys.stderr, "Stanford Conversion"
        Tools.StanfordParser.convertXML("McClosky", xml, bigfileName + "-stanford.xml")
        print >> sys.stderr, "Protein Name Splitting"
        splitTarget = "McClosky"
        xml = ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-" + splitTarget, "split-" + splitTarget)
        print >> sys.stderr, "Head Detection"
        xml = FindHeads.findHeads(xml, "split-McClosky", tokenization=None, output=bigfileName + ".xml", removeExisting=True)
        print >> sys.stderr, "Dividing into sets"
        InteractionXML.DivideSets.processCorpus(xml, outdir, "SRNE-", ".xml")
def parseStats(input):
    print >> sys.stderr, "Loading input file", input
    inputTree = ETUtils.ETFromObj(input)
    inputRoot = inputTree.getroot()
    counts = defaultdict(int)
    for sentence in inputRoot.getiterator("sentence"):
        counts["sentence"] += 1
        analysesElement = sentence.find("sentenceanalyses")
        if analysesElement == None:
            counts["sentence-no-analyses"] += 1
            continue
        # Create parses element (if needed)
        parsesElement = analysesElement.find("parses")
        if parsesElement == None:
            counts["sentence-no-parses"] += 1
            continue
        # Loop through parses
        for parseElement in parsesElement:
            parserName = parseElement.get("parser")
            counts["parse:" + parserName] += 1
            if parseElement.get("pennstring") in ["", None]:
                counts["parse:" + parserName + "(no penn)"] += 1
            if len(parseElement.findall("dependency")) == 0:
                counts["parse:" + parserName + "(no dependencies)"] += 1
            if len(parseElement.findall("phrase")) == 0:
                counts["parse:" + parserName + "(no phrases)"] += 1
        # Tokenizations
        tokenizationsElement = analysesElement.find("tokenizations")
        if tokenizationsElement == None:
            counts["sentence-no-tokenizations"] += 1
            continue
        # Loop through tokenizations
        for tokenizationElement in tokenizationsElement:
            tokenizerName = tokenizationElement.get("tokenizer")
            counts["tokenization:" + tokenizerName] += 1
            if len(tokenizationElement.findall("token")) == 0:
                counts["tokenization:" + tokenizerName + "(no tokens)"] += 1
    print >> sys.stderr, "Parse statistics for", input
    for key in sorted(counts.keys()):
        print >> sys.stderr, " ", key + ":", counts[key]
def findHeads(corpus, stringsFrom, methods, parse, tokenization):
    for m in methods:
        assert m in ["REMOVE", "SYNTAX", "DICT"]
    corpus = ETUtils.ETFromObj(corpus)
    counts = {}
    for method in methods:
        print >> sys.stderr, method, "pass"
        if method == "REMOVE":
            counts[method] = removeHeads(corpus)
        elif method == "DICT":
            counts[method] = findHeadsDictionary(corpus, stringsFrom, parse, tokenization)
        elif method == "SYNTAX":
            counts[method] = findHeadsSyntactic(corpus, parse, tokenization)
        print >> sys.stderr, method, "pass added", counts[method][0], "and removed", counts[method][1], "heads"
    print >> sys.stderr, "Summary (pass/added/removed):"
    for method in methods:
        print >> sys.stderr, " ", method, "/", counts[method][0], "/", counts[method][1]
def mixSets(input, output, docOrigIds, sourceSet, targetSet):
    print >> sys.stderr, "Mixing Sets", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    if docOrigIds != None:
        for document in corpusRoot.getiterator("document"):
            if document.get("pmid") in docOrigIds:
                assert document.get("set") == sourceSet
                document.set("set", targetSet)
                docOrigIds.remove(document.get("pmid"))
        assert len(docOrigIds) == 0, docOrigIds

    sentenceIds = None
    if sentenceIds != None:
        for document in corpusRoot.getiterator("document"):
            removed = []
            for sentence in document.findall("sentence"):
                assert document.get("set") == sourceSet
                sentenceId = sentence.get("id")
                if sentenceId in sentenceIds:
                    document.remove(sentence)
                    removed.append(sentence)
                    sentenceIds.remove(sentenceId)
            if len(removed) > 0:
                newDoc = ET.Element("document")
                for attr in document.attrib:
                    newDoc.set(attr, document.get(attr))
                newDoc.set("id", None)
                newDoc.set("set", targetSet)
                for sentence in removed:
                    newDoc.append(sentence)
                corpusRoot.append(newDoc)
        assert len(sentenceIds) == 0, sentenceIds

    RecalculateIds.recalculateIds(corpusTree, onlyWithinSentence=False)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def processEquivs(input, output):
    tree = ETUtils.ETFromObj(input)
    root = tree.getroot()
    entitiesById = getElementsById("entity", root)
    sentencesById = getElementsById("sentence", root)
    interactions = list(root.getiterator("interaction")) + list(root.getiterator("pair"))

    # map entities to the interactions they participate in
    interactionsByEntity = {}
    for interaction in interactions:
        # Equivs are not interactions that we want to duplicate
        if interaction.get("type") == "Equiv":
            continue
        # outgoing
        e1 = interaction.get("e1")
        if not interactionsByEntity.has_key(e1):
            interactionsByEntity[e1] = []
        interactionsByEntity[e1].append(interaction)
        # incoming
        e2 = interaction.get("e2")
        if not interactionsByEntity.has_key(e2):
            interactionsByEntity[e2] = []
        interactionsByEntity[e2].append(interaction)

    count = 0
    for interaction in interactions:
        print >> sys.stderr, "Processing interaction", count, "out of", len(interactions)
        if interaction.get("type") == "Equiv":
            e1 = entitiesById[interaction.get("e1")]
            e2 = entitiesById[interaction.get("e2")]
            duplicateFlat(e2, e1, entitiesById, sentencesById, interactionsByEntity)
            # remove equiv
            sentenceId = interaction.get("id").rsplit(".", 1)[0]
            sentencesById[sentenceId].remove(interaction)
        count += 1
    ETUtils.writeUTF8(root, output)
def splitMergedElements(inputFilename, outputFilename=None):
    print >> sys.stderr, "##### Split elements with merged types #####"
    print >> sys.stderr, "Loading corpus", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = {"entity": [0, 0], "interaction": [0, 0], "pair": [0, 0]}
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, countsByType)
    print >> sys.stderr, "Results"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, " " + k + ": removed", countsByType[k][0], "created", countsByType[k][1]

    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
def loadDocs(url, outDir, tempDir):
    inDir = Utils.Download.downloadAndExtract(url, tempDir, outDir + "/DDI11-original")[0]
    inDir = os.path.join(tempDir, inDir)

    print "Loading documents from", inDir
    sentences = {"positive": [], "negative": []}
    docCounts = {}
    docById = {}
    documents = []
    for filename in sorted(os.listdir(inDir)):
        if filename.endswith(".xml"):
            print "Reading", filename,
            xml = ETUtils.ETFromObj(os.path.join(inDir, filename))
            for document in xml.getiterator("document"):
                counts = [0, 0]
                for sentence in document.findall("sentence"):
                    #sentence.set("document.get("origId") + "." + sentence.get("origId"))
                    truePairs = False
                    for pair in sentence.findall("pair"):
                        if pair.get("interaction") == "true":
                            truePairs = True
                            break
                    if truePairs:
                        counts[0] += 1
                        sentences["positive"].append(sentence)
                    else:
                        counts[1] += 1
                        sentences["negative"].append(sentence)
                assert document.get("id") not in docCounts
                docCounts[document.get("id")] = counts
                docById[document.get("id")] = document
                documents.append(document)
                print counts,
                #print ETUtils.toStr(document)
            print
    print "Positive sentences:", len(sentences["positive"])
    print "Negative sentences:", len(sentences["negative"])
    return documents, docById, docCounts
def loadDrugBank(filename, preTag="{http://drugbank.ca}", verbose=False):
    data = defaultdict(lambda: defaultdict(list))
    print "Loading DrugBank XML"
    xml = ETUtils.ETFromObj(filename)
    print "Processing DrugBank XML"
    root = xml.getroot()
    assert root.tag == preTag + "drugs", root.tag
    for drug in root.findall(preTag + "drug"):
        id = drug.find(preTag + "drugbank-id").text
        name = drug.find(preTag + "name").text
        if verbose:
            print id, name
        assert id not in data
        data[id]["name"] = name
        # TODO: Enzymes & targets
        # TODO: hydrophobicity
        getNestedItems(drug, "synonym", data[id], preTag)
        getNestedItems(drug, "brand", data[id], preTag)
        getNestedItems(drug, "group", data[id], preTag)
        getNestedItems(drug, "category", data[id], preTag, "categories")
        interactions = drug.find(preTag + "drug-interactions").findall(preTag + "drug-interaction")
        for interaction in interactions:
            data[id]["interaction"].append([interaction.find(preTag + "drug").text, None, interaction.find(preTag + "description").text])
    return data
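# The returned structure is a nested dictionary: drug id -> field -> value(s).
# A sketch of the expected shape (ids and values illustrative):
#
#     data["DB00001"]["name"]        -> "Lepirudin"
#     data["DB00001"]["synonym"]     -> ["Hirudin variant-1", ...]
#     data["DB00001"]["interaction"] -> [[interactingDrugId, None, descriptionText], ...]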
def getHeads(corpus):
    corpus = ETUtils.ETFromObj(corpus)
    headDict = {}
    headDict["None"] = {}
    for sentence in corpus.getiterator("sentence"):
        sentenceText = sentence.get("text") # assumption: this binding was missing in the original
        headOffsetStrings = set()
        for entity in sentence.findall("entity"):
            eType = entity.get("type")
            if not headDict.has_key(eType):
                headDict[eType] = {}
            eText = entity.get("text")
            headOffset = entity.get("headOffset")
            headOffsetStrings.add(headOffset)
            headOffset = Range.charOffsetToSingleTuple(headOffset)
            charOffset = Range.charOffsetToSingleTuple(entity.get("charOffset"))
            if headOffset == charOffset:
                if not headDict[eType].has_key(eText):
                    headDict[eType][eText] = 0
                headDict[eType][eText] += 1
            else:
                headText = sentenceText[headOffset[0] - charOffset[0]:headOffset[1] - charOffset[0] + 1]
                if not headDict[eType].has_key(headText):
                    headDict[eType][headText] = 0
                headDict[eType][headText] += 1
        # Count tokens that are not the head of any entity. The original did not
        # define "tokens"; iterating over all token elements below the sentence
        # is an assumption.
        tokens = sentence.getiterator("token")
        for token in tokens:
            if not token.get("charOffset") in headOffsetStrings: # token is not the head of any entity
                headText = token.get("text")
                if not headDict["None"].has_key(headText):
                    headDict["None"][headText] = 0
                headDict["None"][headText] += 1
    return headDict
def makeDDISubmissionFile(input, output):
    xml = ETUtils.ETFromObj(input)
    outFile = open(output, "wt")
    for sentence in xml.getiterator("sentence"):
        # First determine which pairs interact
        intMap = defaultdict(lambda: defaultdict(lambda: None))
        for interaction in sentence.findall("interaction"):
            # Make mapping both ways to discard edge directionality. This isn't actually needed,
            # since MultiEdgeExampleBuilder builds entity pairs in the same order as this function,
            # but it shouldn't harm to include it, and now it works regardless of pair direction.
            if interaction.get("type") != "neg":
                intMap[interaction.get("e1")][interaction.get("e2")] = interaction
                intMap[interaction.get("e2")][interaction.get("e1")] = interaction
        # Then write all pairs to the output file
        entities = sentence.findall("entity")
        for i in range(0, len(entities) - 1):
            for j in range(i + 1, len(entities)):
                eIId = entities[i].get("id")
                eJId = entities[j].get("id")
                outFile.write(eIId + "\t" + eJId + "\t")
                if intMap[eIId][eJId] != None:
                    outFile.write("1\n")
                else:
                    outFile.write("0\n")
    outFile.close()
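# The submission file is tab-separated with one candidate entity pair per line,
# "<e1 id>\t<e2 id>\t<1|0>", where 1 marks a predicted interaction. A
# hypothetical call:
#
#     makeDDISubmissionFile("devel-predicted.xml", "ddi-submission.txt")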
def processCorpus(inputFilename, outputFilename, rules):
    print >> sys.stderr, "Loading corpus file", inputFilename
    corpusTree = ETUtils.ETFromObj(inputFilename)
    corpusRoot = corpusTree.getroot()

    for eType in rules.keys():
        for attrRule in rules[eType].keys():
            rules[eType][attrRule] = rules[eType][attrRule].split("|")

    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    countsByType = defaultdict(int)
    for document in documents:
        counter.update()
        for sentence in document.findall("sentence"):
            processSentence(sentence, rules, countsByType)
    print >> sys.stderr, "Removed"
    for k in sorted(countsByType.keys()):
        print >> sys.stderr, " " + k + ":", countsByType[k]

    if outputFilename != None:
        print >> sys.stderr, "Writing output to", outputFilename
        ETUtils.write(corpusRoot, outputFilename)
    return corpusTree
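# The rules argument maps element tags to attribute filters; "|" separates
# alternative attribute values and is split before matching. A hypothetical
# call that removes speculated entities and all "neg" interactions:
#
#     processCorpus("in.xml", "out.xml",
#                   {"entity": {"speculation": "True"}, "interaction": {"type": "neg"}})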
def process(self, input, outDir, fromStep=None, toStep=None, omitSteps=None):
    self.initVariables(source=input, xml=input, outDir=outDir)
    self.enterState(self.STATE_TOOLCHAIN, [x[0] for x in self.steps], fromStep, toStep, omitSteps)
    # Run the tools
    savedOutput = None # Output from a previous step if "fromStep" is used
    for step in self.steps:
        if self.checkStep(step[0]):
            if savedOutput != None: # A previous run of the program saved an intermediate file
                self.xml = ETUtils.ETFromObj(savedOutput)
                savedOutput = None
            stepArgs = copy.copy(step[2]) # make a copy of the arguments to which i/o can be added
            stepArgs[step[4]["input"]] = self.xml # the input
            if self.getIntermediateFilePath(step) != None: # this step should save an intermediate file
                stepArgs[step[4]["output"]] = self.getIntermediateFilePath(step)
            step[1](**stepArgs) # call the tool
        elif self.getStepStatus(step[0]) == "BEFORE": # this step was run earlier
            savedOutput = self.getIntermediateFilePath(step)
    # End state and return
    xml = self.xml # state-specific member variable self.xml will be removed when exiting state
    self.exitState()
    if self.state == None: # if the whole toolchain has finished, return the final product
        return xml
    else:
        return None
def tokenize(input, output=None, tokenizationName="GeniaTagger-3.0.1", extraFields=[]): #["base", "chunk", "NE"]
    global geniaTaggerDir

    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    # Write text to input file
    workdir = tempfile.mkdtemp()
    infile = codecs.open(os.path.join(workdir, "tagger-input.txt"), "wt", "utf-8")
    numCorpusSentences = 0
    for sentence in corpusRoot.getiterator("sentence"):
        infile.write(sentence.get("text") + "\n")
        numCorpusSentences += 1
    infile.close()

    # Run tagger
    cwd = os.getcwd()
    os.chdir(geniaTaggerDir)
    args = [geniaTaggerDir + "/geniatagger"]
    #args += ["<", os.path.join(workdir, "tagger-input.txt")]
    #args += [">", os.path.join(workdir, "tagger-output.txt")]
    #subprocess.call(args,
    process = subprocess.Popen(args,
                               stdin=codecs.open(os.path.join(workdir, "tagger-input.txt"), "rt", "utf-8"),
                               stdout=codecs.open(os.path.join(workdir, "tagger-output.txt"), "wt", "utf-8"))
    waitForProcess(process, numCorpusSentences, True, os.path.join(workdir, "tagger-output.txt"), "GeniaTagger", "Tokenizing Sentences")
    os.chdir(cwd)

    # Read tokenization
    outfile = codecs.open(os.path.join(workdir, "tagger-output.txt"), "rt", "utf-8")
    # Add output to sentences
    for sentence in corpusRoot.getiterator("sentence"):
        # Find or create container elements
        sentenceAnalyses = sentence.find("sentenceAnalyses")
        if sentenceAnalyses == None:
            sentenceAnalyses = ET.Element("sentenceAnalyses")
            sentence.append(sentenceAnalyses)
        tokenizations = sentenceAnalyses.find("tokenizations")
        if tokenizations == None:
            tokenizations = ET.Element("tokenizations")
            sentenceAnalyses.append(tokenizations)
        prevTokenizationIndex = 0
        for prevTokenization in tokenizations.findall("tokenization"):
            assert prevTokenization.get("tokenizer") != tokenizationName
            prevTokenizationIndex += 1
        tokenization = ET.Element("tokenization")
        tokenization.set("tokenizer", tokenizationName)
        tokenizations.insert(prevTokenizationIndex, tokenization)

        sText = sentence.get("text")
        start = 0
        tokenCount = 0
        line = outfile.readline()
        while line.strip() != "":
            # Add tokens
            splits = line.strip().split("\t")
            # Determine offsets
            cStart = sText.find(splits[0], start)
            if cStart == -1:
                if splits[0] == "``":
                    splits[0] = "\""
                if splits[0] == "''":
                    splits[0] = "\""
                cStart = sText.find(splits[0], start)
            assert cStart != -1, (sentence.get("id"), sText, line, tokenCount)
            cEnd = cStart + len(splits[0])
            start = cStart + len(splits[0])
            # Make element
            token = ET.Element("token")
            token.set("id", "gt_" + str(tokenCount + 1))
            token.set("text", splits[0])
            if "base" in extraFields:
                token.set("base", splits[1])
            token.set("POS", splits[2])
            if "chunk" in extraFields:
                token.set("chunk", splits[3])
            if "NE" in extraFields:
                token.set("NE", splits[4])
            token.set("charOffset", str(cStart) + "-" + str(cEnd - 1)) # NOTE: check
            tokenization.append(token)
            tokenCount += 1
            line = outfile.readline()
    outfile.close()
    # Remove work directory
    shutil.rmtree(workdir)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def interface(optionArgs=sys.argv[1:]):
    """
    The function to handle the command-line interface.
    """
    from optparse import OptionParser

    op = OptionParser(usage="%prog [options]\nGenia shared task specific unflattening.")
    op.add_option("-i", "--infile",
                  dest="infile",
                  help="Input file (gifxml)",
                  metavar="FILE")
    op.add_option("-o", "--outfile",
                  dest="outfile",
                  help="Output file (gifxml)",
                  metavar="FILE")
    op.add_option("-p", "--perfect",
                  dest="perfect",
                  help="Process only those events which can be perfectly solved",
                  action="store_true",
                  default=False)
    op.add_option("-a", "--parse",
                  dest="parse",
                  help="Parse to be used",
                  metavar="PARSE")
    op.add_option("-t", "--tokens",
                  dest="tokens",
                  help="Tokens to be used",
                  metavar="TOKENS")
    (options, args) = op.parse_args(optionArgs)

    quit = False
    if not options.infile:
        print "Please specify the input file."
        quit = True
#    if not options.outfile:
#        print "Please specify the output file."
#        quit = True
    if not options.parse:
        print "Please specify the parse."
        quit = True
    if not options.tokens:
        print "Please specify the tokenisation."
        quit = True
    if quit:
        op.print_help()
        return False

    corpus = ETUtils.ETFromObj(options.infile)
    documents = corpus.getroot().findall('document')
    counter = ProgressCounter(len(documents), "Unflatten")
    for document in documents:
        counter.update(1, "Unflattening (" + document.get("id") + "): ")
        #sys.stderr.write("Unflattening document %s\n" % document.attrib['id'])
        unflattener = Unflattener(document, options.perfect, options.tokens, options.parse)
        #if len(unflattener.tokens) == 0:
        #    continue
        unflattener.analyse()
        unflattener.unflatten()
    #indent(corpus.getroot())
    if options.outfile:
        ETUtils.write(corpus, options.outfile)
    return corpus