def processParses(xml, splitTarget="McCC"): print >> sys.stderr, "Protein Name Splitting" #ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget) ProteinNameSplitter.mainFunc(xml, None, splitTarget, removeOld=True) print >> sys.stderr, "Head Detection" #xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True) xml = FindHeads.findHeads(xml, splitTarget, tokenization=None, output=None, removeExisting=True)
def processParses(xml, splitTarget="McCC"): print >> sys.stderr, "---------------", "Protein Name Splitting", "---------------" #ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget) ProteinNameSplitter.mainFunc(xml, None, splitTarget, removeOld=True) print >> sys.stderr, "---------------", "Head Detection", "---------------" #xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True) xml = FindHeads.findHeads(xml, splitTarget, tokenization=None, output=None, removeExisting=True)
def convert(datasets, outdir, corpusName):
    # Depends on CO-conversion
    bigfileName = corpusName + "-" + "-and-".join([x[0] for x in datasets])
    documents = []
    for pair in datasets:
        print >> sys.stderr, "Reading", pair[0], "set,",
        docs = ST.loadSet(pair[1], pair[0])
        print >> sys.stderr, len(docs), "documents"
        documents.extend(docs)
    print >> sys.stderr, "Converting to", bigfileName + "-documents.xml"
    xml = STConvert.toInteractionXML(documents, corpusName, bigfileName + "-documents.xml")
    print >> sys.stderr, "Making sentences"
    xml = Tools.GeniaSentenceSplitter.makeSentences(xml, bigfileName + "-sentences.xml")
    #print >> sys.stderr, "Copying parses"
    #parsePath = "/home/jari/biotext/BioNLP2011/data/CO/co-devel-and-train-and-test.xml"
    #InteractionXML.CopyParse.copyParse(bigfileName+"-sentences.xml", parsePath, bigfileName+"-copied-parses.xml", "split-McClosky", "split-McClosky")
    print >> sys.stderr, "Parsing"
    Tools.CharniakJohnsonParser.parse(bigfileName + "-sentences.xml", bigfileName + "-parsed.xml", tokenizationName=None, parseName="McClosky", requireEntities=False)
    print >> sys.stderr, "Stanford Conversion"
    Tools.StanfordParser.convertXML("McClosky", bigfileName + "-parsed.xml", bigfileName + "-stanford.xml")
    print >> sys.stderr, "Protein Name Splitting"
    splitterCommand = "python /home/jari/cvs_checkout/PPI_Learning/Analysers/ProteinNameSplitter.py -f " + bigfileName + "-stanford.xml" + " -o " + bigfileName + "-split.xml" + " -p " + "McClosky" + " -t " + "McClosky" + " -s split-McClosky" + " -n split-McClosky"
    subprocess.call(splitterCommand, shell=True)
    print >> sys.stderr, "Head Detection"
    xml = FindHeads.findHeads(bigfileName + "-split.xml", "split-McClosky", tokenization=None, output=bigfileName + ".xml", removeExisting=True)
    print >> sys.stderr, "Dividing into sets"
    InteractionXML.DivideSets.processCorpus(bigfileName + ".xml", outdir, corpusName + "-", ".xml", [("devel", "train")])
    if "devel" in [x[0] for x in datasets]:
        print >> sys.stderr, "Creating empty devel set"
        deletionRules = {"interaction": {}, "entity": {"isName": "False"}}
        InteractionXML.DeleteElements.processCorpus(corpusName + "-devel.xml", corpusName + "-devel-empty.xml", deletionRules)
    return xml
def convert(srFiles, xmlFileName, outdir, corpusName, idByNorText=False):
    print >> sys.stderr, "Loading Static Relations"
    events = {}
    for srFile in srFiles:
        readEventsFromSR(srFile[0], srFile[1], events, idByNorText=idByNorText)
    if xmlFileName != None:
        xmlEvents = {}
        dataSets = {}
        srTexts = {}  # original, unnormalized sentence texts from the SR corpus
        eventsToXML(events, xmlEvents, dataSets, srTexts)
        print >> sys.stderr, "Loading XML"
        xml = ETUtils.ETFromObj(xmlFileName)
        print >> sys.stderr, "Inserting XML events"
        insertEvents(xmlEvents, dataSets, srTexts, xml, corpusName)
        ETUtils.write(xml, outdir + corpusName + "-srevents.xml")
        # update pre-existing parses
        print >> sys.stderr, "Head Detection"
        xml = FindHeads.findHeads(xml, "split-mccc-preparsed", tokenization=None, output=outdir + corpusName + "-heads.xml", removeExisting=True)
        print >> sys.stderr, "Dividing into sets"
        InteractionXML.DivideSets.processCorpus(xml, outdir, corpusName + "-", ".xml", [("devel", "train")])
        print >> sys.stderr, "Converting back"
        STConvert.toSTFormat(outdir + corpusName + "-devel.xml", outdir + corpusName + "-stformat-devel", outputTag="rel", task=2, debug=True, validate=False)
        STConvert.toSTFormat(outdir + corpusName + "-train.xml", outdir + corpusName + "-stformat-train", outputTag="rel", task=2, debug=True, validate=False)
    else:
        xml = eventsToNewXML(events)
        xmlTree = ET.ElementTree(xml)
        ETUtils.write(xml, outdir + corpusName + "-srevents.xml")
        xml = xmlTree
        # Parse
        bigfileName = outdir + corpusName
        print >> sys.stderr, "Parsing"
        Tools.CharniakJohnsonParser.parse(xml, bigfileName + "-parsed.xml", tokenizationName="PARSED_TEXT", parseName="McClosky", requireEntities=True, timeout=60)
        print >> sys.stderr, "Stanford Conversion"
        Tools.StanfordParser.convertXML("McClosky", xml, bigfileName + "-stanford.xml")
        print >> sys.stderr, "Protein Name Splitting"
        splitTarget = "McClosky"
        xml = ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-" + splitTarget, "split-" + splitTarget)
        print >> sys.stderr, "Head Detection"
        xml = FindHeads.findHeads(xml, "split-McClosky", tokenization=None, output=bigfileName + ".xml", removeExisting=True)
        print >> sys.stderr, "Dividing into sets"
        InteractionXML.DivideSets.processCorpus(xml, outdir, "SRNE-", ".xml")
def processParses(corpusName, xml, splitTarget="mccc-preparsed"): if corpusName != "BI": print >> sys.stderr, "Protein Name Splitting" #splitterCommand = "python /home/jari/cvs_checkout/JariSandbox/GeniaChallenge/formatConversion/ProteinNameSplitter.py -f " + bigfileName+"-sentences.xml" + " -o " + bigfileName+"-split.xml" + " -p " + splitTarget + " -t " + splitTarget + " -s split-"+splitTarget + " -n split-"+splitTarget #subprocess.call(splitterCommand, shell=True) ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-" + splitTarget, "split-" + splitTarget) print >> sys.stderr, "Head Detection" xml = FindHeads.findHeads(xml, "split-" + splitTarget, tokenization=None, output=None, removeExisting=True) else: ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-" + splitTarget, "split-" + splitTarget) print >> sys.stderr, "Head Detection" xml = FindHeads.findHeads(xml, "split-" + splitTarget, tokenization=None, output=None, removeExisting=True)
def convert(datasets, outdir, corpusName):
    bigfileName = corpusName + "-" + "-and-".join([x[0] for x in datasets])
    documents = []
    for pair in datasets:
        print >> sys.stderr, "Reading", pair[0], "set,",
        docs = ST.loadSet(pair[1], pair[0], "a1")
        print >> sys.stderr, len(docs), "documents"
        documents.extend(docs)
    print >> sys.stderr, "Converting to", bigfileName + "-documents.xml"
    xml = STConvert.toInteractionXML(documents, corpusName, bigfileName + "-documents.xml")
    print >> sys.stderr, "Making sentences"
    xml = Tools.GeniaSentenceSplitter.makeSentences(xml, bigfileName + "-sentences.xml", postProcess=False)
    print >> sys.stderr, "Parsing"
    Tools.CharniakJohnsonParser.parse(bigfileName + "-sentences.xml", bigfileName + "-parsed.xml", tokenizationName=None, parseName="McClosky", requireEntities=False)
    print >> sys.stderr, "Stanford Conversion"
    Tools.StanfordParser.convertXML("McClosky", bigfileName + "-parsed.xml", bigfileName + "-stanford.xml")
    print >> sys.stderr, "Protein Name Splitting"
    splitterCommand = "python /home/jari/cvs_checkout/PPI_Learning/Analysers/ProteinNameSplitter.py -f " + bigfileName + "-stanford.xml" + " -o " + bigfileName + "-split.xml" + " -p " + "McClosky" + " -t " + "McClosky" + " -s split-McClosky" + " -n split-McClosky"
    subprocess.call(splitterCommand, shell=True)
    print >> sys.stderr, "Head Detection"
    xml = FindHeads.findHeads(bigfileName + "-split.xml", "split-McClosky", tokenization=None, output=bigfileName + ".xml", removeExisting=True)
    print >> sys.stderr, "Dividing into sets"
    InteractionXML.DivideSets.processCorpus(bigfileName + ".xml", outdir, corpusName + "-", ".xml", [("devel", "train")])
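# Usage sketch (hypothetical): the convert functions above expect `datasets` as
# (setName, path) pairs, where pair[0] names the set ("train", "devel", ...) and
# pair[1] is the directory handed to ST.loadSet. The paths and corpus name below
# are placeholders.
if __name__ == "__main__":
    datasets = [("train", "/path/to/BI-train"),  # placeholder paths
                ("devel", "/path/to/BI-devel")]
    convert(datasets, "/path/to/output-dir", "BI")  # placeholder output directory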
def convertDDI(outDir, downloadDir=None, redownload=False, makeIntermediateFiles=True, debug=False):
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI11-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="
    corpusDir = outDir + "/DDI11-original"
    Utils.Download.downloadAndExtract(Settings.URL["DDI11_CORPUS"], corpusDir, downloadDir)
    bigfileName = os.path.join(outDir, "DDI11")
    #oldXML = ETUtils.ETFromObj(bigfileName+".xml")
    trainUnified = corpusDir + "/train"
    trainMTMX = corpusDir + "/train_MTMX"
    testUnified = corpusDir + "/test"
    testMTMX = corpusDir + "/test_MTMX"
    # Load main documents
    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    documents, docById, docCounts = loadDocs(trainUnified)
    # Divide training data into a train and devel set
    sortedDocCounts = sorted(docCounts.iteritems(), key=lambda (k, v): (v, k), reverse=True)
    datasetCounts = {"train": [0, 0], "devel": [0, 0], "test": [0, 0]}
    for i in range(0, len(sortedDocCounts) - 3, 4):
        for j in [0, 1]:
            docById[sortedDocCounts[i + j][0]].set("set", "train")
            datasetCounts["train"][0] += sortedDocCounts[i + j][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i + j][1][1]
        docById[sortedDocCounts[i + 2][0]].set("set", "train") #docById[sortedDocCounts[i+2][0]].set("set", "devel")
        docById[sortedDocCounts[i + 3][0]].set("set", "devel") #docById[sortedDocCounts[i+3][0]].set("set", "test")
        datasetCounts["train"][0] += sortedDocCounts[i + 2][1][0] #datasetCounts["devel"][0] += sortedDocCounts[i+2][1][0]
        datasetCounts["train"][1] += sortedDocCounts[i + 2][1][1] #datasetCounts["devel"][1] += sortedDocCounts[i+2][1][1]
        datasetCounts["devel"][0] += sortedDocCounts[i + 3][1][0] #datasetCounts["test"][0] += sortedDocCounts[i+3][1][0]
        datasetCounts["devel"][1] += sortedDocCounts[i + 3][1][1] #datasetCounts["test"][1] += sortedDocCounts[i+3][1][1]
    for document in documents: # leftover documents not covered by the 4-way split go to the train set
        if document.get("set") == None:
            document.set("set", "train")
    # Print division results
    print >> sys.stderr, datasetCounts
    for key in datasetCounts.keys():
        if datasetCounts[key][1] != 0:
            print key, datasetCounts[key][0] / float(datasetCounts[key][1])
        else:
            print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])
    # Some of the train and test ids overlap. Let's change the train set ids,
    # because the test set ones are needed for the final evaluation.
    changeIdCount = 1000
    for trainId in ['DrugDDI.d312', 'DrugDDI.d316', 'DrugDDI.d332', 'DrugDDI.d334', 'DrugDDI.d337', 'DrugDDI.d342',
                    'DrugDDI.d349', 'DrugDDI.d354', 'DrugDDI.d373', 'DrugDDI.d379', 'DrugDDI.d383', 'DrugDDI.d388',
                    'DrugDDI.d392', 'DrugDDI.d396', 'DrugDDI.d398', 'DrugDDI.d409', 'DrugDDI.d411', 'DrugDDI.d415',
                    'DrugDDI.d425', 'DrugDDI.d430', 'DrugDDI.d433', 'DrugDDI.d448', 'DrugDDI.d450', 'DrugDDI.d452',
                    'DrugDDI.d462', 'DrugDDI.d467', 'DrugDDI.d470', 'DrugDDI.d474', 'DrugDDI.d480', 'DrugDDI.d482',
                    'DrugDDI.d485', 'DrugDDI.d492', 'DrugDDI.d494', 'DrugDDI.d496', 'DrugDDI.d498', 'DrugDDI.d500',
                    'DrugDDI.d503', 'DrugDDI.d506', 'DrugDDI.d518', 'DrugDDI.d523', 'DrugDDI.d528', 'DrugDDI.d535',
                    'DrugDDI.d539', 'DrugDDI.d552', 'DrugDDI.d554', 'DrugDDI.d558', 'DrugDDI.d561', 'DrugDDI.d570',
                    'DrugDDI.d578']:
        newId = "DrugDDI.d" + str(changeIdCount)
        print >> sys.stderr, "Changing train/devel id", trainId, "to", newId
        for element in docById[trainId].getiterator():
            for attrName, attrValue in element.attrib.iteritems():
                if trainId in attrValue:
                    element.set(attrName, attrValue.replace(trainId, newId))
        docById[newId] = docById[trainId]
        del docById[trainId]
        changeIdCount += 1
    # If test set exists, load it, too
    if testUnified != None:
        testDocuments, testDocById, testDocCounts = loadDocs(testUnified)
        for document in testDocuments:
            document.set("set", "test")
        documents = documents + testDocuments
        overlappingIds = []
        for key in docById:
            if key in testDocById:
                overlappingIds.append(key)
        for key in docById:
            assert key not in testDocById, (key, docById[key].get("origId"), testDocById[key].get("origId"), sorted(docById.keys()), sorted(testDocById.keys()), sorted(overlappingIds))
        docById.update(testDocById)
    # Add all documents into one XML
    xmlTree = ET.ElementTree(ET.Element("corpus"))
    root = xmlTree.getroot()
    root.set("source", "DDI11")
    for document in documents:
        root.append(document)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
    xml = xmlTree
    print >> sys.stderr, "Fixing DDI XML"
    fixEntities(xml)
    convertToInteractions(xml)
    # Add MTMX
    if trainMTMX != None:
        inDir = Utils.Download.getTopDir(tempdir, Utils.Download.downloadAndExtract(trainMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if testMTMX != None:
        inDir = Utils.Download.getTopDir(tempdir, Utils.Download.downloadAndExtract(testMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents.xml")
    print >> sys.stderr, "---------------", "Inserting TEES-generated analyses", "---------------"
    Utils.Download.downloadAndExtract(Settings.URL["TEES_PARSES"], os.path.join(Settings.DATAPATH, "TEES-parses"), downloadDir, redownload=redownload)
    extractedFilename = os.path.join(Settings.DATAPATH, "TEES-parses") + "/DDI11"
    print >> sys.stderr, "Making sentences"
    Tools.SentenceSplitter.makeSentences(xml, extractedFilename, None)
    print >> sys.stderr, "Inserting McCC parses"
    Tools.BLLIPParser.insertParses(xml, extractedFilename, None, extraAttributes={"source": "TEES-preparsed"})
    print >> sys.stderr, "Inserting Stanford conversions"
    Tools.StanfordParser.insertParses(xml, extractedFilename, None, extraAttributes={"stanfordSource": "TEES-preparsed"})
    print >> sys.stderr, "Protein Name Splitting"
    splitTarget = "McCC"
    #ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    ProteinNameSplitter.mainFunc(xml, None, splitTarget, removeOld=True)
    print >> sys.stderr, "Head Detection"
    #xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True)
    xml = FindHeads.findHeads(xml, splitTarget, tokenization=None, output=None, removeExisting=True)
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outDir, "DDI11", ".xml")
    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
def run(input, output, eventDir, parse="split-mccc-preparsed", verbose=False): print >> sys.stderr, "Loading corpus", input corpusTree = ETUtils.ETFromObj(input) print >> sys.stderr, "Corpus file loaded" corpusRoot = corpusTree.getroot() counts = defaultdict(int) for document in corpusRoot.findall("document"): sentDict = None pmid = document.get("pmid") isPMC = False for sentence in document.findall("sentence"): counts["sentences"] += 1 sentenceId = str(sentence.get("id")) + "/" + str(sentence.get("origId")) if verbose: print "Processing", sentenceId if sentDict == None: if sentence.get("origId") != None: assert pmid == None sentDict = loadEventXML( eventDir + "/" + sentence.get("origId").split(".")[0] + ".xml" , verbose=verbose) else: #pmid = sentence.get("pmid") assert pmid != None if pmid.startswith("PMC"): isPMC = True sentDict = {} else: assert pmid.startswith("PMID") sentDict = loadEventXML( eventDir + "/" + pmid.split("-", 1)[-1] + ".xml" , verbose=verbose) interactionXMLText = sentence.get("text") if not sentDict.has_key(interactionXMLText): counts["missing-sentences"] += 1 if isPMC: counts["missing-sentences-PMC"] += 1 if verbose: print "Missing sentence:", pmid, (sentenceId, sentDict, sentence.get("text")) else: sentenceAnalyses = sentence.find("sentenceanalyses") if sentenceAnalyses != None: sentence.remove(sentenceAnalyses) entityIdCount = IDUtils.getNextFreeId(sentence.findall("entity")) events = sentDict[interactionXMLText] events.sort() for event in events: if not keepEvent(event[2]): counts["filtered-triggers"] += 1 continue trigger = ET.Element("entity") trigger.set("isName", "False") trigger.set("charOffset", str(event[0]) + "-" + str(event[1])) trigger.set("type", str(event[2])) trigger.set("text", str(event[3])) trigger.set("source", "GENIA_event_annotation_0.9") trigger.set("id", sentence.get("id") + ".e" + str(entityIdCount)) entityIdCount += 1 counts["added-triggers"] += 1 sentence.append(trigger) if sentenceAnalyses != None: sentence.append(sentenceAnalyses) FindHeads.findHeads(corpusTree, parse, removeExisting=False) removeDuplicates(corpusRoot) print counts if output != None: print >> sys.stderr, "Writing output to", output ETUtils.write(corpusRoot, output) return corpusTree
def run(input, output, eventDir, parse="split-mccc-preparsed", verbose=False): print >> sys.stderr, "Loading corpus", input corpusTree = ETUtils.ETFromObj(input) print >> sys.stderr, "Corpus file loaded" corpusRoot = corpusTree.getroot() counts = defaultdict(int) for document in corpusRoot.findall("document"): sentDict = None pmid = document.get("pmid") isPMC = False for sentence in document.findall("sentence"): counts["sentences"] += 1 sentenceId = str(sentence.get("id")) + "/" + str( sentence.get("origId")) if verbose: print "Processing", sentenceId if sentDict == None: if sentence.get("origId") != None: assert pmid == None sentDict = loadEventXML( eventDir + "/" + sentence.get("origId").split(".")[0] + ".xml", verbose=verbose) else: #pmid = sentence.get("pmid") assert pmid != None if pmid.startswith("PMC"): isPMC = True sentDict = {} else: assert pmid.startswith("PMID") sentDict = loadEventXML( eventDir + "/" + pmid.split("-", 1)[-1] + ".xml", verbose=verbose) interactionXMLText = sentence.get("text") if not sentDict.has_key(interactionXMLText): counts["missing-sentences"] += 1 if isPMC: counts["missing-sentences-PMC"] += 1 if verbose: print "Missing sentence:", pmid, (sentenceId, sentDict, sentence.get("text")) else: sentenceAnalyses = sentence.find("sentenceanalyses") if sentenceAnalyses != None: sentence.remove(sentenceAnalyses) entityIdCount = IDUtils.getNextFreeId( sentence.findall("entity")) events = sentDict[interactionXMLText] events.sort() for event in events: if not keepEvent(event[2]): counts["filtered-triggers"] += 1 continue trigger = ET.Element("entity") trigger.set("isName", "False") trigger.set("charOffset", str(event[0]) + "-" + str(event[1])) trigger.set("type", str(event[2])) trigger.set("text", str(event[3])) trigger.set("source", "GENIA_event_annotation_0.9") trigger.set("id", sentence.get("id") + ".e" + str(entityIdCount)) entityIdCount += 1 counts["added-triggers"] += 1 sentence.append(trigger) if sentenceAnalyses != None: sentence.append(sentenceAnalyses) FindHeads.findHeads(corpusTree, parse, removeExisting=False) removeDuplicates(corpusRoot) print counts if output != None: print >> sys.stderr, "Writing output to", output ETUtils.write(corpusRoot, output) return corpusTree
def convertDDI(outDir, trainUnified=None, trainMTMX=None, testUnified=None, testMTMX=None, downloadDir=None, redownload=False, makeIntermediateFiles=True, debug=False):
    cwd = os.getcwd()
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="
    bigfileName = os.path.join(outDir, "DDI")
    #oldXML = ETUtils.ETFromObj(bigfileName+".xml")
    if trainUnified == None:
        trainUnified = Settings.URL["DDI_TRAIN_UNIFIED"]
    if trainMTMX == None:
        trainMTMX = Settings.URL["DDI_TRAIN_MTMX"]
    if testUnified == None:
        testUnified = Settings.URL["DDI_TEST_UNIFIED"]
    if testMTMX == None:
        testMTMX = Settings.URL["DDI_TEST_MTMX"]
    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    if True:
        documents, docById, docCounts = loadDocs(trainUnified, outDir, tempdir)
        sortedDocCounts = sorted(docCounts.iteritems(), key=lambda (k, v): (v, k), reverse=True)
        datasetCounts = {"train": [0, 0], "devel": [0, 0], "test": [0, 0]}
        for i in range(0, len(sortedDocCounts) - 3, 4):
            for j in [0, 1]:
                docById[sortedDocCounts[i + j][0]].set("set", "train")
                datasetCounts["train"][0] += sortedDocCounts[i + j][1][0]
                datasetCounts["train"][1] += sortedDocCounts[i + j][1][1]
            docById[sortedDocCounts[i + 2][0]].set("set", "train") #docById[sortedDocCounts[i+2][0]].set("set", "devel")
            docById[sortedDocCounts[i + 3][0]].set("set", "devel") #docById[sortedDocCounts[i+3][0]].set("set", "test")
            datasetCounts["train"][0] += sortedDocCounts[i + 2][1][0] #datasetCounts["devel"][0] += sortedDocCounts[i+2][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i + 2][1][1] #datasetCounts["devel"][1] += sortedDocCounts[i+2][1][1]
            datasetCounts["devel"][0] += sortedDocCounts[i + 3][1][0] #datasetCounts["test"][0] += sortedDocCounts[i+3][1][0]
            datasetCounts["devel"][1] += sortedDocCounts[i + 3][1][1] #datasetCounts["test"][1] += sortedDocCounts[i+3][1][1]
        for document in documents: # leftover documents not covered by the 4-way split go to the train set
            if document.get("set") == None:
                document.set("set", "train")
        print datasetCounts
        for key in datasetCounts.keys():
            if datasetCounts[key][1] != 0:
                print key, datasetCounts[key][0] / float(datasetCounts[key][1])
            else:
                print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])
        if testUnified != None:
            testDocuments, testDocById, testDocCounts = loadDocs(testUnified, tempdir)
            for document in testDocuments:
                document.set("set", "test")
            documents = documents + testDocuments
        xmlTree = ET.ElementTree(ET.Element("corpus"))
        root = xmlTree.getroot()
        root.set("source", "DrugDDI")
        for document in documents:
            root.append(document)
        if makeIntermediateFiles:
            ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
        xml = xmlTree
        print >> sys.stderr, "Fixing DDI XML"
        fixEntities(xml)
        convertToInteractions(xml)
        if makeIntermediateFiles:
            ETUtils.write(root, bigfileName + "-documents.xml")
        #sys.exit()
    if False:
        print >> sys.stderr, "Parsing"
        Tools.CharniakJohnsonParser.parse(xml, bigfileName + "-parsed.xml", tokenizationName=None, parseName="McClosky", requireEntities=True, timeout=10)
        print >> sys.stderr, "Stanford Conversion"
        Tools.StanfordParser.convertXML("McClosky", xml, bigfileName + "-stanford.xml")
    #if True:
        #xml = bigfileName + "-stanford.xml"
    print >> sys.stderr, "Protein Name Splitting"
    splitTarget = "McClosky"
    xml = ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-" + splitTarget, "split-" + splitTarget)
    print >> sys.stderr, "Head Detection"
    xml = FindHeads.findHeads(xml, "split-McClosky", tokenization=None, output=bigfileName + ".xml", removeExisting=True)
    print >> sys.stderr, "Dividing into sets"
    InteractionXML.DivideSets.processCorpus(xml, outDir, "DrugDDI-", ".xml", [("devel", "train", "test"), ("devel", "train")])
    #InteractionXML.DivideSets.processCorpus(oldXML, outDir, "DrugDDI-", ".xml", [("devel", "train", "test"), ("devel", "train")])
    #InteractionXML.DivideSets.processCorpus(bigfileName+".xml", outDir, "DrugDDI-", ".xml", [("devel", "train", "test"), ("devel", "train")])
    #if "devel" in [x[0] for x in datasets]:
    #    print >> sys.stderr, "Creating empty devel set"
    #    deletionRules = {"interaction":{},"entity":{"isName":"False"}}
    #    InteractionXML.DeleteElements.processCorpus(corpusName + "-devel.xml", corpusName + "-devel-empty.xml", deletionRules)
    #return xml
    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)