def convertCorpus(corpus, outDir=None, downloadDir=None, redownload=False, makeIntermediateFiles=True, evaluate=False, processEquiv=True, analysisMode="INSERT", debug=False, preprocessorSteps=None, preprocessorParameters=None, logPath=None):
    """
    Convert a single BioNLP Shared Task corpus into Interaction XML.

    @param corpus: corpus identifier; "BB13T2"/"BB13T3" select a BB'13 task subdirectory
    @param outDir: output directory; also used to derive the "AUTO" log path
    @param downloadDir: directory for downloaded corpus packages (passed to downloadCorpus)
    @param redownload: force re-download of the corpus package
    @param makeIntermediateFiles: forwarded to convertDownloaded (presumably controls saving intermediate files — confirm there)
    @param evaluate: forwarded to convertDownloaded
    @param processEquiv: forwarded to convertDownloaded
    @param analysisMode: one of AUTO/INSERT/BUILD/SKIP, how parses/analyses are obtained
    @param logPath: "AUTO" logs to outDir/conversion/<corpus>-conversion-log.txt, None disables logging
    @return: the converted corpus XML returned by convertDownloaded
    """
    global bioNLP13AnalysesTempDir
    print >> sys.stderr, "==========", "Converting BioNLP Shared Task", corpus, "corpus", "=========="
    assert analysisMode in ("AUTO", "INSERT", "BUILD", "SKIP")
    # Resolve the log path; "AUTO" needs outDir to derive a location
    if logPath == "AUTO":
        if outDir != None:
            logPath = outDir + "/conversion/" + corpus + "-conversion-log.txt"
        else:
            logPath = None
    if logPath:
        Stream.openLog(logPath)
    downloaded = downloadCorpus(corpus, outDir, downloadDir, redownload)
    # The BB'13 subtasks live in subdirectories of the downloaded package
    packageSubPath = None
    if corpus == "BB13T2":
        packageSubPath = "task_2"
    elif corpus == "BB13T3":
        packageSubPath = "task_3"
    xml = convertDownloaded(outDir, corpus, downloaded, makeIntermediateFiles, evaluate, processEquiv=processEquiv, analysisMode=analysisMode, packageSubPath=packageSubPath, debug=debug, preprocessorSteps=preprocessorSteps, preprocessorParameters=preprocessorParameters)
    if logPath != None:
        Stream.closeLog(logPath)
    # Clean up the module-level temporary directory possibly created while inserting analyses
    if bioNLP13AnalysesTempDir != None:
        shutil.rmtree(bioNLP13AnalysesTempDir)
        bioNLP13AnalysesTempDir = None
    return xml
def classify(input, model, output, workDir=None, step=None, omitSteps=None, goldInput=None, detector=None, debug=False, clear=False, preprocessorTag="-preprocessed.xml.gz", preprocessorParams=None, bioNLPSTParams=None):
    """
    Detect events or relations from text.

    @param input: The input file in either interaction XML or BioNLP ST format. Can also be a PMID or TEES default corpus name.
    @param model: A path to a model file or the name of a TEES default model.
    @param output: The output file stem. Output files will be of the form output-*
    @param workDir: If intermediate files need to be saved, they will go here.
    @param step: A step=substep pair, where the steps are PREPROCESS and CLASSIFY
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param goldInput: a version of the corpus file with gold annotation. Enables measuring of performance
    @param detector: a Detector object, or a string defining one to be imported. If None, will be read from model.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param clear: Remove existing workDir
    @param preprocessorTag: preprocessor output file will be output + preprocessorTag
    @param preprocessorParams: Optional parameters controlling preprocessing. If None, will be read from model.
    @param bioNLPSTParams: Optional parameters controlling BioNLP ST format output. If None, will be read from model.
    """
    # Normalize all user-supplied paths before any chdir happens in workdir()
    input = os.path.abspath(input)
    if goldInput != None: goldInput = os.path.abspath(goldInput)
    if model != None: model = os.path.abspath(model)
    # Initialize working directory
    if workDir != None: # use a permanent work directory
        workdir(workDir, clear)
    Stream.openLog(output + "-log.txt") # log in the output directory
    # Get input files
    input, preprocess = getInput(input)
    model = getModel(model)
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["PREPROCESS", "CLASSIFY"])
    if not preprocess:
        selector.markOmitSteps("PREPROCESS")
    classifyInput = input
    if selector.check("PREPROCESS"):
        preprocessor = Preprocessor()
        preprocessorOutput = output + preprocessorTag
        #preprocessor.debug = debug
        #preprocessor.source = input # This has to be defined already here, needs to be fixed later
        #preprocessor.requireEntitiesForParsing = True # parse only sentences which contain named entities
        # Reuse an existing preprocessed file unless a fresh run was requested
        if os.path.exists(preprocessorOutput) and not clear: #os.path.exists(preprocessor.getOutputPath("FIND-HEADS")):
            #print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "exists, skipping preprocessing."
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "exists, skipping preprocessing."
            classifyInput = preprocessorOutput # preprocessor.getOutputPath("FIND-HEADS")
        else:
            #print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "does not exist"
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "does not exist"
            print >> sys.stderr, "------------ Preprocessing ------------"
            # Remove some of the unnecessary intermediate files
            #preprocessor.setIntermediateFiles({"Convert":None, "SPLIT-SENTENCES":None, "PARSE":None, "CONVERT-PARSE":None, "SPLIT-NAMES":None})
            # Process input into interaction XML
            classifyInput = preprocessor.process(input, preprocessorOutput, preprocessorParams, model, [], fromStep=detectorSteps["PREPROCESS"], toStep=None, omitSteps=omitDetectorSteps["PREPROCESS"])
    if selector.check("CLASSIFY"):
        detector = getDetector(detector, model)[0]() # initialize detector object
        detector.debug = debug
        detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams, model)
        detector.classify(classifyInput, model, output, goldData=goldInput, fromStep=detectorSteps["CLASSIFY"], omitSteps=omitDetectorSteps["CLASSIFY"], workDir=workDir)
def workdir(path, deleteIfExists=True, copyFrom=None, log="log.txt"):
    """
    Create (or reuse) a working directory and make it the current directory.

    @param path: the work directory; created, cleared or reused depending on the flags
    @param deleteIfExists: remove an existing directory first (forced True when copyFrom is set)
    @param copyFrom: optional template directory copied to 'path' instead of creating an empty one
    @param log: log file opened relative to the work directory, or None for no logging
    @return: path

    The previous working directory is restored at interpreter exit via atexit.
    """
    # When using a template, always remove existing work directory
    if copyFrom != None:
        deleteIfExists = True
    # Remove existing work directory, if requested to do so
    if os.path.exists(path) and deleteIfExists:
        print >> sys.stderr, "Output directory exists, removing", path
        shutil.rmtree(path)
    # Create work directory if needed
    if not os.path.exists(path):
        if copyFrom == None:
            print >> sys.stderr, "Making output directory", path
            os.makedirs(path)
        else:
            # BUGFIX: used the module-level 'options.copyFrom' instead of the
            # 'copyFrom' parameter, breaking the function for any other caller
            print >> sys.stderr, "Copying template from", copyFrom, "to", path
            shutil.copytree(copyFrom, path)
    else:
        print >> sys.stderr, "Using existing output directory", path
    # Remember current directory and switch to workdir
    atexit.register(os.chdir, os.getcwd())
    os.chdir(path)
    # Open log (if a relative path, it goes under workdir)
    if log != None:
        Stream.openLog(log)
    else:
        print >> sys.stderr, "No logging"
    return path
def convert(corpora, outDir=None, downloadDir=None, redownload=False, makeIntermediateFiles=True, evaluate=False, processEquiv=True, addAnalyses=True):
    """
    Convert a list of BioNLP Shared Task corpora, writing one conversion log per corpus.

    @param corpora: iterable of corpus identifiers
    @param outDir: output directory; defaults to Settings.DATAPATH + "/corpora"
    @param downloadDir: directory for downloaded packages (passed to downloadCorpus)
    @param redownload: force re-download of the corpus packages
    @param makeIntermediateFiles, evaluate, processEquiv, addAnalyses: forwarded to convertDownloaded
    """
    global bioNLP13AnalysesTempDir
    if outDir == None:
        # BUGFIX: the default path was computed but never assigned, leaving
        # outDir as None and crashing the os.path.exists() call below
        outDir = os.path.normpath(Settings.DATAPATH + "/corpora")
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    else:
        assert os.path.isdir(outDir)
    count = 1
    for corpus in corpora:
        print >> sys.stderr, "=======================", "Converting BioNLP Shared Task", corpus, "corpus ("+str(count)+"/"+str(len(corpora))+")", "======================="
        logFileName = outDir + "/conversion/" + corpus + "-conversion-log.txt"
        Stream.openLog(logFileName)
        downloaded = downloadCorpus(corpus, outDir, downloadDir, redownload)
        # The BB'13 subtasks live in subdirectories of the downloaded package
        packageSubPath = None
        if corpus == "BB13T2":
            packageSubPath = "task_2"
        elif corpus == "BB13T3":
            packageSubPath = "task_3"
        convertDownloaded(outDir, corpus, downloaded, makeIntermediateFiles, evaluate, processEquiv=processEquiv, addAnalyses=addAnalyses, packageSubPath=packageSubPath)
        Stream.closeLog(logFileName)
        count += 1
    # Clean up the module-level temporary directory possibly created while inserting analyses
    if bioNLP13AnalysesTempDir != None:
        shutil.rmtree(bioNLP13AnalysesTempDir)
        bioNLP13AnalysesTempDir = None
def convertCorpus(corpus, outDir=None, downloadDir=None, redownload=False, removeAnalyses=True, develFraction=0.3, logPath=None):
    """
    Convert a PPI corpus (one of PPI_CORPORA) into Interaction XML with
    train/devel/test sets, and optionally write the divided sets to outDir.

    @param corpus: corpus identifier, must be in PPI_CORPORA
    @param outDir: output directory for the divided set files; None skips writing
    @param downloadDir: directory for downloaded packages (passed to downloadCorpus)
    @param redownload: force re-download of the corpus package
    @param removeAnalyses: forwarded to updateXML (presumably strips existing parses — confirm there)
    @param develFraction: fraction of the training set split off as a devel set; <= 0 disables the split
    @param logPath: "AUTO" logs under outDir/conversion, None disables logging
    @return: the converted corpus as an ElementTree object
    """
    assert corpus in PPI_CORPORA
    if logPath == "AUTO":
        logPath = outDir + "/conversion/" + corpus + "-conversion-log.txt" if outDir != None else None
    if logPath:
        Stream.openLog(logPath)
    print >> sys.stderr, "==========", "Converting PPI corpus", corpus, "=========="
    downloaded = downloadCorpus(corpus, outDir, downloadDir, redownload)
    print >> sys.stderr, "---------------", "Updating Interaction XML format", "---------------"
    print >> sys.stderr, "Loading", downloaded[corpus + "_LEARNING_FORMAT"]
    xml = ETUtils.ETFromObj(downloaded[corpus + "_LEARNING_FORMAT"])
    root = xml.getroot()
    updateXML(root, removeAnalyses)
    print >> sys.stderr, "---------------", "Adding sets from the PPI evaluation standard", "---------------"
    addSets(corpus, root, downloaded["PPI_EVALUATION_STANDARD"])
    # Carve a devel set out of the training documents
    if develFraction > 0.0:
        print >> sys.stderr, "---------------", "Generating devel set", "---------------"
        MakeSets.processCorpus(xml, None, "train", [("devel", develFraction), ("train", 1.0)], 1)
    if outDir != None:
        print >> sys.stderr, "---------------", "Writing corpus", "---------------"
        #if intermediateFiles:
        #print >> sys.stderr, "Writing combined corpus"
        #ETUtils.write(xml, os.path.join(outDir, corpus + ".xml"))
        print >> sys.stderr, "Dividing into sets"
        Utils.InteractionXML.DivideSets.processCorpus(xml, outDir, corpus, ".xml")
    if logPath != None:
        Stream.closeLog(logPath)
    return xml
def convertDDI13(outDir, downloadDir=None, datasets=["DDI13_TRAIN", "DDI13_TEST_TASK_9.1", "DDI13_TEST_TASK_9.2"], redownload=False, insertParses=True, parse=False, makeIntermediateFiles=True, debug=False):
    """
    Convert the DDI'13 corpus datasets into Interaction XML files under outDir.

    @param outDir: output directory; the process chdirs here for the duration of the conversion
    @param downloadDir: directory for the downloaded corpus packages
    @param datasets: dataset keys to convert (NOTE: mutable default argument; safe only if callers never mutate it)
    @param redownload: force re-download of the corpus packages
    @param insertParses: insert pre-generated TEES parses (mutually exclusive with 'parse')
    @param parse: run the parsers instead of inserting pre-generated parses
    @param debug: keep the temporary directory for inspection
    """
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI13-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'13 corpus", "======================="
    tempdir = tempfile.mkdtemp()
    downloaded = downloadFiles(downloadDir, tempdir, redownload)
    for dataset in datasets:
        # Each dataset becomes its own corpus XML file
        corpusTree = getCorpusXML()
        xml = corpusTree.getroot()
        print >> sys.stderr, "Merging input XMLs"
        assert downloaded[dataset] != None
        combineXML(xml, "train", downloaded[dataset], subDirs=["DrugBank", "MedLine", "NER"])
        print >> sys.stderr, "Processing elements"
        processElements(xml)
        if dataset == "DDI13_TRAIN":
            print >> sys.stderr, "Dividing training set into folds"
            divideSets(xml, "train", 10)
        else:
            # Test datasets: mark every document as belonging to the test set
            for doc in xml.getiterator("document"):
                doc.set("set", "test")
        if parse:
            print >> sys.stderr, "Parsing"
            parseXML(corpusTree, os.path.join(tempdir, "parsing"), debug)
        elif insertParses:
            assert parse == False
            print >> sys.stderr, "Inserting McCC parses"
            Tools.BLLIPParser.insertParses(corpusTree, downloaded[dataset + "_TEES_PARSES"], None, extraAttributes={"source":"TEES"})
            print >> sys.stderr, "Inserting Stanford conversions"
            Tools.StanfordParser.insertParses(corpusTree, downloaded[dataset + "_TEES_PARSES"], None, extraAttributes={"stanfordSource":"TEES"})
        # Check what was produced by the conversion
        print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
        analyzer = StructureAnalyzer()
        analyzer.analyze([xml])
        print >> sys.stderr, analyzer.toString()
        # Pick the output file name from the dataset key
        if "9.1" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.1.xml")
        elif "9.2" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.2.xml")
        else:
            outFileName = os.path.join(outDir, "DDI13-train.xml")
        print >> sys.stderr, "Writing output to", outFileName
        ETUtils.write(xml, outFileName)
    Stream.closeLog(logFileName)
    if not debug and tempdir != None:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
def beginLog(outDir, logPath="AUTO"):
    """
    Resolve a log path and start logging to it.

    @param outDir: directory used for the default "AUTO" log location
    @param logPath: "AUTO" logs to outDir/log.txt, the string "None" disables logging,
        any other value is used as the log file path directly
    @return: the resolved log path, or None if logging was disabled
    """
    if logPath == "AUTO":
        logPath = os.path.join(outDir, "log.txt")
    elif logPath == "None":
        logPath = None
    if logPath != None:
        # BUGFIX: guard against an empty dirname (a bare relative file name
        # like "log.txt"), for which os.makedirs would raise an error
        logDir = os.path.dirname(logPath)
        if logDir != "" and not os.path.exists(logDir):
            os.makedirs(logDir)
        Stream.openLog(logPath)
    return logPath
def process(self, source, output=None, model=None, fromStep=None, toStep=None, omitSteps=None, logPath=None):
    """
    Run the preprocessing tool chain on 'source'.

    @param source: the input corpus, forwarded to ToolChain.process
    @param output: output path; with logPath="AUTO" it also determines the log file name
    @param model: optional model, forwarded to ToolChain.process
    @param fromStep: first step to run, forwarded to ToolChain.process
    @param toStep: last step to run, forwarded to ToolChain.process
    @param omitSteps: steps to skip, forwarded to ToolChain.process
    @param logPath: "AUTO" derives <output>-log.txt, "None"/None disables logging
    @return: the processed XML returned by ToolChain.process
    """
    # Resolve the log file location
    if logPath == "AUTO":
        if output != None:
            logPath = output
            # For patterned outputs such as "corpus-*.xml", log next to the stem
            if "*" in logPath:
                logPath = logPath.split("*")[0].rstrip("-")
            logPath = os.path.join(logPath.rstrip("/").rstrip("\\") + "-log.txt")
        else:
            logPath = None
    elif logPath == "None":
        logPath = None
    if logPath != None:
        # BUGFIX: guard against an empty dirname (a bare relative file name),
        # for which os.makedirs would raise an error
        logDir = os.path.dirname(logPath)
        if logDir != "" and not os.path.exists(logDir):
            os.makedirs(logDir)
        Stream.openLog(logPath)
    print >> sys.stderr, "Preprocessor steps:", [x.name for x in self.steps]
    if len(self.steps) == 0:
        raise Exception("No preprocessing steps defined")
    # Run the tool chain
    xml = ToolChain.process(self, source, output, model, fromStep, toStep, omitSteps)
    if logPath != None:
        Stream.closeLog(logPath)
    return xml
def convert(corpora, outDir, downloadDir=None, redownload=False, makeIntermediateFiles=True, evaluate=False):
    """
    Run the BioNLP'11 conversion for every corpus in 'corpora', writing a
    separate conversion log for each one under outDir/conversion.
    """
    # Make sure the output location exists and really is a directory
    if os.path.exists(outDir):
        assert os.path.isdir(outDir)
    else:
        os.makedirs(outDir)
    numCorpora = len(corpora)
    for position, corpus in enumerate(corpora, 1):
        print >> sys.stderr, "=======================", "Converting BioNLP'11", corpus, "corpus ("+str(position)+"/"+str(numCorpora)+")", "======================="
        logFileName = outDir + "/conversion/" + corpus + "-conversion-log.txt"
        Stream.openLog(logFileName)
        packages = downloadCorpus(corpus, downloadDir, None, redownload)
        convertDownloaded(outDir, corpus, packages, makeIntermediateFiles, evaluate)
        Stream.closeLog(logFileName)
def convert(inPath, outDir, corpusId, directed, negatives, preprocess, preprocessorParameters=None, debug=False, clear=False, constParser="BLLIP-BIO", depParser="STANFORD-CONVERT", logging=True): assert negatives in ("INCLUDE", "SKIP", "REVERSE_POS") # Download the corpus if needed if inPath == None: if not hasattr(Settings, "SE10T8_CORPUS"): SemEval2010Task8Tools.install() inPath = Settings.SE10T8_CORPUS assert os.path.exists(inPath) # Prepare the output directory if not os.path.exists(outDir): print "Making output directory", outDir os.makedirs(outDir) elif clear: print "Removing output directory", outDir shutil.rmtree(outDir) # Start logging if logging: Stream.openLog(os.path.join(outDir, "log.txt"), clear=clear) # Read and process the corpus files archive = zipfile.ZipFile(inPath, 'r') usedIds = set() tree = None for fileName, setName in [("SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT", "train"),\ ("SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT", "test")]: print "Processing file", fileName, "as set", setName f = archive.open(fileName) tree = processLines(f.readlines(), setName, directed=directed, negatives=negatives, usedIds=usedIds, tree=tree, corpusId=corpusId) f.close() # Divide the training set into training and development sets MakeSets.processCorpus(tree, None, "train", [("train", 0.7), ("devel", 1.0)], 1) # Write out the converted corpus convertedPath = os.path.join(outDir, corpusId + "-converted.xml") ETUtils.write(tree.getroot(), convertedPath) # Preprocess the converted corpus if preprocess: outPath = os.path.join(outDir, corpusId + ".xml") preprocessor = Preprocessor(constParser, depParser) preprocessor.setArgForAllSteps("debug", debug) preprocessor.stepArgs("CONVERT")["corpusName"] = corpusId preprocessor.process(convertedPath, outPath, preprocessorParameters, omitSteps=["SPLIT-SENTENCES", "NER", "SPLIT-NAMES"]) # Stop logging if logging: Stream.closeLog(os.path.join(outDir, "log.txt"))
def convert(corpora, outDir, downloadDir=None, redownload=False, makeIntermediateFiles=True, evaluate=False):
    """
    Convert each BioNLP'11 corpus in 'corpora', with a separate conversion
    log written under outDir/conversion for every corpus.

    @param corpora: iterable of corpus identifiers
    @param outDir: output directory, created if it does not exist
    @param downloadDir: directory for downloaded packages (passed to downloadCorpus)
    @param redownload: force re-download of the corpus packages
    @param makeIntermediateFiles, evaluate: forwarded to convertDownloaded
    """
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    else:
        assert os.path.isdir(outDir)
    count = 1
    for corpus in corpora:
        print >> sys.stderr, "=======================", "Converting BioNLP'11", corpus, "corpus (" + str(count) + "/" + str(len(corpora)) + ")", "======================="
        logFileName = outDir + "/conversion/" + corpus + "-conversion-log.txt"
        Stream.openLog(logFileName)
        downloaded = downloadCorpus(corpus, downloadDir, None, redownload)
        convertDownloaded(outDir, corpus, downloaded, makeIntermediateFiles, evaluate)
        Stream.closeLog(logFileName)
        count += 1
def process(self, source, output=None, model=None, fromStep=None, toStep=None, omitSteps=None, logPath=None):
    """
    Run the preprocessing tool chain on 'source'.

    @param source: the input corpus, forwarded to ToolChain.process
    @param output: output path; with logPath="AUTO" it also determines the log file name
    @param model: optional model, forwarded to ToolChain.process
    @param fromStep: first step to run, forwarded to ToolChain.process
    @param toStep: last step to run, forwarded to ToolChain.process
    @param omitSteps: steps to skip, forwarded to ToolChain.process
    @param logPath: "AUTO" derives <output>-log.txt, "None"/None disables logging
    @return: the processed XML returned by ToolChain.process
    """
    # Resolve the log file location from the output path when requested
    if logPath == "AUTO":
        if output != None:
            logPath = output
            # For patterned outputs such as "corpus-*.xml", log next to the stem
            if "*" in logPath:
                logPath = logPath.split("*")[0].rstrip("-")
            logPath = os.path.join(logPath.rstrip("/").rstrip("\\") + "-log.txt")
        else:
            logPath = None
    elif logPath == "None":
        logPath = None
    if logPath != None:
        # BUGFIX: os.makedirs fails on the empty string, which is what
        # os.path.dirname returns for a bare relative file name
        logDir = os.path.dirname(logPath)
        if logDir != "" and not os.path.exists(logDir):
            os.makedirs(logDir)
        Stream.openLog(logPath)
    print >> sys.stderr, "Preprocessor steps:", [x.name for x in self.steps]
    if len(self.steps) == 0:
        raise Exception("No preprocessing steps defined")
    # Run the tool chain
    xml = ToolChain.process(self, source, output, model, fromStep, toStep, omitSteps)
    if logPath != None:
        Stream.closeLog(logPath)
    return xml
# NOTE(review): this excerpt begins inside a try/except block; the "try:"
# header lies outside the visible chunk. Psyco is an optional JIT for old
# CPython versions and is used only if the import succeeds.
    import psyco
    psyco.full()
    print >> sys.stderr, "Found Psyco, using"
except ImportError:
    print >> sys.stderr, "Psyco not installed"

from optparse import OptionParser
# Command line options for the preprocessing tool chain
optparser = OptionParser(description="A tool chain for making interaction XML, sentence splitting, NER and parsing")
optparser.add_option("-i", "--input", default=None, dest="input", help="")
optparser.add_option("-n", "--inputNames", default=None, dest="inputNames", help="")
optparser.add_option("-c", "--corpus", default=None, dest="corpus", help="corpus name")
optparser.add_option("-o", "--output", default=None, dest="output", help="output directory")
optparser.add_option("-p", "--parameters", default=None, dest="parameters", help="preprocessing parameters")
optparser.add_option("-s", "--step", default=None, dest="step", help="")
optparser.add_option("-t", "--toStep", default=None, dest="toStep", help="")
optparser.add_option("--omitSteps", default=None, dest="omitSteps", help="")
optparser.add_option("--noLog", default=False, action="store_true", dest="noLog", help="")
optparser.add_option("--debug", default=False, action="store_true", dest="debug", help="")
optparser.add_option("--requireEntities", default=False, action="store_true", dest="requireEntities", help="")
(options, args) = optparser.parse_args()

# Omitted steps are given as a comma-separated list
if options.omitSteps != None:
    options.omitSteps = options.omitSteps.split(",")

if not options.noLog:
    Stream.openLog(os.path.join(options.output + "-log.txt"))
    #log(False, True, os.path.join(options.output, options.corpus + "-log.txt"))
# Configure and run the preprocessor with the command line settings
preprocessor = Preprocessor()
preprocessor.setArgForAllSteps("debug", options.debug)
preprocessor.stepArgs("CONVERT")["corpusName"] = options.corpus
preprocessor.stepArgs("PARSE")["requireEntities"] = options.requireEntities
preprocessor.process(options.input, options.output, options.parameters, None, options.inputNames, fromStep=options.step, toStep=options.toStep, omitSteps=options.omitSteps)
def combine(inputA, inputB, inputGold, outPath=None, mode="OR", skip=None, logPath="AUTO"): assert options.mode in ("AND", "OR") if skip != None and isinstance(skip, basestring): skip = set(skip.split(",")) if skip != None: print "Skipping interaction types:", skip if logPath == "AUTO": if outPath != None: logPath = os.path.join( outPath.rstrip("/").rstrip("\\") + "-log.txt") else: logPath = None if logPath != None: if not os.path.exists(os.path.dirname(logPath)): os.makedirs(os.path.dirname(logPath)) Stream.openLog(logPath) print "Loading the Interaction XML files" print "Loading A from", inputA a = ETUtils.ETFromObj(inputA) print "Loading B from", inputB b = ETUtils.ETFromObj(inputB) gold = None if inputGold: print "Loading gold from", inputGold gold = ETUtils.ETFromObj(inputGold) if inputGold else None print "Copying a as template" template = copy.deepcopy(a) print "Calculating confidence score ranges" scoreRanges = {} scoreRanges["a"] = getScoreRange(a, skip) scoreRanges["b"] = getScoreRange(b, skip) print scoreRanges print "Combining" counts = defaultdict(int) counts["skipped"] = defaultdict(int) counter = ProgressCounter(len([x for x in a.findall("document")]), "Combine") for docA, docB, docGold, docTemplate in itertools.izip_longest( *[x.findall("document") for x in (a, b, gold, template)]): counter.update() assert len( set([x.get("id") for x in (docA, docB, docGold, docTemplate)])) == 1 for sentA, sentB, sentGold, sentTemplate in itertools.izip_longest(*[ x.findall("sentence") for x in (docA, docB, docGold, docTemplate) ]): assert len( set([ x.get("id") for x in (sentA, sentB, sentGold, sentTemplate) ])) == 1 interactions = getInteractions(sentA, sentB, sentGold, skip, counts["skipped"]) for interaction in sentTemplate.findall("interaction"): sentTemplate.remove(interaction) analyses = sentTemplate.find("analyses") if analyses: sentTemplate.remove(analyses) for key in interactions: interaction = getCombinedInteraction(interactions[key], mode, counts, 
scoreRanges) if interaction != None: sentTemplate.append(copy.deepcopy(interaction)) if analyses: sentTemplate.append(analyses) counts["skipped"] = dict(counts["skipped"]) print "Counts:", dict(counts) if gold != None: print "****** Evaluating A ******" evaluateChemProt( a, gold ) #EvaluateIXML.run(AveragingMultiClassEvaluator, a, gold, "McCC") print "****** Evaluating B ******" evaluateChemProt( b, gold ) #EvaluateIXML.run(AveragingMultiClassEvaluator, b, gold, "McCC") print "****** Evaluating Combined ******" evaluateChemProt( template, gold ) #EvaluateIXML.run(AveragingMultiClassEvaluator, template, gold, "McCC") if outPath != None: print "Writing output to", outPath if outPath.endswith(".tsv"): Preprocessor(steps=["EXPORT_CHEMPROT"]).process(template, outPath) else: ETUtils.write(template, outPath) if logPath != None: Stream.closeLog(logPath)
def convertDDI(outDir, downloadDir=None, redownload=False, makeIntermediateFiles=True, debug=False):
    """
    Convert the DDI'11 corpus into divided Interaction XML set files under outDir.

    @param outDir: output directory; the process chdirs here for the duration of the conversion
    @param downloadDir: directory for the downloaded corpus packages
    @param redownload: force re-download of the TEES parse package
    @param makeIntermediateFiles: write intermediate "-documents*.xml" files
    @param debug: keep the temporary directory for inspection
    """
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI11-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="
    corpusDir = outDir + "/DDI11-original"
    Utils.Download.downloadAndExtract(Settings.URL["DDI11_CORPUS"], corpusDir, downloadDir)
    bigfileName = os.path.join(outDir, "DDI11")
    #oldXML = ETUtils.ETFromObj(bigfileName+".xml")
    trainUnified = corpusDir + "/train"
    trainMTMX = corpusDir + "/train_MTMX"
    testUnified = corpusDir + "/test"
    testMTMX = corpusDir + "/test_MTMX"
    # Load main documents
    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    documents, docById, docCounts = loadDocs(trainUnified)
    # Divide training data into a train and devel set:
    # documents sorted by their counts are taken in groups of four,
    # three going to train and one to devel
    sortedDocCounts = sorted(docCounts.iteritems(), key=lambda (k, v): (v, k), reverse=True)
    datasetCounts = {"train": [0, 0], "devel": [0, 0], "test": [0, 0]}
    for i in range(0, len(sortedDocCounts) - 3, 4):
        for j in [0, 1]:
            docById[sortedDocCounts[i + j][0]].set("set", "train")
            datasetCounts["train"][0] += sortedDocCounts[i + j][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i + j][1][1]
        docById[sortedDocCounts[i + 2][0]].set("set", "train") #docById[sortedDocCounts[i+2][0]].set("set", "devel")
        docById[sortedDocCounts[i + 3][0]].set("set", "devel") #docById[sortedDocCounts[i+3][0]].set("set", "test")
        datasetCounts["train"][0] += sortedDocCounts[i + 2][1][0] #datasetCounts["devel"][0] += sortedDocCounts[i+2][1][0]
        datasetCounts["train"][1] += sortedDocCounts[i + 2][1][1] #datasetCounts["devel"][1] += sortedDocCounts[i+2][1][1]
        datasetCounts["devel"][0] += sortedDocCounts[i + 3][1][0] #datasetCounts["test"][0] += sortedDocCounts[i+3][1][0]
        datasetCounts["devel"][1] += sortedDocCounts[i + 3][1][1] #datasetCounts["test"][1] += sortedDocCounts[i+3][1][1]
    for document in documents: # leftover documents from the division default to train
        if document.get("set") == None:
            document.set("set", "train")
    # Print division results
    print >> sys.stderr, datasetCounts
    for key in datasetCounts.keys():
        if datasetCounts[key][1] != 0:
            print key, datasetCounts[key][0] / float(datasetCounts[key][1])
        else:
            print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])
    # Some of the train and test ids overlap. Let's change the train set ids, because test set ones are needed
    # for the final evaluation.
    changeIdCount = 1000
    for trainId in [
            'DrugDDI.d312', 'DrugDDI.d316', 'DrugDDI.d332', 'DrugDDI.d334',
            'DrugDDI.d337', 'DrugDDI.d342', 'DrugDDI.d349', 'DrugDDI.d354',
            'DrugDDI.d373', 'DrugDDI.d379', 'DrugDDI.d383', 'DrugDDI.d388',
            'DrugDDI.d392', 'DrugDDI.d396', 'DrugDDI.d398', 'DrugDDI.d409',
            'DrugDDI.d411', 'DrugDDI.d415', 'DrugDDI.d425', 'DrugDDI.d430',
            'DrugDDI.d433', 'DrugDDI.d448', 'DrugDDI.d450', 'DrugDDI.d452',
            'DrugDDI.d462', 'DrugDDI.d467', 'DrugDDI.d470', 'DrugDDI.d474',
            'DrugDDI.d480', 'DrugDDI.d482', 'DrugDDI.d485', 'DrugDDI.d492',
            'DrugDDI.d494', 'DrugDDI.d496', 'DrugDDI.d498', 'DrugDDI.d500',
            'DrugDDI.d503', 'DrugDDI.d506', 'DrugDDI.d518', 'DrugDDI.d523',
            'DrugDDI.d528', 'DrugDDI.d535', 'DrugDDI.d539', 'DrugDDI.d552',
            'DrugDDI.d554', 'DrugDDI.d558', 'DrugDDI.d561', 'DrugDDI.d570',
            'DrugDDI.d578']:
        newId = "DrugDDI.d" + str(changeIdCount)
        print >> sys.stderr, "Changing train/devel id", trainId, "to", newId
        # Rewrite every attribute value that embeds the old document id
        for element in docById[trainId].getiterator():
            for attrName, attrValue in element.attrib.iteritems():
                if trainId in attrValue:
                    element.set(attrName, attrValue.replace(trainId, newId))
        docById[newId] = docById[trainId]
        del docById[trainId]
        changeIdCount += 1
    # If test set exists, load it, too
    if testUnified != None:
        testDocuments, testDocById, testDocCounts = loadDocs(testUnified)
        for document in testDocuments:
            document.set("set", "test")
        documents = documents + testDocuments
        overlappingIds = []
        for key in docById:
            if key in testDocById:
                overlappingIds.append(key)
        # After the id renaming above there must be no remaining overlap
        for key in docById:
            assert key not in testDocById, (key, docById[key].get("origId"), testDocById[key].get("origId"), sorted(docById.keys()), sorted(testDocById.keys()), sorted(overlappingIds))
        docById.update(testDocById)
    # Add all documents into one XML
    xmlTree = ET.ElementTree(ET.Element("corpus"))
    root = xmlTree.getroot()
    root.set("source", "DDI11")
    for document in documents:
        root.append(document)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
    xml = xmlTree
    print >> sys.stderr, "Fixing DDI XML"
    fixEntities(xml)
    convertToInteractions(xml)
    # Add MTMX
    if trainMTMX != None:
        inDir = Utils.Download.getTopDir(tempdir, Utils.Download.downloadAndExtract(trainMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if testMTMX != None:
        inDir = Utils.Download.getTopDir(tempdir, Utils.Download.downloadAndExtract(testMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents.xml")
    print >> sys.stderr, "---------------", "Inserting TEES-generated analyses", "---------------"
    Utils.Download.downloadAndExtract(Settings.URL["TEES_PARSES"], os.path.join(Settings.DATAPATH, "TEES-parses"), downloadDir, redownload=redownload)
    extractedFilename = os.path.join(Settings.DATAPATH, "TEES-parses") + "/DDI11"
    print >> sys.stderr, "Making sentences"
    Tools.SentenceSplitter.makeSentences(xml, extractedFilename, None)
    print >> sys.stderr, "Inserting McCC parses"
    Tools.BLLIPParser.insertParses(xml, extractedFilename, None, extraAttributes={"source": "TEES-preparsed"})
    print >> sys.stderr, "Inserting Stanford conversions"
    Tools.StanfordParser.insertParses(xml, extractedFilename, None, extraAttributes={"stanfordSource": "TEES-preparsed"})
    print >> sys.stderr, "Protein Name Splitting"
    splitTarget = "McCC"
    #ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    ProteinNameSplitter.mainFunc(xml, None, splitTarget, removeOld=True)
    print >> sys.stderr, "Head Detection"
    #xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True)
    xml = FindHeads.findHeads(xml, splitTarget, tokenization=None, output=None, removeExisting=True)
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outDir, "DDI11", ".xml")
    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
# Pipeline defaults for trigger/edge detection parameter grids
options.triggerExampleBuilder = "PhraseTriggerExampleBuilder"
options.edgeParams = "10,100,1000,5000,7500,10000,20000,25000,28000,50000,60000,65000,100000,500000,1000000"
options.recallAdjustParams = "0.8,0.9,0.95,1.0"

# These commands will be in the beginning of most pipelines
WORKDIR = options.output
if options.copyFrom != None:
    # Start from a copy of a template work directory
    if os.path.exists(WORKDIR):
        shutil.rmtree(WORKDIR)
    print >> sys.stderr, "Copying template from", options.copyFrom
    shutil.copytree(options.copyFrom, WORKDIR)
    workdir(WORKDIR, False)
else:
    workdir(WORKDIR, options.clearAll) # Select a working directory, optionally remove existing files
if not options.noLog:
    Stream.openLog("log.txt") #log() # Start logging into a file in working directory

## Make downsampling for learning curve
#downSampleTag = "-r" + str(options.downSampleTrain) + "_s" + str(options.downSampleSeed)
#newTrainFile = makeSubset(TRAIN_FILE, options.task + "-train-nodup" + options.extraTag + downSampleTag + ".xml", options.downSampleTrain, options.downSampleSeed)
#makeSubset(TRAIN_FILE.replace("-nodup", ""), options.task + "-train" + options.extraTag + downSampleTag + ".xml", options.downSampleTrain, options.downSampleSeed)
#TRAIN_FILE = newTrainFile

# Report the selected task (with optional subtask suffix)
if subTask != None:
    print >> sys.stderr, "Task:", options.task + "." + str(subTask)
else:
    print >> sys.stderr, "Task:", options.task

eventDetector = EventDetector()
eventDetector.debug = options.debug
def classify(input, model, output, workDir=None, step=None, omitSteps=None,
             goldInput=None, detector=None, debug=False, clear=False,
             preprocessorTag="-preprocessed.xml.gz", preprocessorParams=None, bioNLPSTParams=None):
    """
    Detect events or relations from text.

    @param input: The input file in either interaction XML or BioNLP ST format. Can also be a PMID or TEES default corpus name.
    @param model: A path to a model file or the name of a TEES default model.
    @param output: The output file stem. Output files will be of the form output-*
    @param workDir: If intermediate files need to be saved, they will go here.
    @param step: A step=substep pair, where the steps are PREPROCESS and CLASSIFY
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param goldInput: a version of the corpus file with gold annotation. Enables measuring of performance
    @param detector: a Detector object, or a string defining one to be imported. If None, will be read from model.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param clear: Remove existing workDir
    @param preprocessorTag: preprocessor output file will be output + preprocessorTag
    @param preprocessorParams: Optional parameters controlling preprocessing. If None, will be read from model.
    @param bioNLPSTParams: Optional parameters controlling BioNLP ST format output. If None, will be read from model.
    """
    # Normalize all user-supplied paths before changing the working directory below.
    input = os.path.abspath(input)
    if goldInput != None: goldInput = os.path.abspath(goldInput)
    if model != None: model = os.path.abspath(model)
    # Initialize working directory
    if workDir != None: # use a permanent work directory
        workdir(workDir, clear)
    Stream.openLog(output + "-log.txt") # log in the output directory
    # Get input files
    # NOTE(review): getInput appears to also decide whether preprocessing is needed
    # (second return value) — confirm against its definition.
    input, preprocess = getInput(input)
    model = getModel(model)
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["PREPROCESS", "CLASSIFY"])
    if not preprocess:
        selector.markOmitSteps("PREPROCESS")

    classifyInput = input
    if selector.check("PREPROCESS"):
        if preprocessorParams == None:
            # Default full preprocessing pipeline when the model does not provide one
            preprocessorParams = ["LOAD", "GENIA_SPLITTER", "BANNER", "BLLIP_BIO", "STANFORD_CONVERT", "SPLIT_NAMES", "FIND_HEADS", "SAVE"]
        preprocessor = Preprocessor(preprocessorParams)
        if debug:
            preprocessor.setArgForAllSteps("debug", True)
        preprocessorOutput = output + preprocessorTag
        #preprocessor.debug = debug
        #preprocessor.source = input # This has to be defined already here, needs to be fixed later
        #preprocessor.requireEntitiesForParsing = True # parse only sentences which contain named entities
        # Reuse an existing preprocessed file unless 'clear' was requested.
        if os.path.exists(preprocessorOutput) and not clear: #os.path.exists(preprocessor.getOutputPath("FIND-HEADS")):
            #print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "exists, skipping preprocessing."
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "exists, skipping preprocessing."
            classifyInput = preprocessorOutput # preprocessor.getOutputPath("FIND-HEADS")
        else:
            #print >> sys.stderr, "Preprocessor output", preprocessor.getOutputPath("FIND-HEADS"), "does not exist"
            print >> sys.stderr, "Preprocessor output", preprocessorOutput, "does not exist"
            print >> sys.stderr, "------------ Preprocessing ------------"
            # Remove some of the unnecessary intermediate files
            #preprocessor.setIntermediateFiles({"Convert":None, "SPLIT-SENTENCES":None, "PARSE":None, "CONVERT-PARSE":None, "SPLIT-NAMES":None})
            # Process input into interaction XML
            classifyInput = preprocessor.process(input, preprocessorOutput, model)

    if selector.check("CLASSIFY"):
        detector = getDetector(detector, model)[0]() # initialize detector object
        detector.debug = debug
        detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams, model)
        detector.classify(classifyInput, model, output, goldData=goldInput,
                          fromStep=detectorSteps["CLASSIFY"],
                          omitSteps=omitDetectorSteps["CLASSIFY"],
                          workDir=workDir)
"-c", "--corpora", default="GE", dest="corpora", help="corpus names in a comma-separated list, e.g. \"GE,EPI,ID\"") optparser.add_option("-o", "--outdir", default=os.path.normpath(Settings.DATAPATH + "/corpora"), dest="outdir", help="directory for output files") optparser.add_option("-d", "--downloaddir", default=None, dest="downloaddir", help="directory to download corpus files to") optparser.add_option("--intermediateFiles", default=False, action="store_true", dest="intermediateFiles", help="save intermediate corpus files") optparser.add_option("--forceDownload", default=False, action="store_true", dest="forceDownload", help="re-download all source files") (options, args) = optparser.parse_args() Stream.openLog(os.path.join(options.outdir, "conversion-log.txt")) convert(options.corpora.split(","), options.outdir, options.downloaddir, options.forceDownload, options.intermediateFiles)
def combine(inputA, inputB, inputGold, outPath=None, mode="OR", skip=None, logPath="AUTO"): assert options.mode in ("AND", "OR") if skip != None and isinstance(skip, basestring): skip = set(skip.split(",")) if skip != None: print "Skipping interaction types:", skip if logPath == "AUTO": if outPath != None: logPath = os.path.join(outPath.rstrip("/").rstrip("\\") + "-log.txt") else: logPath = None if logPath != None: if not os.path.exists(os.path.dirname(logPath)): os.makedirs(os.path.dirname(logPath)) Stream.openLog(logPath) print "Loading the Interaction XML files" print "Loading A from", inputA a = ETUtils.ETFromObj(inputA) print "Loading B from", inputB b = ETUtils.ETFromObj(inputB) gold = None if inputGold: print "Loading gold from", inputGold gold = ETUtils.ETFromObj(inputGold) if inputGold else None print "Copying a as template" template = copy.deepcopy(a) print "Calculating confidence score ranges" scoreRanges = {} scoreRanges["a"] = getScoreRange(a, skip) scoreRanges["b"] = getScoreRange(b, skip) print scoreRanges print "Combining" counts = defaultdict(int) counts["skipped"] = defaultdict(int) counter = ProgressCounter(len([x for x in a.findall("document")]), "Combine") for docA, docB, docGold, docTemplate in itertools.izip_longest(*[x.findall("document") for x in (a, b, gold, template)]): counter.update() assert len(set([x.get("id") for x in (docA, docB, docGold, docTemplate)])) == 1 for sentA, sentB, sentGold, sentTemplate in itertools.izip_longest(*[x.findall("sentence") for x in (docA, docB, docGold, docTemplate)]): assert len(set([x.get("id") for x in (sentA, sentB, sentGold, sentTemplate)])) == 1 interactions = getInteractions(sentA, sentB, sentGold, skip, counts["skipped"]) for interaction in sentTemplate.findall("interaction"): sentTemplate.remove(interaction) analyses = sentTemplate.find("analyses") if analyses: sentTemplate.remove(analyses) for key in interactions: interaction = getCombinedInteraction(interactions[key], mode, counts, scoreRanges) 
if interaction != None: sentTemplate.append(copy.deepcopy(interaction)) if analyses: sentTemplate.append(analyses) counts["skipped"] = dict(counts["skipped"]) print "Counts:", dict(counts) if gold != None: print "****** Evaluating A ******" evaluateChemProt(a, gold) #EvaluateIXML.run(AveragingMultiClassEvaluator, a, gold, "McCC") print "****** Evaluating B ******" evaluateChemProt(b, gold) #EvaluateIXML.run(AveragingMultiClassEvaluator, b, gold, "McCC") print "****** Evaluating Combined ******" evaluateChemProt(template, gold) #EvaluateIXML.run(AveragingMultiClassEvaluator, template, gold, "McCC") if outPath != None: print "Writing output to", outPath if outPath.endswith(".tsv"): Preprocessor(steps=["EXPORT_CHEMPROT"]).process(template, outPath) else: ETUtils.write(template, outPath) if logPath != None: Stream.closeLog(logPath)
def convertDDI13(outDir, downloadDir=None, datasets=None, redownload=False,
                 insertParses=True, parse=False, makeIntermediateFiles=True, debug=False):
    """
    Convert the DDIExtraction 2013 shared task corpus into Interaction XML.

    @param outDir: directory for the converted corpus files (created if missing)
    @param downloadDir: directory to download source packages to (None for default)
    @param datasets: dataset package names to convert; defaults to the train set and both test tasks
    @param redownload: force re-download of the source packages
    @param insertParses: insert pre-built TEES parses (ignored if 'parse' is True)
    @param parse: parse the corpus from scratch instead of inserting existing parses
    @param makeIntermediateFiles: kept for interface compatibility (not used in this body)
    @param debug: keep the temporary directory for inspection
    """
    # BUGFIX: 'datasets' had a mutable list as its default argument; use a
    # None sentinel so the default cannot be shared/mutated across calls.
    if datasets is None:
        datasets = ["DDI13_TRAIN", "DDI13_TEST_TASK_9.1", "DDI13_TEST_TASK_9.2"]
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI13-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'13 corpus", "======================="
    tempdir = tempfile.mkdtemp()
    downloaded = downloadFiles(downloadDir, tempdir, redownload)
    for dataset in datasets:
        corpusTree = getCorpusXML()
        xml = corpusTree.getroot()
        print >> sys.stderr, "Merging input XMLs"
        assert downloaded[dataset] != None
        combineXML(xml, "train", downloaded[dataset], subDirs=["DrugBank", "MedLine", "NER"])
        print >> sys.stderr, "Processing elements"
        processElements(xml)
        if dataset == "DDI13_TRAIN":
            print >> sys.stderr, "Dividing training set into folds"
            divideSets(xml, "train", 10)
        else:
            for doc in xml.getiterator("document"):
                doc.set("set", "test")
        if parse:
            print >> sys.stderr, "Parsing"
            parseXML(corpusTree, os.path.join(tempdir, "parsing"), debug)
        elif insertParses:
            assert parse == False
            print >> sys.stderr, "Inserting McCC parses"
            Tools.BLLIPParser.insertParses(corpusTree, downloaded[dataset + "_TEES_PARSES"], None, extraAttributes={"source": "TEES"})
            print >> sys.stderr, "Inserting Stanford conversions"
            Tools.StanfordParser.insertParses(corpusTree, downloaded[dataset + "_TEES_PARSES"], None, extraAttributes={"stanfordSource": "TEES"})
        # Check what was produced by the conversion
        print >> sys.stderr, "---------------", "Corpus Structure Analysis", "---------------"
        analyzer = StructureAnalyzer()
        analyzer.analyze([xml])
        print >> sys.stderr, analyzer.toString()
        # Output file name depends on which test task (if any) this dataset is
        if "9.1" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.1.xml")
        elif "9.2" in dataset:
            outFileName = os.path.join(outDir, "DDI13-test-task9.2.xml")
        else:
            outFileName = os.path.join(outDir, "DDI13-train.xml")
        print >> sys.stderr, "Writing output to", outFileName
        ETUtils.write(xml, outFileName)
    Stream.closeLog(logFileName)
    if not debug and tempdir != None:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
options.triggerExampleBuilder = "PhraseTriggerExampleBuilder" options.edgeParams = "10,100,1000,5000,7500,10000,20000,25000,28000,50000,60000,65000,100000,500000,1000000" options.recallAdjustParams = "0.8,0.9,0.95,1.0" # These commands will be in the beginning of most pipelines WORKDIR=options.output if options.copyFrom != None: if os.path.exists(WORKDIR): shutil.rmtree(WORKDIR) print >> sys.stderr, "Copying template from", options.copyFrom shutil.copytree(options.copyFrom, WORKDIR) workdir(WORKDIR, False) else: workdir(WORKDIR, options.clearAll) # Select a working directory, optionally remove existing files if not options.noLog: Stream.openLog("log.txt") #log() # Start logging into a file in working directory print >> sys.stderr, "Importing detector", options.detector Detector = eval("from " + options.detector + " import " + options.detector.split(".")[-1]) detector = Detector() detector.debug = options.debug detector.stWriteScores = True # write confidence scores into additional st-format files detector.setConnection(getConnection(options.connection)).debug = options.debug # Pre-calculate all the required SVM models if selector.check("TRAIN"): print >> sys.stderr, "----------------------------------------------------" print >> sys.stderr, "------------------ Train Detector ------------------" print >> sys.stderr, "----------------------------------------------------" if options.singleStage: detector.train(trainFile, develFile, options.develModel, options.testModel,
def convertDDI(outDir, downloadDir=None, redownload=False, makeIntermediateFiles=True, debug=False):
    """
    Convert the DDIExtraction 2011 corpus into Interaction XML.

    Downloads the corpus, splits the training data into train/devel sets,
    renames overlapping train ids, merges in the test set and MTMX data,
    inserts pre-built TEES parses and writes out the per-set XML files.

    @param outDir: directory for the converted corpus files (created if missing)
    @param downloadDir: directory to download source packages to (None for default)
    @param redownload: force re-download of the TEES parse package
    @param makeIntermediateFiles: write intermediate corpus XML files
    @param debug: keep the temporary directory for inspection
    """
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI11-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="
    corpusDir = outDir + "/DDI11-original"
    Utils.Download.downloadAndExtract(Settings.URL["DDI11_CORPUS"], corpusDir, downloadDir)
    bigfileName = os.path.join(outDir, "DDI11")
    #oldXML = ETUtils.ETFromObj(bigfileName+".xml")
    trainUnified = corpusDir + "/train"
    trainMTMX = corpusDir + "/train_MTMX"
    testUnified = corpusDir + "/test"
    testMTMX = corpusDir + "/test_MTMX"
    # Load main documents
    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    documents, docById, docCounts = loadDocs(trainUnified)
    # Divide training data into a train and devel set.
    # Documents are sorted by their (entity, interaction) counts and taken in
    # groups of four: three go to train, the fourth to devel, so the devel set
    # gets a representative spread of document sizes.
    sortedDocCounts = sorted(docCounts.iteritems(), key=lambda (k,v): (v,k), reverse=True)
    datasetCounts = {"train":[0,0], "devel":[0,0], "test":[0,0]}
    for i in range(0, len(sortedDocCounts)-3, 4):
        for j in [0,1]:
            docById[sortedDocCounts[i+j][0]].set("set", "train")
            datasetCounts["train"][0] += sortedDocCounts[i+j][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i+j][1][1]
        docById[sortedDocCounts[i+2][0]].set("set", "train") #docById[sortedDocCounts[i+2][0]].set("set", "devel")
        docById[sortedDocCounts[i+3][0]].set("set", "devel") #docById[sortedDocCounts[i+3][0]].set("set", "test")
        datasetCounts["train"][0] += sortedDocCounts[i+2][1][0] #datasetCounts["devel"][0] += sortedDocCounts[i+2][1][0]
        datasetCounts["train"][1] += sortedDocCounts[i+2][1][1] #datasetCounts["devel"][1] += sortedDocCounts[i+2][1][1]
        datasetCounts["devel"][0] += sortedDocCounts[i+3][1][0] #datasetCounts["test"][0] += sortedDocCounts[i+3][1][0]
        datasetCounts["devel"][1] += sortedDocCounts[i+3][1][1] #datasetCounts["test"][1] += sortedDocCounts[i+3][1][1]
    for document in documents: # leftovers not fitting a group of four go to train (orig. Finnish: "epajaolliset jaa yli")
        if document.get("set") == None:
            document.set("set", "train")
    # Print division results
    print >> sys.stderr, datasetCounts
    for key in datasetCounts.keys():
        if datasetCounts[key][1] != 0:
            print key, datasetCounts[key][0] / float(datasetCounts[key][1])
        else:
            print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])
    # Some of the train and test ids overlap. Let's change the train set ids, because test set ones are needed
    # for the final evaluation.
    changeIdCount = 1000
    for trainId in ['DrugDDI.d312', 'DrugDDI.d316', 'DrugDDI.d332', 'DrugDDI.d334', 'DrugDDI.d337',
                    'DrugDDI.d342', 'DrugDDI.d349', 'DrugDDI.d354', 'DrugDDI.d373', 'DrugDDI.d379',
                    'DrugDDI.d383', 'DrugDDI.d388', 'DrugDDI.d392', 'DrugDDI.d396', 'DrugDDI.d398',
                    'DrugDDI.d409', 'DrugDDI.d411', 'DrugDDI.d415', 'DrugDDI.d425', 'DrugDDI.d430',
                    'DrugDDI.d433', 'DrugDDI.d448', 'DrugDDI.d450', 'DrugDDI.d452', 'DrugDDI.d462',
                    'DrugDDI.d467', 'DrugDDI.d470', 'DrugDDI.d474', 'DrugDDI.d480', 'DrugDDI.d482',
                    'DrugDDI.d485', 'DrugDDI.d492', 'DrugDDI.d494', 'DrugDDI.d496', 'DrugDDI.d498',
                    'DrugDDI.d500', 'DrugDDI.d503', 'DrugDDI.d506', 'DrugDDI.d518', 'DrugDDI.d523',
                    'DrugDDI.d528', 'DrugDDI.d535', 'DrugDDI.d539', 'DrugDDI.d552', 'DrugDDI.d554',
                    'DrugDDI.d558', 'DrugDDI.d561', 'DrugDDI.d570', 'DrugDDI.d578']:
        newId = "DrugDDI.d" + str(changeIdCount)
        print >> sys.stderr, "Changing train/devel id", trainId, "to", newId
        # Rewrite every attribute value that embeds the old document id
        for element in docById[trainId].getiterator():
            for attrName, attrValue in element.attrib.iteritems():
                if trainId in attrValue:
                    element.set(attrName, attrValue.replace(trainId, newId))
        docById[newId] = docById[trainId]
        del docById[trainId]
        changeIdCount += 1
    # If test set exists, load it, too
    if testUnified != None:
        testDocuments, testDocById, testDocCounts = loadDocs(testUnified)
        for document in testDocuments:
            document.set("set", "test")
        documents = documents + testDocuments
        # After the id renaming above no train id may collide with a test id;
        # overlappingIds is collected first only to make the assert message useful.
        overlappingIds = []
        for key in docById:
            if key in testDocById:
                overlappingIds.append(key)
        for key in docById:
            assert key not in testDocById, (key, docById[key].get("origId"), testDocById[key].get("origId"), sorted(docById.keys()), sorted(testDocById.keys()), sorted(overlappingIds))
        docById.update(testDocById)
    # Add all documents into one XML
    xmlTree = ET.ElementTree(ET.Element("corpus"))
    root = xmlTree.getroot()
    root.set("source", "DDI11")
    for document in documents:
        root.append(document)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
    xml = xmlTree
    print >> sys.stderr, "Fixing DDI XML"
    fixEntities(xml)
    convertToInteractions(xml)
    # Add MTMX
    if trainMTMX != None:
        inDir = Utils.Download.getTopDir(tempdir, Utils.Download.downloadAndExtract(trainMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if testMTMX != None:
        inDir = Utils.Download.getTopDir(tempdir, Utils.Download.downloadAndExtract(testMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents.xml")
    print >> sys.stderr, "---------------", "Inserting TEES-generated analyses", "---------------"
    Utils.Download.downloadAndExtract(Settings.URL["TEES_PARSES"], os.path.join(Settings.DATAPATH, "TEES-parses"), downloadDir, redownload=redownload)
    extractedFilename = os.path.join(Settings.DATAPATH, "TEES-parses") + "/DDI11"
    print >> sys.stderr, "Making sentences"
    Tools.SentenceSplitter.makeSentences(xml, extractedFilename, None)
    print >> sys.stderr, "Inserting McCC parses"
    Tools.BLLIPParser.insertParses(xml, extractedFilename, None, extraAttributes={"source":"TEES-preparsed"})
    print >> sys.stderr, "Inserting Stanford conversions"
    Tools.StanfordParser.insertParses(xml, extractedFilename, None, extraAttributes={"stanfordSource":"TEES-preparsed"})
    print >> sys.stderr, "Protein Name Splitting"
    splitTarget = "McCC"
    #ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    ProteinNameSplitter.mainFunc(xml, None, splitTarget, removeOld=True)
    print >> sys.stderr, "Head Detection"
    #xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True)
    xml = FindHeads.findHeads(xml, splitTarget, tokenization=None, output=None, removeExisting=True)
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outDir, "DDI11", ".xml")
    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
help="") optparser.add_option("--debug", default=False, action="store_true", dest="debug", help="") optparser.add_option("--requireEntities", default=False, action="store_true", dest="requireEntities", help="") (options, args) = optparser.parse_args() if options.omitSteps != None: options.omitSteps = options.omitSteps.split(",") if not options.noLog: Stream.openLog(os.path.join(options.output + "-log.txt")) #log(False, True, os.path.join(options.output, options.corpus + "-log.txt")) preprocessor = Preprocessor() preprocessor.setArgForAllSteps("debug", options.debug) preprocessor.stepArgs("CONVERT")["corpusName"] = options.corpus preprocessor.stepArgs("PARSE")["requireEntities"] = options.requireEntities preprocessor.process(options.input, options.output, options.parameters, None, options.inputNames, fromStep=options.step, toStep=options.toStep, omitSteps=options.omitSteps)
def convert(inPath, outDir, corpusId, directed, negatives, preprocess, preprocessorParameters=None, debug=False, clear=False, constParser="BLLIP-BIO", depParser="STANFORD-CONVERT", logging=True): assert negatives in ("INCLUDE", "SKIP", "REVERSE_POS") # Download the corpus if needed if inPath == None: if not hasattr(Settings, "SE10T8_CORPUS"): SemEval2010Task8Tools.install() inPath = Settings.SE10T8_CORPUS assert os.path.exists(inPath) # Prepare the output directory if not os.path.exists(outDir): print "Making output directory", outDir os.makedirs(outDir) elif clear: print "Removing output directory", outDir shutil.rmtree(outDir) # Start logging if logging: Stream.openLog(os.path.join(outDir, "log.txt"), clear=clear) # Read and process the corpus files archive = zipfile.ZipFile(inPath, 'r') usedIds = set() tree = None for fileName, setName in [("SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT", "train"),\ ("SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT", "test")]: print "Processing file", fileName, "as set", setName f = archive.open(fileName) tree = processLines(f.readlines(), setName, directed=directed, negatives=negatives, usedIds=usedIds, tree=tree, corpusId=corpusId) f.close() # Divide the training set into training and development sets MakeSets.processCorpus(tree, None, "train", [("train", 0.7), ("devel", 1.0)], 1) # Write out the converted corpus convertedPath = os.path.join(outDir, corpusId + "-converted.xml") ETUtils.write(tree.getroot(), convertedPath) # Preprocess the converted corpus if preprocess: outPath = os.path.join(outDir, corpusId + ".xml") preprocessor = Preprocessor(constParser, depParser) preprocessor.setArgForAllSteps("debug", debug) preprocessor.stepArgs("CONVERT")["corpusName"] = corpusId preprocessor.process( convertedPath, outPath, preprocessorParameters, omitSteps=["SPLIT-SENTENCES", "NER", "SPLIT-NAMES"]) # Stop logging if logging: Stream.closeLog(os.path.join(outDir, "log.txt"))
def convertDDI(outDir, trainUnified=None, trainMTMX=None, testUnified=None, testMTMX=None,
               downloadDir=None, redownload=False, makeIntermediateFiles=True, debug=False):
    """
    Convert the DDI'11 corpus into Interaction XML (older "DrugDDI" variant).

    @param outDir: existing directory for the converted corpus files
    @param trainUnified: train set location; defaults to Settings.URL["DDI_TRAIN_UNIFIED"]
    @param trainMTMX: train MTMX data location; defaults to Settings.URL["DDI_TRAIN_MTMX"]
    @param testUnified: test set location; defaults to Settings.URL["DDI_TEST_UNIFIED"]
    @param testMTMX: test MTMX data location; defaults to Settings.URL["DDI_TEST_MTMX"]
    @param downloadDir: unused here other than for interface compatibility — TODO confirm
    @param redownload: force re-download of source files — TODO confirm usage
    @param makeIntermediateFiles: write intermediate corpus XML files
    @param debug: keep the temporary directory for inspection
    """
    cwd = os.getcwd()
    # NOTE(review): unlike the newer convertDDI, outDir is not created if missing — confirm callers guarantee it exists.
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="
    bigfileName = os.path.join(outDir, "DDI")
    #oldXML = ETUtils.ETFromObj(bigfileName+".xml")
    if trainUnified == None:
        trainUnified = Settings.URL["DDI_TRAIN_UNIFIED"]
    if trainMTMX == None:
        trainMTMX = Settings.URL["DDI_TRAIN_MTMX"]
    if testUnified == None:
        testUnified = Settings.URL["DDI_TEST_UNIFIED"]
    if testMTMX == None:
        testMTMX = Settings.URL["DDI_TEST_MTMX"]
    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    if True: # (kept from original: groups the loading/division phase)
        documents, docById, docCounts = loadDocs(trainUnified, outDir, tempdir)
        # Divide training data into train/devel: documents are sorted by their
        # counts and taken in groups of four — three to train, one to devel.
        sortedDocCounts = sorted(docCounts.iteritems(), key=lambda (k,v): (v,k), reverse=True)
        datasetCounts = {"train":[0,0], "devel":[0,0], "test":[0,0]}
        for i in range(0, len(sortedDocCounts)-3, 4):
            for j in [0,1]:
                docById[sortedDocCounts[i+j][0]].set("set", "train")
                datasetCounts["train"][0] += sortedDocCounts[i+j][1][0]
                datasetCounts["train"][1] += sortedDocCounts[i+j][1][1]
            docById[sortedDocCounts[i+2][0]].set("set", "train") #docById[sortedDocCounts[i+2][0]].set("set", "devel")
            docById[sortedDocCounts[i+3][0]].set("set", "devel") #docById[sortedDocCounts[i+3][0]].set("set", "test")
            datasetCounts["train"][0] += sortedDocCounts[i+2][1][0] #datasetCounts["devel"][0] += sortedDocCounts[i+2][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i+2][1][1] #datasetCounts["devel"][1] += sortedDocCounts[i+2][1][1]
            datasetCounts["devel"][0] += sortedDocCounts[i+3][1][0] #datasetCounts["test"][0] += sortedDocCounts[i+3][1][0]
            datasetCounts["devel"][1] += sortedDocCounts[i+3][1][1] #datasetCounts["test"][1] += sortedDocCounts[i+3][1][1]
        for document in documents: # leftovers not fitting a group of four go to train (orig. Finnish: "epajaolliset jaa yli")
            if document.get("set") == None:
                document.set("set", "train")
        print datasetCounts
        for key in datasetCounts.keys():
            if datasetCounts[key][1] != 0:
                print key, datasetCounts[key][0] / float(datasetCounts[key][1])
            else:
                print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])
        if testUnified != None:
            testDocuments, testDocById, testDocCounts = loadDocs(testUnified, tempdir)
            for document in testDocuments:
                document.set("set", "test")
            documents = documents + testDocuments
    # Add all documents into one XML
    xmlTree = ET.ElementTree(ET.Element("corpus"))
    root = xmlTree.getroot()
    root.set("source", "DrugDDI")
    for document in documents:
        root.append(document)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
    xml = xmlTree
    print >> sys.stderr, "Fixing DDI XML"
    fixEntities(xml)
    convertToInteractions(xml)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents.xml")
    #sys.exit()
    if False: # disabled: parsing from scratch (kept for reference)
        print >> sys.stderr, "Parsing"
        Tools.CharniakJohnsonParser.parse(xml, bigfileName+"-parsed.xml", tokenizationName=None, parseName="McClosky", requireEntities=True, timeout=10)
        print >> sys.stderr, "Stanford Conversion"
        Tools.StanfordParser.convertXML("McClosky", xml, bigfileName+"-stanford.xml")
    #if True:
        #xml = bigfileName + "-stanford.xml"
    print >> sys.stderr, "Protein Name Splitting"
    splitTarget = "McClosky"
    xml = ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    print >> sys.stderr, "Head Detection"
    xml = FindHeads.findHeads(xml, "split-McClosky", tokenization=None, output=bigfileName+".xml", removeExisting=True)
    print >> sys.stderr, "Dividing into sets"
    InteractionXML.DivideSets.processCorpus(xml, outDir, "DrugDDI-", ".xml", [("devel", "train", "test"), ("devel", "train")])
    #InteractionXML.DivideSets.processCorpus(oldXML, outDir, "DrugDDI-", ".xml", [("devel", "train", "test"), ("devel", "train")])
    #InteractionXML.DivideSets.processCorpus(bigfileName+".xml", outDir, "DrugDDI-", ".xml", [("devel", "train", "test"), ("devel", "train")])
    #if "devel" in [x[0] for x in datasets]:
    #    print >> sys.stderr, "Creating empty devel set"
    #    deletionRules = {"interaction":{},"entity":{"isName":"False"}}
    #    InteractionXML.DeleteElements.processCorpus(corpusName + "-devel.xml", corpusName + "-devel-empty.xml", deletionRules)
    #return xml
    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
    # NOTE(review): the three statements below are the tail of an enclosing
    # parse-insertion function whose definition is outside this chunk; they
    # insert the BioNLP'11 Stanford conversions and clean up its temp directory.
    Tools.StanfordParser.insertParses(xml, tempdir + "/" + os.path.basename(files[corpus + "_" + setName.upper() + "_McCC"])[:-len(".tar.gz")].split("-", 2)[-1] + "/mccc/sd_ccproc", None, extraAttributes={"stanfordSource":"BioNLP'11"})
    print >> sys.stderr, "Removing temporary directory", tempdir
    shutil.rmtree(tempdir)

def processParses(xml, splitTarget="McCC"):
    """
    Post-process a parsed corpus: split multi-part protein names into separate
    tokens and mark the syntactic head token of each entity.

    @param xml: the corpus as an ElementTree (modified in place by the splitter)
    @param splitTarget: name of the tokenization/parse to process
    """
    print >> sys.stderr, "Protein Name Splitting"
    ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    print >> sys.stderr, "Head Detection"
    xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True)

if __name__=="__main__":
    # Import Psyco if available
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"

    from optparse import OptionParser
    from Utils.Parameters import *
    optparser = OptionParser(usage="%prog [options]\nBioNLP'11 Shared Task corpus conversion")
    optparser.add_option("-c", "--corpora", default="GE", dest="corpora", help="corpus names in a comma-separated list, e.g. \"GE,EPI,ID\"")
    optparser.add_option("-o", "--outdir", default=os.path.normpath(Settings.DATAPATH + "/corpora"), dest="outdir", help="directory for output files")
    optparser.add_option("-d", "--downloaddir", default=None, dest="downloaddir", help="directory to download corpus files to")
    optparser.add_option("--intermediateFiles", default=False, action="store_true", dest="intermediateFiles", help="save intermediate corpus files")
    optparser.add_option("--forceDownload", default=False, action="store_true", dest="forceDownload", help="re-download all source files")
    (options, args) = optparser.parse_args()
    # Log the whole conversion run into the output directory
    Stream.openLog(os.path.join(options.outdir, "conversion-log.txt"))
    convert(options.corpora.split(","), options.outdir, options.downloaddir, options.forceDownload, options.intermediateFiles)