def addClassifierModel(self, model, classifierModelPath, classifierParameters, threshold=None):
    classifierModel = model.get(self.tag+"classifier-model", True)
    shutil.copy2(classifierModelPath, classifierModel)
    model.addStr(self.tag+"classifier-parameter", Parameters.toString(Parameters.get(classifierParameters)))
    if threshold != None:
        model.addStr(self.tag+"threshold", str(threshold))
    return classifierModel
def getConnection(connection): #, account=None, workDirBase=None, remoteSettingsPath=None):
    if connection == None: # return a "dummy" local connection
        return getConnection("connection=Unix:jobLimit=1")
    elif type(connection) in types.StringTypes and hasattr(Settings, connection): # connection is a Settings key
        print >> sys.stderr, "Using connection", connection
        return getConnection(getattr(Settings, connection))
        #return getConnection(*getattr(Settings, connection))
    else: # connection is a parameter string or dictionary
        defaultParams = dict.fromkeys(["connection", "account", "workdir", "settings", "memory", "cores", "modules", "wallTime", "jobLimit", "preamble", "debug"])
        defaultParams["debug"] = False
        connection = Parameters.get(connection, valueListKey="connection", valueTypes={"debug":[bool]}, defaults=defaultParams)
        if connection["connection"] == None:
            connection["connection"] = "Unix"
        if connection["account"] == None:
            assert connection["workdir"] == None
            #assert remoteSettingsPath == None
            print >> sys.stderr, "New local connection", Parameters.toString(connection)
        else:
            print >> sys.stderr, "New remote connection:", Parameters.toString(connection)
        # Make the connection
        exec "ConnectionClass = " + connection["connection"] + "Connection"
        connectionArgs = {}
        for key in connection:
            if key != "connection" and connection[key] != None:
                connectionArgs[key] = connection[key]
        return ConnectionClass(**connectionArgs)
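# Usage sketch (not part of the original module): getConnection above accepts either a Settings
# key or a "key=value:key=value" parameter string. The helper below is a hypothetical, simplified
# stand-in for that parsing step, written only to illustrate the string format; it ignores
# flag-style entries without "=" and is not the real Parameters implementation.
def parseConnectionStringSketch(paramString):
    # "connection=Unix:jobLimit=1" -> {"connection": "Unix", "jobLimit": "1"}
    params = {}
    for pair in paramString.split(":"):
        if "=" in pair:
            key, value = pair.split("=", 1)
            params[key] = value
    return params
# parseConnectionStringSketch("connection=Unix:jobLimit=1") == {"connection": "Unix", "jobLimit": "1"};
# getConnection then fills the remaining known keys (account, workdir, ...) with None and resolves
# the class to instantiate by appending "Connection" to the "connection" value.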
def getSteps(step, omitSteps, mainSteps):
    # Determine substep to start from, for the main step from which processing starts
    step = Parameters.get(step, mainSteps)
    fromMainStep = None
    fromSubStep = {} # The substep to start from, for the main step to start from
    for mainStep in step.keys():
        fromSubStep[mainStep] = step[mainStep] # the sub step to start from
        if step[mainStep] != None:
            assert fromMainStep == None # processing can start from one place only
            fromMainStep = mainStep
            if step[mainStep] == True:
                fromSubStep[mainStep] = None
            else:
                assert type(step[mainStep]) in types.StringTypes # no list allowed, processing can start from one place only
    # Determine steps to omit
    omitSubSteps = {} # Skip these substeps. If the value is True, skip the entire main step.
    omitMainSteps = []
    omitSteps = Parameters.get(omitSteps, mainSteps)
    for mainStep in omitSteps.keys():
        omitSubSteps[mainStep] = omitSteps[mainStep]
        if omitSteps[mainStep] == True:
            omitMainSteps.append(mainStep)
            omitSubSteps[mainStep] = None
    # Initialize main step selector
    if fromMainStep != None:
        if fromSubStep[fromMainStep] != None:
            print >> sys.stderr, "Starting process from step", fromMainStep + ", substep", fromSubStep[fromMainStep]
        else:
            print >> sys.stderr, "Starting process from step", fromMainStep
    selector = StepSelector(mainSteps, fromStep=fromMainStep, omitSteps=omitMainSteps)
    return selector, fromSubStep, omitSubSteps
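# Illustration (hypothetical values, not from the original source): getSteps is driven by the
# mapping of main step -> substep that Parameters.get is assumed to build from a "STEP=SUBSTEP"
# string. For a start point of "DEVEL=MODELS" with main steps TRAIN/DEVEL/EMPTY/TEST, the mapping
# would look like the dict below; only one main step may carry a non-None value, and that value
# is the substep to resume from.
exampleStepMap = {"TRAIN": None, "DEVEL": "MODELS", "EMPTY": None, "TEST": None}
exampleFromMainStep = [k for k in ["TRAIN", "DEVEL", "EMPTY", "TEST"] if exampleStepMap[k] != None][0]
assert exampleFromMainStep == "DEVEL"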
def saveModel(self, teesModel, tag=""): if hasattr(self, "model") and self.model != None: teesModelPath = teesModel.get(tag+"classifier-model", True) shutil.copy2(self.model, teesModelPath) if hasattr(self, "parameters") and self.parameters != None: teesModel.addStr(tag+"classifier-parameter", Parameters.toString(Parameters.get(self.parameters))) if hasattr(self, "threshold") and self.threshold != None: teesModel.addStr(tag+"threshold", str(self.threshold))
def optimize(self, examples, outDir, parameters, classifyExamples, classIds, step="BOTH", evaluator=None, determineThreshold=False, timeout=None, downloadAllModels=False):
    assert step in ["BOTH", "SUBMIT", "RESULTS"], step
    outDir = os.path.abspath(outDir)
    # Initialize training (or reconnect to existing jobs)
    combinations = Parameters.getCombinations(Parameters.get(parameters, valueListKey="c")) #Core.OptimizeParameters.getParameterCombinations(parameters)
    trained = []
    for combination in combinations:
        trained.append( self.train(examples, outDir, combination, classifyExamples, replaceRemoteExamples=(len(trained) == 0), dummy=(step == "RESULTS")) )
    if step == "SUBMIT": # Return already
        classifier = copy.copy(self)
        classifier.setState("OPTIMIZE")
        return classifier
    # Wait for the training to finish
    finalJobStatus = self.connection.waitForJobs([x.getJob() for x in trained])
    # Evaluate the results
    print >> sys.stderr, "Evaluating results"
    #Stream.setIndent(" ")
    bestResult = None
    if evaluator == None:
        evaluator = self.defaultEvaluator
    for i in range(len(combinations)):
        id = trained[i].parameterIdStr
        #Stream.setIndent(" ")
        # Get predictions
        predictions = None
        if trained[i].getStatus() == "FINISHED":
            predictions = trained[i].downloadPredictions()
        else:
            print >> sys.stderr, "No results for combination" + id
            continue
        if downloadAllModels:
            trained[i].downloadModel()
        # Compare to other results
        print >> sys.stderr, "*** Evaluating results for combination" + id + " ***"
        threshold = None
        if determineThreshold:
            print >> sys.stderr, "Thresholding, original micro =",
            evaluation = evaluator.evaluate(classifyExamples, predictions, classIds, os.path.join(outDir, "evaluation-before-threshold" + id + ".csv"), verbose=False)
            print >> sys.stderr, evaluation.microF.toStringConcise()
            threshold, bestF = evaluator.threshold(classifyExamples, predictions)
            print >> sys.stderr, "threshold =", threshold, "at binary fscore", str(bestF)[0:6]
        evaluation = evaluator.evaluate(classifyExamples, ExampleUtils.loadPredictions(predictions, threshold=threshold), classIds, os.path.join(outDir, "evaluation" + id + ".csv"))
        if bestResult == None or evaluation.compare(bestResult[0]) > 0: #: averageResult.fScore > bestResult[1].fScore:
            bestResult = [evaluation, trained[i], combinations[i], threshold]
        if not self.connection.isLocal():
            os.remove(predictions) # remove predictions to save space
    #Stream.setIndent()
    if bestResult == None:
        raise Exception("No results for any parameter combination")
    print >> sys.stderr, "*** Evaluation complete", finalJobStatus, "***"
    print >> sys.stderr, "Selected parameters", bestResult[2]
    classifier = copy.copy(bestResult[1])
    classifier.threshold = bestResult[3]
    classifier.downloadModel()
    return classifier
def saveModel(self, teesModel, tag=""): if hasattr(self, "model") and self.model != None: teesModelPath = teesModel.get(tag + "classifier-model", True) shutil.copy2(self.model, teesModelPath) if hasattr(self, "parameters") and self.parameters != None: teesModel.addStr( tag + "classifier-parameter", Parameters.toString(Parameters.get(self.parameters))) if hasattr(self, "threshold") and self.threshold != None: teesModel.addStr(tag + "threshold", str(self.threshold))
def doGrid(self):
    print >> sys.stderr, "--------- Booster parameter search ---------"
    # Build trigger examples
    self.triggerDetector.buildExamples(self.model, [self.optData], [self.workDir+"grid-trigger-examples.gz"])
    if self.fullGrid:
        # Parameters to optimize
        ALL_PARAMS = {"trigger":[int(i) for i in Parameters.get(self.triggerClassifierParameters, valueListKey="c")["c"]],
                      "booster":[float(i) for i in self.recallAdjustParameters.split(",")],
                      "edge":[int(i) for i in Parameters.get(self.edgeClassifierParameters, valueListKey="c")["c"]]}
    else:
        ALL_PARAMS = {"trigger":Parameters.get(self.model.getStr(self.triggerDetector.tag+"classifier-parameter"), valueListKey="c")["c"],
                      "booster":[float(i) for i in self.recallAdjustParameters.split(",")],
                      "edge":Parameters.get(self.model.getStr(self.edgeDetector.tag+"classifier-parameter"), valueListKey="c")["c"]}
    paramCombinations = Parameters.getCombinations(ALL_PARAMS, ["trigger", "booster", "edge"])
    prevParams = None
    EDGE_MODEL_STEM = os.path.join(self.edgeDetector.workDir, os.path.normpath(self.model.path)+"-edge-models/model-c_")
    TRIGGER_MODEL_STEM = os.path.join(self.triggerDetector.workDir, os.path.normpath(self.model.path)+"-trigger-models/model-c_")
    bestResults = None
    for i in range(len(paramCombinations)):
        params = paramCombinations[i]
        print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        print >> sys.stderr, "Processing params", str(i+1) + "/" + str(len(paramCombinations)), params
        print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        # Triggers and Boost
        if prevParams == None or prevParams["trigger"] != params["trigger"] or prevParams["booster"] != params["booster"]:
            print >> sys.stderr, "Classifying trigger examples for parameters", "trigger:" + str(params["trigger"]), "booster:" + str(params["booster"])
            xml = self.triggerDetector.classifyToXML(self.optData, self.model, self.workDir+"grid-trigger-examples.gz", self.workDir+"grid-", classifierModel=TRIGGER_MODEL_STEM+str(params["trigger"]), recallAdjust=params["booster"])
        prevParams = params
        # Build edge examples
        self.edgeDetector.buildExamples(self.model, [xml], [self.workDir+"grid-edge-examples.gz"], [self.optData])
        # Classify with pre-defined model
        edgeClassifierModel = EDGE_MODEL_STEM+str(params["edge"])
        xml = self.edgeDetector.classifyToXML(xml, self.model, self.workDir+"grid-edge-examples.gz", self.workDir+"grid-", classifierModel=edgeClassifierModel)
        bestResults = self.evaluateGrid(xml, params, bestResults)
    print >> sys.stderr, "Booster search complete"
    print >> sys.stderr, "Tested", len(paramCombinations), "combinations"
    print >> sys.stderr, "Best parameters:", bestResults[0]
    print >> sys.stderr, "Best result:", bestResults[2] # f-score
    # Save grid model
    self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.model)
    self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.combinedModel, False)
    if self.fullGrid: # define best models
        self.triggerDetector.addClassifierModel(self.model, TRIGGER_MODEL_STEM+str(bestResults[0]["trigger"]), bestResults[0]["trigger"])
        self.edgeDetector.addClassifierModel(self.model, EDGE_MODEL_STEM+str(bestResults[0]["edge"]), bestResults[0]["edge"])
    # Remove work files
    for stepTag in [self.workDir+"grid-trigger", self.workDir+"grid-edge", self.workDir+"grid-unmerging"]:
        for fileStem in ["-classifications", "-classifications.log", "examples.gz", "pred.xml.gz"]:
            if os.path.exists(stepTag+fileStem):
                os.remove(stepTag+fileStem)
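# Illustration (assumption about behavior, not the original implementation): Parameters.getCombinations
# is used above as a cartesian product over the listed keys, yielding one
# {"trigger": ..., "booster": ..., "edge": ...} dict per grid point. The standalone sketch below
# reproduces that assumed behavior with itertools so the size of the booster search grid is easy
# to reason about.
import itertools

def getCombinationsSketch(paramLists, keyOrder):
    # paramLists example: {"trigger": [10000, 50000], "booster": [0.6, 0.8], "edge": [28000]}
    return [dict(zip(keyOrder, values))
            for values in itertools.product(*[paramLists[key] for key in keyOrder])]

# len(getCombinationsSketch({"trigger": [10000, 50000], "booster": [0.6, 0.8], "edge": [28000]},
#                           ["trigger", "booster", "edge"])) == 4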
def trainUnmergingDetector(self):
    xml = None
    if not self.unmerging:
        print >> sys.stderr, "No unmerging"
    if self.checkStep("SELF-TRAIN-EXAMPLES-FOR-UNMERGING", self.unmerging) and self.unmerging:
        # Self-classified train data for unmerging
        if self.doUnmergingSelfTraining:
            # This allows limiting to a subcorpus
            triggerStyle = copy.copy(Parameters.get(self.triggerExampleStyle))
            edgeStyle = copy.copy(Parameters.get(self.edgeExampleStyle))
            unmergingStyle = Parameters.get(self.unmergingExampleStyle)
            if "sentenceLimit" in unmergingStyle and unmergingStyle["sentenceLimit"]:
                triggerStyle["sentenceLimit"] = unmergingStyle["sentenceLimit"]
                edgeStyle["sentenceLimit"] = unmergingStyle["sentenceLimit"]
            # Build the examples
            xml = self.triggerDetector.classifyToXML(self.trainData, self.model, None, self.workDir+"unmerging-extra-", exampleStyle=triggerStyle) #, recallAdjust=0.5)
            xml = self.edgeDetector.classifyToXML(xml, self.model, None, self.workDir+"unmerging-extra-", exampleStyle=edgeStyle) #, recallAdjust=0.5)
            assert xml != None
            EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.trainData, self.parse)
        else:
            print >> sys.stderr, "No self-training for unmerging"
    if self.checkStep("UNMERGING-EXAMPLES", self.unmerging) and self.unmerging:
        # Unmerging example generation
        GOLD_TEST_FILE = self.optData.replace("-nodup", "")
        GOLD_TRAIN_FILE = self.trainData.replace("-nodup", "")
        if self.doUnmergingSelfTraining:
            if xml == None:
                xml = self.workDir+"unmerging-extra-edge-pred.xml.gz"
            self.unmergingDetector.buildExamples(self.model,
                                                 [self.optData.replace("-nodup", ""), [self.trainData.replace("-nodup", ""), xml]],
                                                 [self.workDir+"unmerging-opt-examples.gz", self.workDir+"unmerging-train-examples.gz"],
                                                 [GOLD_TEST_FILE, [GOLD_TRAIN_FILE, GOLD_TRAIN_FILE]],
                                                 exampleStyle=self.unmergingExampleStyle, saveIdsToModel=True)
            xml = None
        else:
            self.unmergingDetector.buildExamples(self.model,
                                                 [self.optData.replace("-nodup", ""), self.trainData.replace("-nodup", "")],
                                                 [self.workDir+"unmerging-opt-examples.gz", self.workDir+"unmerging-train-examples.gz"],
                                                 [GOLD_TEST_FILE, GOLD_TRAIN_FILE],
                                                 exampleStyle=self.unmergingExampleStyle, saveIdsToModel=True)
            xml = None
        #UnmergingExampleBuilder.run("/home/jari/biotext/EventExtension/TrainSelfClassify/test-predicted-edges.xml", GOLD_TRAIN_FILE, UNMERGING_TRAIN_EXAMPLE_FILE, PARSE, TOK, UNMERGING_FEATURE_PARAMS, UNMERGING_IDS, append=True)
    if self.checkStep("BEGIN-UNMERGING-MODEL", self.unmerging) and self.unmerging:
        self.unmergingDetector.beginModel(None, self.model, self.workDir+"unmerging-train-examples.gz", self.workDir+"unmerging-opt-examples.gz")
    if self.checkStep("END-UNMERGING-MODEL", self.unmerging) and self.unmerging:
        self.unmergingDetector.endModel(None, self.model, self.workDir+"unmerging-opt-examples.gz")
        print >> sys.stderr, "Adding unmerging classifier model to test-set event model"
        if self.combinedModel != None:
            self.combinedModel.addStr("unmerging-example-style", self.model.getStr("unmerging-example-style"))
            self.combinedModel.insert(self.model.get("unmerging-ids.classes"), "unmerging-ids.classes")
            self.combinedModel.insert(self.model.get("unmerging-ids.features"), "unmerging-ids.features")
            self.unmergingDetector.addClassifierModel(self.combinedModel, self.model.get("unmerging-classifier-model", True), self.model.getStr("unmerging-classifier-parameter"))
            self.combinedModel.save()
def getClassifier(self, parameters):
    parameters = Parameters.get(parameters, ["TEES.threshold", "TEES.classifier", "c"], valueListKey="c")
    if parameters["TEES.classifier"] == None:
        return self.Classifier
    else:
        exec "from Classifiers." + parameters["TEES.classifier"] + " import " + parameters["TEES.classifier"] + " as " + parameters["TEES.classifier"]
        return eval(parameters["TEES.classifier"])
def train(self, trainData=None, optData=None, model=None, combinedModel=None, exampleStyle=None, classifierParameters=None, parse=None, tokenization=None, task=None, fromStep=None, toStep=None, workDir=None):
    exampleStyle = Parameters.cat(exampleStyle, "keep_neg:no_features")
    EdgeDetector.train(self, trainData, optData, model, combinedModel, exampleStyle, classifierParameters, parse, tokenization, fromStep, toStep)
    self.classify(trainData, model, "classification-train/train", goldData=trainData, workDir="classification-train")
def train(self, examples, outDir, parameters, classifyExamples=None, dummy=False):
    outDir = os.path.abspath(outDir)
    examples = self.getExampleFile(examples, dummy=dummy)
    classifyExamples = self.getExampleFile(classifyExamples, dummy=dummy)
    # Return a new classifier instance for following the training process and using the model
    classifier = copy.copy(self)
    classifier.parameters = parameters
    classifier._filesToRelease = [examples, classifyExamples]
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    trainFeatures, trainClasses = datasets.load_svmlight_file(examples)
    develFeatures, develClasses = None, None # only defined when a devel set is given
    if classifyExamples != None:
        develFeatures, develClasses = datasets.load_svmlight_file(classifyExamples, trainFeatures.shape[1])
    binarizer = preprocessing.LabelBinarizer()
    binarizer.fit(trainClasses)
    trainClasses = binarizer.transform(trainClasses)
    if classifyExamples != None:
        develClasses = binarizer.transform(develClasses)
    print >> sys.stderr, "Training Keras model with parameters:", parameters
    parameters = Parameters.get(parameters, {"TEES.classifier":"KerasClassifier", "layers":5, "lr":0.001, "epochs":1, "batch_size":64, "patience":10})
    np.random.seed(10)
    classifier.kerasModel = classifier._defineModel(outDir, parameters, trainFeatures, trainClasses, develFeatures, develClasses)
    classifier._fitModel(outDir, parameters, trainFeatures, trainClasses, develFeatures, develClasses)
    return classifier
def classifyToXML(self, data, model, exampleFileName=None, tag="", classifierModel=None, goldData=None, parse=None, recallAdjust=None, compressExamples=True, exampleStyle=None):
    model = self.openModel(model, "r")
    if parse == None:
        parse = self.getStr(self.tag+"parse", model)
    if exampleFileName == None:
        exampleFileName = tag+self.tag+"examples"
        if compressExamples:
            exampleFileName += ".gz"
    self.buildExamples(model, [data], [exampleFileName], [goldData], parse=parse, exampleStyle=exampleStyle)
    if classifierModel == None:
        classifierModel = model.get(self.tag+"classifier-model")
    else:
        assert os.path.exists(classifierModel), classifierModel
    classifier = self.Classifier()
    classifier.classify(exampleFileName, tag+self.tag+"classifications", classifierModel, finishBeforeReturn=True)
    threshold = model.getStr(self.tag+"threshold", defaultIfNotExist=None, asType=float)
    predictions = ExampleUtils.loadPredictions(tag+self.tag+"classifications", recallAdjust, threshold=threshold)
    evaluator = self.evaluator.evaluate(exampleFileName, predictions, model.get(self.tag+"ids.classes"))
    #outputFileName = tag+"-"+self.tag+"pred.xml.gz"
    #exampleStyle = self.exampleBuilder.getParameters(model.getStr(self.tag+"example-style"))
    if exampleStyle == None:
        exampleStyle = Parameters.get(model.getStr(self.tag+"example-style")) # no checking, but these should already have passed the ExampleBuilder
    return self.exampleWriter.write(exampleFileName, predictions, data, tag+self.tag+"pred.xml.gz", model.get(self.tag+"ids.classes"), parse, exampleStyle=exampleStyle)
    #if evaluator.getData().getTP() + evaluator.getData().getFP() > 0:
    #    return self.exampleWriter.write(exampleFileName, predictions, data, outputFileName, model.get(self.tag+"ids.classes"), parse)
    #else:
    #    # TODO: e.g. interactions must be removed if task does unmerging
    #    print >> sys.stderr, "No positive", self.tag + "predictions, XML file", outputFileName, "unchanged from input"
    #    if type(data) in types.StringTypes: # assume its a file
    #        shutil.copy(data, outputFileName)
    #    else: # assume its an elementtree
    #        ETUtils.write(data, outputFileName)
    #    #print >> sys.stderr, "No positive predictions, XML file", tag+self.tag+"pred.xml", "not written"
    #    return data #None
def getBioNLPSharedTaskParams(self, parameters=None, model=None):
    if parameters == None:
        if model != None:
            model = self.openModel(model, "r")
            parameters = model.getStr("BioNLPSTParams", defaultIfNotExist=None)
        else:
            parameters = {}
    return Parameters.get(parameters, ["convert", "evaluate", "scores", "a2Tag"])
def getClassifier(self, parameters):
    #parameters = Parameters.get(parameters, ["TEES.threshold", "TEES.classifier", "c"], valueListKey="c")
    parameters = Parameters.get(parameters, ["TEES.threshold", "TEES.classifier"], allowNew=True, valueListKey="c")
    if parameters["TEES.classifier"] == None:
        return self.Classifier
    else:
        exec "from Classifiers." + parameters["TEES.classifier"] + " import " + parameters["TEES.classifier"] + " as " + parameters["TEES.classifier"]
        return eval(parameters["TEES.classifier"])
def learnSettings(inputFiles, detector, classifierParameters):
    if detector == None:
        print >> sys.stderr, "*** Analyzing input files to determine training settings ***"
        structureAnalyzer = StructureAnalyzer()
        if not os.path.exists("training/structure.txt"):
            datasets = sorted(filter(None, [inputFiles["train"], inputFiles["devel"]]))
            print >> sys.stderr, "input files:", datasets
            structureAnalyzer.analyze(datasets)
            print >> sys.stderr, structureAnalyzer.toString()
            structureAnalyzer.save(None, "training/structure.txt")
        else:
            print >> sys.stderr, "Using existing analysis from training/structure.txt"
            structureAnalyzer.load(None, "training/structure.txt")
    # Choose detector
    if detector == None:
        if "ENTITY" in structureAnalyzer.targets and "INTERACTION" in structureAnalyzer.targets:
            detector = "Detectors.EventDetector"
        elif "ENTITY" in structureAnalyzer.targets:
            detector = "Detectors.EntityDetector"
        elif "INTERACTION" in structureAnalyzer.targets:
            detector = "Detectors.EdgeDetector"
        else:
            assert False, structureAnalyzer.targets
    print >> sys.stderr, "Using detector '" + str(detector) + "'"
    # Set default parameters
    if detector == "Detectors.EventDetector":
        classifierParameters["unmerging"] = Parameters.cat("c=1,10,100,500,1000,1500,2500,5000,10000,20000,50000,80000,100000", classifierParameters["unmerging"], "Classifier parameters for unmerging")
        classifierParameters["modifiers"] = Parameters.cat("c=5000,10000,20000,50000,100000", classifierParameters["modifiers"], "Classifier parameters for modifiers")
        classifierParameters["edge"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", classifierParameters["edge"], "Classifier parameters for edges")
        classifierParameters["trigger"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000", classifierParameters["trigger"], "Classifier parameters for triggers")
        classifierParameters["recall"] = Parameters.cat("0.5,0.6,0.65,0.7,0.85,1.0,1.1,1.2", classifierParameters["recall"], "Recall adjustment parameters")
    elif detector == "Detectors.EntityDetector":
        classifierParameters["examples"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000", classifierParameters["examples"], "Classifier parameters for entities")
    elif detector == "Detectors.EdgeDetector":
        classifierParameters["examples"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", classifierParameters["examples"], "Classifier parameters for edges")
    return detector
def getParameters(self, parameters=None, model=None, defaultValue=None, modelParameterStringName=None):
    if modelParameterStringName == None:
        modelParameterStringName = self.modelParameterStringName
    if parameters == None and model != None:
        model = self.openModel(model, "r")
        parameters = model.getStr(modelParameterStringName, defaultIfNotExist=None)
    defaultStepNames = [x[0] for x in self.getDefaultSteps()]
    valueLimits = {"omitSteps":defaultStepNames + [None], "intermediateFiles":defaultStepNames + [True, None]}
    defaults = self.getDefaultParameters(defaultValue=defaultValue)
    return Parameters.get(parameters, defaults, valueLimits=valueLimits)
def train(self, examples, outDir, parameters, classifyExamples=None, finishBeforeReturn=False, replaceRemoteExamples=True, dummy=False):
    outDir = os.path.abspath(outDir)
    examples = self.getExampleFile(examples, replaceRemote=replaceRemoteExamples, dummy=dummy)
    classifyExamples = self.getExampleFile(classifyExamples, replaceRemote=replaceRemoteExamples, dummy=dummy)
    parameters = Parameters.get(parameters, valueListKey="c")
    trainDir = self.connection.getSetting(self.trainDirSetting)
    # Return a new classifier instance for following the training process and using the model
    classifier = copy.copy(self)
    classifier.setState("TRAIN")
    classifier.parameters = parameters
    classifier._filesToRelease = [examples, classifyExamples]
    # Train
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    trainCommand = os.path.join(trainDir, self.trainCommand)
    paramKeys = sorted(parameters.keys())
    idStr = ""
    paramString = ""
    for key in paramKeys:
        if key.startswith("TEES."):
            continue
        if len(paramString) > 0 and not paramString.endswith(" "):
            paramString += " "
        if parameters[key] != None:
            paramString += self.parameterFormat.replace("%k", key).replace("%v", str(parameters[key])).strip()
            idStr += "-" + str(key) + "_" + str(parameters[key])
        else:
            paramString += self.parameterFormat.replace("%k", key).replace("%v", "").strip()
            idStr += "-" + str(key)
    classifier.parameterIdStr = idStr
    classifier.model = self.connection.getRemotePath(outDir + "/model" + idStr, True)
    modelPath = self.connection.getRemotePath(outDir + "/model" + idStr, False)
    trainCommand = trainCommand.replace("%p", paramString).replace("%e", examples).replace("%m", modelPath).strip()
    self.connection.addCommand(trainCommand)
    # Classify with the trained model (optional)
    if classifyExamples != None:
        classifier.predictions = self.connection.getRemotePath(outDir + "/predictions" + idStr, True)
        predictionsPath = self.connection.getRemotePath(outDir + "/predictions" + idStr, False)
        classifyDir = self.connection.getSetting(self.classifyDirSetting)
        classifyCommand = os.path.join(classifyDir, self.classifyCommand).replace("%e", classifyExamples).replace("%m", modelPath).replace("%c", predictionsPath).strip()
        self.connection.addCommand(classifyCommand)
    # Run the process
    jobName = self.trainCommand.split()[0] + idStr
    logPath = outDir + "/" + jobName
    if dummy: # return a classifier that connects to an existing job
        self.connection.clearCommands()
        classifier._job = self.connection.getJob(jobDir=outDir, jobName=jobName)
    else: # submit the job
        classifier._job = self.connection.submit(jobDir=outDir, jobName=jobName, stdout=logPath+".stdout")
    if finishBeforeReturn:
        self.connection.waitForJob(classifier._job)
        self.getStatus()
    return classifier
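# Illustration (standalone sketch, hypothetical values): the loop above expands a parameter
# format template such as "-%k %v" once per non-TEES parameter, producing both a command-line
# fragment and the "-key_value" identifier used in model, prediction and job names. The helper
# below reproduces only that string handling in isolation; "-%k %v" is an assumed example format,
# not necessarily the value of self.parameterFormat.
def buildParamStringsSketch(parameters, parameterFormat="-%k %v"):
    paramString, idStr = "", ""
    for key in sorted(parameters.keys()):
        if key.startswith("TEES."): # internal TEES options are not passed to the external tool
            continue
        if len(paramString) > 0 and not paramString.endswith(" "):
            paramString += " "
        if parameters[key] != None:
            paramString += parameterFormat.replace("%k", key).replace("%v", str(parameters[key])).strip()
            idStr += "-" + str(key) + "_" + str(parameters[key])
        else:
            paramString += parameterFormat.replace("%k", key).replace("%v", "").strip()
            idStr += "-" + str(key)
    return paramString, idStr

# buildParamStringsSketch({"c": 1000, "TEES.classifier": "SVMMultiClassClassifier"})
# -> ("-c 1000", "-c_1000")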
def initModel(self, model, saveParams=[]):
    if model == None:
        return model
    elif type(model) in types.StringTypes:
        model = self.openModel(model, "w")
    else:
        assert model.mode in ["a", "w"]
    for param in saveParams:
        model.addStr(param[1], Parameters.toString(getattr(self, param[0])))
    model.save()
    return model
def train(self, examples, outDir, parameters, classifyExamples=None, finishBeforeReturn=False, replaceRemoteExamples=True, dummy=False):
    outDir = os.path.abspath(outDir)
    examples = self.getExampleFile(examples, replaceRemote=replaceRemoteExamples, dummy=dummy)
    classifyExamples = self.getExampleFile(classifyExamples, replaceRemote=replaceRemoteExamples, dummy=dummy)
    #parameters = Parameters.get(parameters, valueListKey="c")
    trainDir = ""
    if self.trainDirSetting:
        trainDir = os.path.normpath(self.connection.getSetting(self.trainDirSetting)) + os.path.sep
    # Return a new classifier instance for following the training process and using the model
    classifier = copy.copy(self)
    classifier.setState("TRAIN")
    classifier.parameters = parameters
    classifier._filesToRelease = [examples, classifyExamples]
    # Train
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    #trainCommand = os.path.join(trainDir, self.trainCommand)
    trainCommand = self.trainCommand.replace("%d", trainDir)
    parameters = Parameters.get(parameters, self.parameterDefaults["train"], self.parameterAllowNew["train"], self.parameterValueListKey["train"], self.parameterValueLimits["train"], self.parameterValueTypes["train"])
    paramString, idStr = self._getParameterString(parameters)
    classifier.parameterIdStr = idStr
    classifier.model = self.connection.getRemotePath(outDir + "/model" + idStr, True)
    modelPath = self.connection.getRemotePath(outDir + "/model" + idStr, False)
    trainCommand = trainCommand.replace("%p", paramString).replace("%e", examples).replace("%m", modelPath).strip()
    self.connection.addCommand(trainCommand)
    # Classify with the trained model (optional)
    if classifyExamples != None:
        classifier.predictions = self.connection.getRemotePath(outDir + "/predictions" + idStr, True)
        predictionsPath = self.connection.getRemotePath(outDir + "/predictions" + idStr, False)
        classifyDir = ""
        if self.classifyDirSetting:
            classifyDir = os.path.normpath(self.connection.getSetting(self.classifyDirSetting)) + os.path.sep
        classifyCommand = self.classifyCommand.replace("%d", classifyDir).replace("%e", classifyExamples).replace("%m", modelPath).replace("%c", predictionsPath).strip()
        self.connection.addCommand(classifyCommand)
    # Run the process
    jobName = self.trainCommand.split()[0].replace("%d", "") + idStr
    logPath = outDir + "/" + jobName
    if dummy: # return a classifier that connects to an existing job
        self.connection.clearCommands()
        classifier._job = self.connection.getJob(jobDir=outDir, jobName=jobName)
    else: # submit the job
        classifier._job = self.connection.submit(jobDir=outDir, jobName=jobName, stdout=logPath+".stdout")
    if finishBeforeReturn:
        self.connection.waitForJob(classifier._job)
        self.getStatus()
    return classifier
def train(self, examples, outDir, parameters, classifyExamples=None, finishBeforeReturn=False, replaceRemoteExamples=True, dummy=False):
    outDir = os.path.abspath(outDir)
    examples = self.getExampleFile(examples, replaceRemote=replaceRemoteExamples, dummy=dummy)
    classifyExamples = self.getExampleFile(classifyExamples, replaceRemote=replaceRemoteExamples, dummy=dummy)
    parameters = Parameters.get(parameters, valueListKey="c")
    svmMulticlassDir = self.connection.getSetting("SVM_MULTICLASS_DIR")
    # Return a new classifier instance for following the training process and using the model
    classifier = copy.copy(self)
    classifier.setState("TRAIN")
    classifier.parameters = parameters
    # Train
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    trainCommand = svmMulticlassDir + "/svm_multiclass_learn "
    paramKeys = sorted(parameters.keys())
    idStr = ""
    for key in paramKeys:
        trainCommand += "-" + str(key) + " "
        idStr += "-" + str(key)
        if parameters[key] != None:
            trainCommand += str(parameters[key]) + " "
            idStr += "_" + str(parameters[key])
    classifier.parameterIdStr = idStr
    classifier.model = self.connection.getRemotePath(outDir + "/model" + idStr, True)
    modelPath = self.connection.getRemotePath(outDir + "/model" + idStr, False)
    trainCommand += examples + " " + modelPath
    self.connection.addCommand(trainCommand)
    # Classify with the trained model (optional)
    if classifyExamples != None:
        classifier.predictions = self.connection.getRemotePath(outDir + "/predictions" + idStr, True)
        predictionsPath = self.connection.getRemotePath(outDir + "/predictions" + idStr, False)
        classifyCommand = svmMulticlassDir + "/svm_multiclass_classify " + classifyExamples + " " + modelPath + " " + predictionsPath
        self.connection.addCommand(classifyCommand)
    # Run the process
    jobName = "svm_multiclass_learn" + idStr
    logPath = outDir + "/" + jobName
    if dummy: # return a classifier that connects to an existing job
        self.connection.clearCommands()
        classifier._job = self.connection.getJob(jobDir=outDir, jobName=jobName)
    else: # submit the job
        classifier._job = self.connection.submit(jobDir=outDir, jobName=jobName, stdout=logPath+".stdout")
    if finishBeforeReturn:
        self.connection.waitForJob(classifier._job)
    return classifier
def getBioNLPSharedTaskParams(self, parameters=None, model=None):
    if parameters == None:
        if model != None:
            model = self.openModel(model, "r")
            parameters = model.getStr("BioNLPSTParams", defaultIfNotExist=None)
        else:
            parameters = {}
    elif parameters == "skip" or "skip" in parameters:
        parameters = {"convert":False}
    return Parameters.get(parameters, {"convert":None, "evaluate":None, "scores":None, "a2Tag":None, "evalSubTasks":"123"})
def beginModel(self, step, model, trainExampleFiles, testExampleFile, importIdsFromModel=None):
    """
    Begin the training process leading to a new model.
    """
    if self.checkStep(step, False):
        if model != None:
            if self.state != None and step != None:
                print >> sys.stderr, self.__class__.__name__ + ":" + self.state + ":" + step
            # Create combined model
            model = self.openModel(model, "w")
            assert model.mode in ["a", "w"], (model.path, model.mode)
            # Information can be imported from an existing model. In this case, model is trained
            # with the parameter already defined in the import source. This is used when training
            # the combined model.
            if importIdsFromModel != None:
                model.importFrom(self.openModel(importIdsFromModel, "r"),
                                 [self.tag+"ids.classes", self.tag+"ids.features", "structure.txt"],
                                 [self.tag+"classifier-parameter", self.tag+"example-style", self.tag+"parse", self.tag+"task"])
                # Train the model with the parameters defined in the import source
                model.addStr(self.tag+"classifier-parameters-train", model.getStr(self.tag+"classifier-parameter"))
            if self.bioNLPSTParams != None and len(self.bioNLPSTParams) > 0:
                model.addStr("BioNLPSTParams", Parameters.toString(self.bioNLPSTParams))
            # Catenate example files
            if type(trainExampleFiles) in types.StringTypes:
                combinedTrainExamples = trainExampleFiles
            elif len(trainExampleFiles) == 1:
                combinedTrainExamples = trainExampleFiles[0]
            else:
                combinedTrainExamples = self.workDir + os.path.normpath(model.path)+"-"+self.tag+"combined-examples.gz"
                combinedTrainExamplesFile = gzip.open(combinedTrainExamples, 'wb')
                for trainExampleFile in trainExampleFiles:
                    print >> sys.stderr, "Catenating", trainExampleFile, "to", combinedTrainExamples
                    shutil.copyfileobj(gzip.open(trainExampleFile, 'rb'), combinedTrainExamplesFile)
                combinedTrainExamplesFile.close()
            # Upload training model
            # The parameter grid is stored in the model as "*classifier-parameters-train" so that endModel can
            # use it, and also as annotation for the trained model. The final selected parameter will
            # be stored as "*classifier-parameter"
            classifierWorkDir = self.workDir + os.path.normpath(model.path) + "-" + self.tag + "models"
            classifier = self.getClassifier(model.getStr(self.tag+"classifier-parameters-train"))(self.connection)
            classifier.optimize(combinedTrainExamples, classifierWorkDir, model.getStr(self.tag+"classifier-parameters-train"), testExampleFile, model.get(self.tag+"ids.classes"), step="SUBMIT", evaluator=self.evaluator)
            model.save()
def process(self, input, output, parameters=None, model=None, fromStep=None, toStep=None, omitSteps=None):
    self.initVariables(source=input, xml=input, outDir=os.path.dirname(output))
    if os.path.basename(output) != "":
        self.intermediateFileTag = os.path.basename(output)
    else:
        self.intermediateFileTag = ""
    self.enterState(self.STATE_TOOLCHAIN, [x[0] for x in self.steps], fromStep, toStep, omitSteps)
    parameters = self.getParameters(parameters, model, defaultValue=NOTHING)
    self.applyParameters(parameters)
    # Run the tools
    print >> sys.stderr, "Tool chain parameters:", Parameters.toString(parameters, skipKeysWithValues=[NOTHING], skipDefaults=self.getDefaultParameters())
    if os.path.exists(output) and not os.path.isdir(output):
        print >> sys.stderr, "Removing existing preprocessor output file", output
        os.remove(output)
    savedIntermediate = None # Output from a previous step if "fromStep" is used
    for step in self.steps:
        if self.checkStep(step[0]):
            if savedIntermediate != None: # A previous run of the program saved an intermediate file
                print >> sys.stderr, "Reading input from saved intermediate file", savedIntermediate
                self.xml = ETUtils.ETFromObj(savedIntermediate)
                savedIntermediate = None
            stepArgs = copy.copy(step[2]) # make a copy of the arguments to which i/o can be added
            stepArgs[step[4]["input"]] = self.xml # the input
            if self.getIntermediateFilePath(step) != None: # this step should save an intermediate file
                stepArgs[step[4]["output"]] = self.getIntermediateFilePath(step)
            print >> sys.stderr, "Running step", step[0], "with arguments", stepArgs
            step[1](**stepArgs) # call the tool
        elif self.getStepStatus(step[0]) == "BEFORE": # this step was run earlier
            savedIntermediate = self.getIntermediateFilePath(step)
    # End state and return
    xml = self.xml # state-specific member variable self.xml will be removed when exiting state
    self.exitState()
    if self.state == None: # if the whole toolchain has finished, return the final product
        if not os.path.isdir(output): # if output is a directory, it was given only for storing intermediate files ...
            ETUtils.write(xml, output) # ... otherwise, save the final output
        return xml
    else:
        return None
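# Illustration (standalone sketch): the loop above indexes each step tuple as step[0] (name),
# step[1] (the callable tool), step[2] (its fixed keyword arguments) and step[4] (a mapping that
# tells which keyword receives the input object and which the output path). The content of index 3
# is not visible in this function, so the toy tuple below fills it with None purely as a placeholder.
def _toyTool(source=None, target=None, flag=False):
    return (source, target, flag)

toyStep = ("TOY-STEP", _toyTool, {"flag": True}, None, {"input": "source", "output": "target"})
toyArgs = dict(toyStep[2])
toyArgs[toyStep[4]["input"]] = "<corpus xml>"
toyArgs[toyStep[4]["output"]] = "toy-output.xml"
assert toyStep[1](**toyArgs) == ("<corpus xml>", "toy-output.xml", True)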
def train(output, task=None, detector=None, inputFiles=None, models=None, parse=None,
          processUnmerging=None, processModifiers=None,
          bioNLPSTParams=None, preprocessorParams=None, exampleStyles=None,
          classifierParams=None, doFullGrid=False, deleteOutput=False, copyFrom=None,
          log="log.txt", step=None, omitSteps=None, debug=False, connection=None, subset=None,
          folds=None, corpusDir=None, corpusPreprocessing=None, evaluator=None):
    """
    Train a new model for event or relation detection.

    @param output: A directory where output files will appear.
    @param task: If defined, overridable default settings are used for many of the training parameters. Must be one of the supported TEES tasks.
    @param detector: a Detector object, or a string defining one to be imported
    @param inputFiles: A dictionary of file names, with keys "train", "devel", and "test"
    @param models: A dictionary of file names defining the place for the new models, with keys "devel" and "test"
    @param parse: The parse element name in the training interaction XML
    @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for task default.
    @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for task default.
    @param bioNLPSTParams: Parameters controlling BioNLP ST format output.
    @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying.
    @param exampleStyles: A parameter set for controlling example builders.
    @param classifierParams: A parameter set for controlling classifiers.
    @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search.
    @param deleteOutput: Remove an existing output directory
    @param copyFrom: Copy an existing output directory for use as a template
    @param log: An optional alternative name for the log file. None is for no logging.
    @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST"
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param connection: A parameter set defining a local or remote connection for training the classifier
    @param subset: A parameter set for making subsets of input files
    """
    # Insert default arguments where needed
    inputFiles = setDictDefaults(inputFiles, {"train":None, "devel":None, "test":None})
    models = setDictDefaults(models, {"devel":"model-devel", "test":"model-test"})
    exampleStyles = setDictDefaults(exampleStyles, {"examples":None, "trigger":None, "edge":None, "unmerging":None, "modifiers":None})
    classifierParams = setDictDefaults(classifierParams, {"examples":None, "trigger":None, "recall":None, "edge":None, "unmerging":None, "modifiers":None})
    subset = setDictDefaults(Parameters.get(subset), {"train":None, "devel":None, "test":None, "seed":0, "all":None})
    folds = setDictDefaults(folds, {"train":None, "devel":None, "test":None})
    processUnmerging = getDefinedBool(processUnmerging)
    processModifiers = getDefinedBool(processModifiers)
    # Initialize working directory
    workdir(output, deleteOutput, copyFrom, log)
    # Get task specific parameters
    useKerasDetector = False
    if detector != None and "keras" in detector.lower():
        print >> sys.stderr, "Using a Keras Detector"
        useKerasDetector = True
        if detector.lower() == "keras":
            detector = None
    detector, bioNLPSTParams, preprocessorParams, folds = getTaskSettings(task, detector, bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParams, folds, corpusDir=corpusDir, useKerasDetector=useKerasDetector)
    # Learn training settings from input files
    detector = learnSettings(inputFiles, detector, classifierParams, task, exampleStyles, useKerasDetector=useKerasDetector)
    # Get corpus subsets
    getFolds(inputFiles, folds)
    getSubsets(inputFiles, subset)
    if task != None:
        task = task.replace("-FULL", "")
        if "." in task:
            _, subTask = getSubTask(task)
            if subTask != 3:
                processModifiers = False
    # Preprocess the corpus if required
    if corpusPreprocessing != None:
        preprocessor = Preprocessor(steps=corpusPreprocessing)
        assert preprocessor.steps[0].name == "MERGE_SETS"
        assert preprocessor.steps[-1].name == "DIVIDE_SETS"
        preprocessedCorpusDir = os.path.join(output, "corpus")
        #outputFiles = {x:os.path.join(preprocessedCorpusDir, os.path.basename(inputFiles[x])) for x in inputFiles}
        preprocessor.process(inputFiles, os.path.join(preprocessedCorpusDir, task))
        #inputFiles = outputFiles
        for setName in inputFiles.keys():
            if inputFiles[setName] != None:
                inputFiles[setName] = os.path.join(preprocessedCorpusDir, task + "-" + setName + ".xml")
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"])
    # Initialize the detector
    detector, detectorName = getDetector(detector, evaluator=evaluator)
    evaluator, evaluatorName = importClass(evaluator, "evaluator")
    detector = detector() # initialize object
    if evaluator != None:
        print >> sys.stderr, "Using evaluator", evaluator.__name__
        detector.evaluator = evaluator
    detector.debug = debug
    detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams)
    #detector.useBioNLPSTFormat = useBioNLPSTFormat # classify-output and grid evaluation in ST-format
    #detector.stWriteScores = True # write confidence scores into additional st-format files
    connection = getConnection(connection)
    detector.setConnection(connection)
    connection.debug = debug
    if deleteOutput:
        connection.clearWorkDir()

    # Train
    if selector.check("TRAIN"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------------ Train Detector ------------------"
        print >> sys.stderr, "----------------------------------------------------"
        if not isinstance(detector, EventDetector):
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["examples"], classifierParams["examples"], parse, None, task,
                           fromStep=detectorSteps["TRAIN"], workDir="training", testData=inputFiles["test"])
        else:
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["trigger"], exampleStyles["edge"], exampleStyles["unmerging"], exampleStyles["modifiers"],
                           classifierParams["trigger"], classifierParams["edge"], classifierParams["unmerging"], classifierParams["modifiers"],
                           classifierParams["recall"], processUnmerging, processModifiers,
                           doFullGrid, task, parse, None,
                           fromStep=detectorSteps["TRAIN"], workDir="training", testData=inputFiles["test"])
        # Save the detector type
        for model in [models["devel"], models["test"]]:
            if model != None and os.path.exists(model):
                model = Model(model, "a")
                model.addStr("detector", detectorName)
                if evaluatorName != None:
                    model.addStr("detector", evaluatorName)
                if preprocessorParams != None:
                    preprocessor = Preprocessor()
                    model.addStr("preprocessorParams", Parameters.toString(preprocessor.getParameters(preprocessorParams)))
                model.save()
                model.close()
    if selector.check("DEVEL"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Check devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        detector.classify(inputFiles["devel"], models["devel"], "classification-devel/devel",
                          goldData=inputFiles["devel"], fromStep=detectorSteps["DEVEL"], workDir="classification-devel")
    if selector.check("EMPTY"):
        # By passing an emptied devel set through the prediction system, we can check that we get the same predictions
        # as in the DEVEL step, ensuring the model does not use leaked information.
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Empty devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        removalScope = "non-given"
        if "names" in str(exampleStyles["examples"]) or "names" in str(exampleStyles["trigger"]):
            removalScope = "all"
        elif "Edge" in detector.__class__.__name__:
            removalScope = "interactions"
        detector.classify(getEmptyCorpus(inputFiles["devel"], scope=removalScope), models["devel"], "classification-empty/devel-empty",
                          fromStep=detectorSteps["EMPTY"], workDir="classification-empty")
        print >> sys.stderr, "*** Evaluate empty devel classification ***"
        if os.path.exists("classification-empty/devel-empty-pred.xml.gz"):
            EvaluateInteractionXML.run(detector.evaluator, "classification-empty/devel-empty-pred.xml.gz", inputFiles["devel"], parse)
        else:
            print >> sys.stderr, "No output file for evaluation"
    if selector.check("TEST"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------- Test set classification --------------"
        print >> sys.stderr, "----------------------------------------------------"
        if inputFiles["test"] == None or not os.path.exists(inputFiles["test"]):
            print >> sys.stderr, "Skipping, test file", inputFiles["test"], "does not exist"
        else:
            #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
            detector.classify(inputFiles["test"], models["test"] if models["test"] != None else models["devel"], "classification-test/test",
                              fromStep=detectorSteps["TEST"], workDir="classification-test")
            if detector.bioNLPSTParams["convert"]:
                extension = ".zip" if (detector.bioNLPSTParams["convert"] == "zip") else ".tar.gz"
                Utils.STFormat.Compare.compare("classification-test/test-events" + extension, "classification-devel/devel-events" + extension, "a2")
    # Stop logging
    if log != None:
        Stream.closeLog(log)
def classifyToXML(self, data, model, exampleFileName=None, tag="", classifierModel=None, goldData=None, parse=None, recallAdjust=None, compressExamples=True, exampleStyle=None, useExistingExamples=False):
    model = self.openModel(model, "r")
    if parse == None:
        parse = self.getStr(self.tag+"parse", model)
    if useExistingExamples:
        assert exampleFileName != None
        assert os.path.exists(exampleFileName)
    if exampleFileName == None:
        exampleFileName = tag+self.tag+"examples"
        if compressExamples:
            exampleFileName += ".gz"
    if not useExistingExamples:
        self.buildExamples(model, [data], [exampleFileName], [goldData], parse=parse, exampleStyle=exampleStyle)
    if classifierModel == None:
        classifierModel = model.get(self.tag+"classifier-model", defaultIfNotExist=None)
    #else:
    #    assert os.path.exists(classifierModel), classifierModel
    classifier = self.getClassifier(model.getStr(self.tag+"classifier-parameter", defaultIfNotExist=None))()
    classifier.classify(exampleFileName, tag+self.tag+"classifications", classifierModel, finishBeforeReturn=True)
    threshold = model.getStr(self.tag+"threshold", defaultIfNotExist=None, asType=float)
    predictions = ExampleUtils.loadPredictions(tag+self.tag+"classifications", recallAdjust, threshold=threshold)
    evaluator = self.evaluator.evaluate(exampleFileName, predictions, model.get(self.tag+"ids.classes"))
    #outputFileName = tag+"-"+self.tag+"pred.xml.gz"
    #exampleStyle = self.exampleBuilder.getParameters(model.getStr(self.tag+"example-style"))
    if exampleStyle == None:
        exampleStyle = Parameters.get(model.getStr(self.tag+"example-style")) # no checking, but these should already have passed the ExampleBuilder
    self.structureAnalyzer.load(model)
    return self.exampleWriter.write(exampleFileName, predictions, data, tag+self.tag+"pred.xml.gz",
                                    model.get(self.tag+"ids.classes"), parse, exampleStyle=exampleStyle,
                                    structureAnalyzer=self.structureAnalyzer)
    #if evaluator.getData().getTP() + evaluator.getData().getFP() > 0:
    #    return self.exampleWriter.write(exampleFileName, predictions, data, outputFileName, model.get(self.tag+"ids.classes"), parse)
    #else:
    #    # TODO: e.g. interactions must be removed if task does unmerging
    #    print >> sys.stderr, "No positive", self.tag + "predictions, XML file", outputFileName, "unchanged from input"
    #    if type(data) in types.StringTypes: # assume its a file
    #        shutil.copy(data, outputFileName)
    #    else: # assume its an elementtree
    #        ETUtils.write(data, outputFileName)
    #    #print >> sys.stderr, "No positive predictions, XML file", tag+self.tag+"pred.xml", "not written"
    #    return data #None
optparser.add_option("-o", "--output", default=None, dest="output", help="Output directory or file") optparser.add_option("-r", "--remote", default=None, dest="remote", help="Remote connection") #optparser.add_option("-c", "--classifier", default="SVMMultiClassClassifier", dest="classifier", help="Classifier Class") optparser.add_option("-p", "--parameters", default=None, dest="parameters", help="Parameters for the classifier") #optparser.add_option("-d", "--ids", default=None, dest="ids", help="") #optparser.add_option("--filterIds", default=None, dest="filterIds", help="") optparser.add_option("--install", default=None, dest="install", help="Install directory (or DEFAULT)") optparser.add_option("--installFromSource", default=False, action="store_true", dest="installFromSource", help="") (options, args) = optparser.parse_args() assert options.action in ["TRAIN", "CLASSIFY", "OPTIMIZE"] classifier = ScikitClassifier(Connection.getConnection(options.remote)) if options.action == "TRAIN": import time trained = classifier.train(options.examples, options.output, options.parameters, options.classifyExamples) status = trained.getStatus() while status not in ["FINISHED", "FAILED"]: print >> sys.stderr, "Training classifier, status =", status time.sleep(10) status = trained.getStatus() print >> sys.stderr, "Training finished, status =", status if trained.getStatus() == "FINISHED": trained.downloadPredictions() trained.downloadModel() elif options.action == "CLASSIFY": classified = classifier.classify(options.examples, options.output, options.model, True) if classified.getStatus() == "FINISHED": classified.downloadPredictions() else: # OPTIMIZE options.parameters = Parameters.get(options.parameters) optimized = classifier.optimize(options.examples, options.output, options.parameters, options.classifyExamples, options.classIds, step=options.optimizeStep)
def doGrid(self): print >> sys.stderr, "--------- Booster parameter search ---------" # Build trigger examples self.triggerDetector.buildExamples( self.model, [self.optData], [self.workDir + "grid-trigger-examples.gz"]) if self.fullGrid: # Parameters to optimize ALL_PARAMS = { "trigger": [ int(i) for i in Parameters.get(self.triggerClassifierParameters, valueListKey="c")["c"] ], "booster": [float(i) for i in self.recallAdjustParameters.split(",")], "edge": [ int(i) for i in Parameters.get(self.edgeClassifierParameters, valueListKey="c")["c"] ] } else: ALL_PARAMS = { "trigger": Parameters.get(self.model.getStr(self.triggerDetector.tag + "classifier-parameter"), valueListKey="c")["c"], "booster": [float(i) for i in self.recallAdjustParameters.split(",")], "edge": Parameters.get(self.model.getStr(self.edgeDetector.tag + "classifier-parameter"), valueListKey="c")["c"] } paramCombinations = Parameters.getCombinations( ALL_PARAMS, ["trigger", "booster", "edge"]) prevParams = None EDGE_MODEL_STEM = os.path.join( self.edgeDetector.workDir, os.path.normpath(self.model.path) + "-edge-models/model-c_") TRIGGER_MODEL_STEM = os.path.join( self.triggerDetector.workDir, os.path.normpath(self.model.path) + "-trigger-models/model-c_") bestResults = None for i in range(len(paramCombinations)): params = paramCombinations[i] print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" print >> sys.stderr, "Processing params", str(i + 1) + "/" + str( len(paramCombinations)), params print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" # Triggers and Boost if prevParams == None or prevParams["trigger"] != params[ "trigger"] or prevParams["booster"] != params["booster"]: print >> sys.stderr, "Classifying trigger examples for parameters", "trigger:" + str( params["trigger"]), "booster:" + str(params["booster"]) xml = self.triggerDetector.classifyToXML( self.optData, self.model, self.workDir + "grid-trigger-examples.gz", self.workDir + "grid-", classifierModel=TRIGGER_MODEL_STEM + str(params["trigger"]), recallAdjust=params["booster"]) prevParams = params # Build edge examples self.edgeDetector.buildExamples( self.model, [xml], [self.workDir + "grid-edge-examples.gz"], [self.optData]) # Classify with pre-defined model edgeClassifierModel = EDGE_MODEL_STEM + str(params["edge"]) xml = self.edgeDetector.classifyToXML( xml, self.model, self.workDir + "grid-edge-examples.gz", self.workDir + "grid-", classifierModel=edgeClassifierModel) bestResults = self.evaluateGrid(xml, params, bestResults) print >> sys.stderr, "Booster search complete" print >> sys.stderr, "Tested", len(paramCombinations), "combinations" print >> sys.stderr, "Best parameters:", bestResults[0] print >> sys.stderr, "Best result:", bestResults[2] # f-score # Save grid model self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.model) self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.combinedModel, False) if self.fullGrid: # define best models self.triggerDetector.addClassifierModel( self.model, TRIGGER_MODEL_STEM + str(bestResults[0]["trigger"]), bestResults[0]["trigger"]) self.edgeDetector.addClassifierModel( self.model, EDGE_MODEL_STEM + str(bestResults[0]["edge"]), bestResults[0]["edge"]) # Remove work files for stepTag in [ self.workDir + "grid-trigger", self.workDir + "grid-edge", self.workDir + "grid-unmerging" ]: for fileStem in [ "-classifications", "-classifications.log", "examples.gz", "pred.xml.gz" ]: if os.path.exists(stepTag + fileStem): 
os.remove(stepTag + fileStem)
def optimize(self, examples, outDir, parameters, classifyExamples, classIds, step="BOTH", evaluator=None, determineThreshold=False, timeout=None, downloadAllModels=False): assert step in ["BOTH", "SUBMIT", "RESULTS"], step outDir = os.path.abspath(outDir) # Initialize training (or reconnect to existing jobs) combinations = Parameters.getCombinations( Parameters.get(parameters, valueListKey="c") ) #Core.OptimizeParameters.getParameterCombinations(parameters) trained = [] for combination in combinations: trained.append( self.train(examples, outDir, combination, classifyExamples, replaceRemoteExamples=(len(trained) == 0), dummy=(step == "RESULTS"))) if step == "SUBMIT": # Return already classifier = copy.copy(self) classifier.setState("OPTIMIZE") return classifier # Wait for the training to finish finalJobStatus = self.connection.waitForJobs( [x.getJob() for x in trained]) # Evaluate the results print >> sys.stderr, "Evaluating results" #Stream.setIndent(" ") bestResult = None if evaluator == None: evaluator = self.defaultEvaluator for i in range(len(combinations)): id = trained[i].parameterIdStr #Stream.setIndent(" ") # Get predictions predictions = None if trained[i].getStatus() == "FINISHED": predictions = trained[i].downloadPredictions() else: print >> sys.stderr, "No results for combination" + id continue if downloadAllModels: trained[i].downloadModel() # Compare to other results print >> sys.stderr, "*** Evaluating results for combination" + id + " ***" threshold = None if determineThreshold: print >> sys.stderr, "Thresholding, original micro =", evaluation = evaluator.evaluate( classifyExamples, predictions, classIds, os.path.join(outDir, "evaluation-before-threshold" + id + ".csv"), verbose=False) print >> sys.stderr, evaluation.microF.toStringConcise() threshold, bestF = evaluator.threshold(classifyExamples, predictions) print >> sys.stderr, "threshold =", threshold, "at binary fscore", str( bestF)[0:6] evaluation = evaluator.evaluate( classifyExamples, ExampleUtils.loadPredictions(predictions, threshold=threshold), classIds, os.path.join(outDir, "evaluation" + id + ".csv")) if bestResult == None or evaluation.compare( bestResult[0] ) > 0: #: averageResult.fScore > bestResult[1].fScore: bestResult = [ evaluation, trained[i], combinations[i], threshold ] if not self.connection.isLocal(): os.remove(predictions) # remove predictions to save space #Stream.setIndent() if bestResult == None: raise Exception("No results for any parameter combination") print >> sys.stderr, "*** Evaluation complete", finalJobStatus, "***" print >> sys.stderr, "Selected parameters", bestResult[2] classifier = copy.copy(bestResult[1]) classifier.threshold = bestResult[3] classifier.downloadModel() return classifier
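# optimize() expands a parameter string into one training job per parameter combination.
# The standalone sketch below (stdlib only, illustrative name) mirrors roughly what the
# calls to Parameters.get(..., valueListKey="c") and Parameters.getCombinations produce;
# it is not the Parameters module itself.
import itertools

def expandParameterGrid(paramString):
    # "c=10,100:e=0.1" -> {"c": ["10", "100"], "e": ["0.1"]} -> one dict per combination
    valueLists = {}
    for pair in paramString.split(":"):
        if "=" in pair:
            key, values = pair.split("=", 1)
            valueLists[key] = values.split(",")
        else:
            valueLists[pair] = [True]  # bare flags are treated as booleans
    keys = sorted(valueLists.keys())
    return [dict(zip(keys, combo))
            for combo in itertools.product(*[valueLists[k] for k in keys])]

# Example: expandParameterGrid("c=10,100,1000") -> [{'c': '10'}, {'c': '100'}, {'c': '1000'}]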
def parameters(): """This function deals with parameters passed to the script""" # Defines globals to be used above global mode, effectObject, target_packet_type, save_active, NFQUEUE_Active # Defaults mode = print_packet target_packet_type = 'ALL' save_active = False # Setup NFQUEUE_Active = True # Arguments parser = argparse.ArgumentParser( prog="Packet.py", formatter_class=argparse.RawDescriptionHelpFormatter, allow_abbrev=False) parser.add_argument_group('Arguments', description=Parameter.Usage()) # Mode parameters effect = parser.add_mutually_exclusive_group(required=True, ) effect.add_argument('--print', Parameter.cmd_print, action='store_true', dest="output", help=argparse.SUPPRESS) effect.add_argument('--ignore', '-i', action='store_true', dest='ignore', help=argparse.SUPPRESS) effect.add_argument('--latency', Parameter.cmd_latency, action='store', help=argparse.SUPPRESS, type=int) effect.add_argument('--packet-loss', Parameter.cmd_packetloss, action='store', help=argparse.SUPPRESS, type=int) effect.add_argument('--surge', Parameter.cmd_throttle, action='store', help=argparse.SUPPRESS, type=int) effect.add_argument('--display-bandwidth', Parameter.cmd_bandwidth, action='store_true', help=argparse.SUPPRESS) effect.add_argument('--rate-limit', Parameter.cmd_ratelimit, action='store', dest='rate_limit', help=argparse.SUPPRESS, type=int) # Extra parameters parser.add_argument('--target-packet', Parameter.cmd_target_packet, action='store', dest='target', help=argparse.SUPPRESS) parser.add_argument('--save', Parameter.cmd_save, nargs=1, dest='save', help=argparse.SUPPRESS) args = parser.parse_args() # Modes if args.output: effectObject = Print.Print() mode = print_packet elif args.ignore: mode = ignore_packet elif args.latency: effectObject = Latency.Latency(latency_value=args.latency) mode = packet_latency elif args.packet_loss: effectObject = PacketLoss.PacketLoss(percentage=args.packet_loss) mode = packet_loss elif args.surge: effectObject = Surge.Surge(period=args.surge) effectObject.start_purge_monitor() mode = surge elif args.display_bandwidth: effectObject = DisplayBandwidth.DisplayBandwidth() mode = track_bandwidth elif args.rate_limit: # Sets the bandwidth object with the specified bandwidth limit effectObject = LimitBandwidth.LimitBandwidth(bandwidth=args.rate_limit) mode = limit_bandwidth if args.save: print('[!] File saving on - Files will be saved under: \'{}.pcap\''. format(args.save[0])) save_active = True setup_packet_save(args.save[0]) if args.target: target_packet_type = args.target # When all parameters are handled if NFQUEUE_Active: run_packet_manipulation()
def learnSettings(inputFiles, detector, classifierParameters, task, exampleStyles, useKerasDetector=False): if detector == None: print >> sys.stderr, "*** Analyzing input files to determine training settings ***" structureAnalyzer = StructureAnalyzer() if not os.path.exists("training/structure.txt"): datasets = sorted(filter(None, [inputFiles["train"], inputFiles["devel"]])) print >> sys.stderr, "input files:", datasets structureAnalyzer.analyze(datasets) print >> sys.stderr, structureAnalyzer.toString() structureAnalyzer.save(None, "training/structure.txt") else: print >> sys.stderr, "Using existing analysis from training/structure.txt" structureAnalyzer.load(None, "training/structure.txt") # Choose detector if detector == None: if "ENTITY" in structureAnalyzer.targets and "INTERACTION" in structureAnalyzer.targets: detector = "Detectors.EventDetector" elif "ENTITY" in structureAnalyzer.targets: detector = "Detectors.EntityDetector" elif "INTERACTION" in structureAnalyzer.targets: detector = "Detectors.EdgeDetector" else: assert False, structureAnalyzer.targets if useKerasDetector and not "Keras" in detector: detector = detector.replace("Detectors.", "Detectors.Keras") print >> sys.stderr, "Using detector '" + str(detector) + "'" # Set default parameters cp = classifierParameters if detector == "Detectors.EventDetector": # Add common classifier parameters if cp["examples"] != None: cp["unmerging"] = Parameters.cat(cp["examples"], cp["unmerging"]) cp["modifiers"] = Parameters.cat(cp["examples"], cp["modifiers"]) cp["edge"] = Parameters.cat(cp["examples"], cp["edge"]) cp["trigger"] = Parameters.cat(cp["examples"], cp["trigger"]) cp["unmerging"] = Parameters.cat("c=1,10,100,500,1000,1500,2500,5000,10000,20000,50000,80000,100000", cp["unmerging"], "Classifier parameters for unmerging") cp["modifiers"] = Parameters.cat("c=5000,10000,20000,50000,100000", cp["modifiers"], "Classifier parameters for modifiers") cp["edge"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", cp["edge"], "Classifier parameters for edges") cp["trigger"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000", cp["trigger"], "Classifier parameters for triggers") cp["recall"] = Parameters.cat("0.5,0.6,0.65,0.7,0.85,1.0,1.1,1.2", cp["recall"], "Recall adjustment parameters") elif detector == "Detectors.EntityDetector": cp["examples"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000", cp["examples"], "Classifier parameters for entities") elif detector == "Detectors.EdgeDetector": cp["examples"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", cp["examples"], "Classifier parameters for edges") elif detector == "Detectors.UnmergingDetector": cp["examples"] = Parameters.cat("c=1,10,100,500,1000,1500,2500,5000,10000,20000,50000,80000,100000", cp["examples"], "Classifier parameters for unmerging") ####################################################################### # Keras example styles ####################################################################### if useKerasDetector: task, subTask = getSubTask(task) msg = "Keras example style" #overrideStyles = {x:(Parameters.get(exampleStyles[x]) if (exampleStyles[x] != None and "override" in exampleStyles[x]) else {"override":True}) for x in exampleStyles} overrideStyles = {"all":{}} for key in exampleStyles: 
overrideStyles[key] = {} params = Parameters.get(exampleStyles[key]) if "override" in params: exampleStyles[key] = None overrideStyles[key] = params overrideStyles[key].pop("override") elif "override_all" in params: exampleStyles[key] = None overrideStyles["all"] = params overrideStyles["all"].pop("override_all") #exampleStyles[key] = exampleStyles[key] if (exampleStyles[key] != None and not "override" in exampleStyles[key]) else None print >> sys.stderr, "Override styles:", overrideStyles if "EventDetector" in detector: if task == "EPI11": exampleStyles["trigger"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20:epi_merge_negated", exampleStyles["trigger"]) else: exampleStyles["trigger"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20", exampleStyles["trigger"]) if task in ["GE09", "GE11", "GE13"] and subTask == 1: exampleStyles["edge"] = Parameters.cat("keras:genia_task1:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["edge"]) else: exampleStyles["edge"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["edge"]) exampleStyles["unmerging"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["unmerging"]) exampleStyles["modifiers"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:el=41:mods=20", exampleStyles["modifiers"]) elif "EntityDetector" in detector: if task == "DDI13T91": exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20:names:build_for_nameless", exampleStyles["examples"]) else: exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20", exampleStyles["examples"]) elif "EdgeDetector" in detector: if "DDI" in task: exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=0:do=0.2:dense=800:ol=50:mods=20", exampleStyles["examples"]) elif task == "CP17": exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=0:do=0.2:ol=50:skip_labels=CPR\:0,CPR\:1,CPR\:2,CPR\:7,CPR\:8,CPR\:10:mods=20", exampleStyles["examples"]) else: exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["examples"]) print >> sys.stderr, "Keras initial example styles:", exampleStyles for key in exampleStyles: if exampleStyles[key] != None: exampleStyles[key] = Parameters.get(exampleStyles[key]) exampleStyles[key].update(overrideStyles[key]) exampleStyles[key].update(overrideStyles["all"]) exampleStyles[key] = Parameters.toString(exampleStyles[key]) print >> sys.stderr, "Keras final example style for " + key + ": ", exampleStyles[key] return detector
def addClassifierModel(self, model, classifierModelPath, classifierParameters): classifierModel = model.get(self.tag+"classifier-model", True) shutil.copy2(classifierModelPath, classifierModel) model.addStr(self.tag+"classifier-parameter", Parameters.toString(Parameters.get(classifierParameters))) return classifierModel
def getTaskSettings(task, detector, bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParameters): if task != None: print >> sys.stderr, "*** Defining training settings for task", task, "***" fullTaskId = task subTask = 2 if "." in task: task, subTask = task.split(".") subTask = int(subTask) dataPath = Settings.CORPUS_DIR for dataset in ["devel", "train", "test"]: if inputFiles[dataset] == None and inputFiles[dataset] != "None": inputFiles[dataset] = os.path.join( dataPath, task.replace("-FULL", "") + "-" + dataset + ".xml") if task == "ID11" and dataset == "train": inputFiles[dataset] = Catenate.catenate( [ os.path.join(dataPath, "ID11-train.xml"), os.path.join(dataPath, "GE11-devel.xml"), os.path.join(dataPath, "GE11-train.xml") ], "training/ID11-train-and-GE11-devel-and-train.xml.gz", fast=True) if inputFiles[dataset] == "None": inputFiles[dataset] = None if inputFiles[dataset] != None and not os.path.exists( inputFiles[dataset]): inputFiles[dataset] = None print >> sys.stderr, "Input file", inputFiles[ dataset], "for set '" + dataset + "' does not exist, skipping." assert inputFiles["train"] != None # at least training set must exist # Example generation parameters if task == "CO11": detector = "Detectors.CODetector" elif task in ["BI11-FULL", "DDI11-FULL"]: detector = "Detectors.EventDetector" # BioNLP Shared Task and preprocessing parameters if task == "BI11-FULL": bioNLPSTParams = Parameters.cat( bioNLPSTParams, "convert:scores", "BioNLP Shared Task / " + fullTaskId, ["default"] ) # the shared task evaluator is not designed for predicted entities elif task == "REL11": bioNLPSTParams = Parameters.cat( bioNLPSTParams, "convert:evaluate:scores:a2Tag=rel", "BioNLP Shared Task / " + fullTaskId, ["default"]) elif task not in ["DDI11", "DDI11-FULL", "DDI13"]: bioNLPSTParams = Parameters.cat( bioNLPSTParams, "convert:evaluate:scores", "BioNLP Shared Task / " + fullTaskId, ["default"]) # Preprocessing parameters if task in ["BI11", "BI11-FULL", "BB11", "DDI11", "DDI11-FULL"]: Parameters.cat("intermediateFiles:omitSteps=NER,DIVIDE-SETS", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"]) else: # parse only sentences where BANNER found an entity Parameters.cat( "intermediateFiles:omitSteps=DIVIDE-SETS:PARSE.requireEntities", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"]) # Example style parameters for single-stage tasks if task == "REN11": exampleStyles["examples"] = Parameters.cat( "undirected:bacteria_renaming:maskTypeAsProtein=Gene", exampleStyles["examples"], "Single-stage example style / " + fullTaskId) elif task == "DDI11": exampleStyles["examples"] = Parameters.cat( "drugbank_features:ddi_mtmx:filter_shortest_path=conj_and", exampleStyles["examples"], "Single-stage example style / " + fullTaskId) elif task == "DDI13": exampleStyles["examples"] = Parameters.cat( "keep_neg:drugbank_features:filter_shortest_path=conj_and", exampleStyles["examples"], "Single-stage example style / " + fullTaskId) elif task == "BI11": exampleStyles["edge"] = Parameters.cat( "bi_features", exampleStyles["edge"], "Edge example style / " + fullTaskId) # Edge style if task in ["GE09", "GE11", "GE13"] and subTask == 1: exampleStyles["edge"] = Parameters.cat( "genia_features:genia_task1", exampleStyles["edge"]) elif task in ["GE09", "GE11", "GE13"]: exampleStyles["edge"] = Parameters.cat("genia_features", exampleStyles["edge"]) elif task == "REL11": exampleStyles["edge"] = Parameters.cat( "rel_features", exampleStyles["edge"], "Edge example style / " + 
fullTaskId) elif task == "DDI11-FULL": exampleStyles["edge"] = Parameters.cat( "drugbank_features:filter_shortest_path=conj_and", exampleStyles["edge"], "Edge example style / " + fullTaskId) elif task == "CO11": exampleStyles["edge"] = Parameters.cat( "co_features", exampleStyles["edge"], "Edge example style / " + fullTaskId) elif task == "BI11-FULL": exampleStyles["edge"] = Parameters.cat( "bi_features", exampleStyles["edge"], "Edge example style / " + fullTaskId) # Trigger style if task in ["GE09", "GE11", "GE13"] and subTask == 1: exampleStyles["trigger"] = Parameters.cat( "genia_task1", exampleStyles["trigger"], "Trigger example style / " + fullTaskId) elif task in ["EPI11", "PC13"]: exampleStyles["trigger"] = Parameters.cat( "epi_merge_negated", exampleStyles["trigger"], "Trigger example style / " + fullTaskId) elif task == "BB11": # "bb_features:build_for_nameless:wordnet" exampleStyles["trigger"] = Parameters.cat( "bb_features:build_for_nameless", exampleStyles["trigger"], "Trigger example style / " + fullTaskId) elif task == "BB13T3": # "bb_features:build_for_nameless:wordnet" exampleStyles["trigger"] = Parameters.cat( "bb_features:build_for_nameless", exampleStyles["trigger"], "Trigger example style / " + fullTaskId) elif task == "REL11": exampleStyles["trigger"] = Parameters.cat( "rel_features", exampleStyles["trigger"], "Trigger example style / " + fullTaskId) elif task in ["BI11-FULL", "DDI11-FULL"]: exampleStyles["trigger"] = "build_for_nameless:names" # Classifier parameters if task == "DDI11": classifierParameters["examples"] = Parameters.cat( "c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold", classifierParameters["examples"], "Classifier parameters for single-stage examples" + fullTaskId) #elif task == "DDI13": # classifierParameters["examples"] = Parameters.cat("c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold", classifierParameters["examples"], "Classifier parameters for single-stage examples" + fullTaskId) elif task == "CO11": classifierParameters["edge"] = Parameters.cat( "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", classifierParameters["edge"], "Classifier parameters for edges / " + fullTaskId) classifierParameters["trigger"] = Parameters.cat( "c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000", classifierParameters["trigger"], "Classifier parameters for triggers / " + fullTaskId) classifierParameters["recall"] = Parameters.cat( "0.8,0.9,0.95,1.0", classifierParameters["recall"], "Recall adjust / " + fullTaskId) return detector, bioNLPSTParams, preprocessorParams
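# getTaskSettings starts by splitting an optional ".N" subtask suffix off the task
# identifier (e.g. "GE11.1"). A standalone sketch of that parsing, with an illustrative
# function name:
def splitTaskId(fullTaskId):
    task, subTask = fullTaskId, 2  # subtask 2 is the default when none is given
    if "." in fullTaskId:
        task, subTask = fullTaskId.split(".")
        subTask = int(subTask)
    return task, subTask

# Example: splitTaskId("GE11.1") -> ("GE11", 1); splitTaskId("DDI13T91") -> ("DDI13T91", 2)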
def getTaskSettings(task, detector, bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParameters, folds, corpusDir=None): if task != None: print >> sys.stderr, "*** Defining training settings for task", task, "***" fullTaskId = task subTask = 2 if "." in task: task, subTask = task.split(".") subTask = int(subTask) if corpusDir == None: corpusDir = Settings.CORPUS_DIR for dataset in ["devel", "train", "test"]: if inputFiles[dataset] == None and inputFiles[dataset] != "None": if task.startswith("DDI13"): if dataset in ["devel", "train"]: inputFiles[dataset] = os.path.join( corpusDir, "DDI13-train.xml") elif dataset == "test": if task.endswith("T91"): inputFiles[dataset] = os.path.join( corpusDir, "DDI13-test-task9.1.xml") elif task.endswith("T92") or task.endswith("FULL"): inputFiles[dataset] = os.path.join( corpusDir, "DDI13-test-task9.2.xml") elif task == "ID11" and dataset == "train": inputFiles[dataset] = Catenate.catenate( [ os.path.join(corpusDir, "ID11-train.xml"), os.path.join(corpusDir, "GE11-devel.xml"), os.path.join(corpusDir, "GE11-train.xml") ], "training/ID11-train-and-GE11-devel-and-train.xml.gz", fast=True) else: inputFiles[dataset] = os.path.join( corpusDir, task.replace("-FULL", "") + "-" + dataset + ".xml") if inputFiles[dataset] == "None": inputFiles[dataset] = None if inputFiles[dataset] != None and not os.path.exists( inputFiles[dataset]): fullPath = os.path.join(Settings.CORPUS_DIR, inputFiles[dataset]) if os.path.exists(fullPath): inputFiles[dataset] = fullPath else: inputFiles[dataset] = None print >> sys.stderr, "Input file", inputFiles[ dataset], "for set '" + dataset + "' does not exist, skipping." assert inputFiles["train"] != None # at least training set must exist # Example generation parameters if task == "CO11": detector = "Detectors.CODetector" elif task in [ "BI11-FULL", "DDI11-FULL", "DDI13-FULL", "BB_EVENT_16-FULL" ]: detector = "Detectors.EventDetector" elif task.startswith("DDI13"): if task.endswith("T91"): detector = "Detectors.EntityDetector" elif task.endswith("T92"): detector = "Detectors.EdgeDetector" ####################################################################### # BioNLP Shared Task and preprocessing parameters ####################################################################### if task == "BI11-FULL": bioNLPSTParams = Parameters.cat( bioNLPSTParams, "convert:scores", "BioNLP Shared Task / " + fullTaskId, ["default"] ) # the shared task evaluator is not designed for predicted entities elif task == "REL11": bioNLPSTParams = Parameters.cat( bioNLPSTParams, "convert:evaluate:scores:a2Tag=rel", "BioNLP Shared Task / " + fullTaskId, ["default"]) elif task in ("BB_EVENT_16", "BB_EVENT_16-FULL", "BB_EVENT_NER_16", "SDB16"): bioNLPSTParams = Parameters.cat( bioNLPSTParams, "convert=zip", "BioNLP Shared Task / " + fullTaskId, ["default"]) elif task not in [ "DDI11", "DDI11-FULL", "DDI13T91", "DDI13T92", "DDI13-FULL" ]: bioNLPSTParams = Parameters.cat( bioNLPSTParams, "convert:evaluate:scores", "BioNLP Shared Task / " + fullTaskId, ["default"]) ####################################################################### # Preprocessing parameters ####################################################################### if task in [ "BI11", "BI11-FULL", "BB11", "DDI11", "DDI11-FULL", "DDI13T91", "DDI13T92", "DDI13-FULL" ]: Parameters.cat("intermediateFiles:omitSteps=NER,DIVIDE-SETS", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"]) else: # parse only sentences where BANNER found an entity Parameters.cat( 
"intermediateFiles:omitSteps=DIVIDE-SETS:PARSE.requireEntities", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"]) ####################################################################### # Example style parameters ####################################################################### # Example style parameters for single-stage tasks ##################### msg = "Single-stage example style / " + fullTaskId if task == "REN11": exampleStyles["examples"] = Parameters.cat( "undirected:bacteria_renaming:maskTypeAsProtein=Gene", exampleStyles["examples"], msg) elif task == "DDI11": exampleStyles["examples"] = Parameters.cat( "drugbank_features:ddi_mtmx:filter_shortest_path=conj_and", exampleStyles["examples"], msg) elif task.startswith("DDI13"): if task.endswith("T91"): exampleStyles["examples"] = Parameters.cat( "names:build_for_nameless:ddi13_features:drugbank_features", exampleStyles["examples"], msg) elif task.endswith("T92"): exampleStyles["examples"] = Parameters.cat( "keep_neg:drugbank_features:filter_shortest_path=conj_and", exampleStyles["examples"], msg) elif task == "BI11": exampleStyles["examples"] = Parameters.cat( "bi_features", exampleStyles["examples"], msg) elif task == "BB_EVENT_16": exampleStyles["examples"] = Parameters.cat( "keep_neg", exampleStyles["examples"], msg ) #exampleStyles["examples"] = Parameters.cat("linear_features:keep_neg", exampleStyles["examples"], msg) elif task == "SDB16": exampleStyles["examples"] = Parameters.cat( "sdb_merge:sdb_features", exampleStyles["examples"], msg) # Edge style ########################################################## msg = "Edge example style / " + fullTaskId if task in ["GE09", "GE11", "GE13"] and subTask == 1: exampleStyles["edge"] = Parameters.cat( "genia_features:genia_task1", exampleStyles["edge"], msg) elif task in ["GE09", "GE11", "GE13"]: exampleStyles["edge"] = Parameters.cat("genia_features", exampleStyles["edge"], msg) elif task == "REL11": exampleStyles["edge"] = Parameters.cat("rel_features", exampleStyles["edge"], msg) elif task == "DDI11-FULL": exampleStyles["edge"] = Parameters.cat( "drugbank_features:filter_shortest_path=conj_and", exampleStyles["edge"], msg) elif task == "DDI13-FULL": exampleStyles["edge"] = Parameters.cat( "keep_neg:drugbank_features:filter_shortest_path=conj_and", exampleStyles["edge"], msg) elif task == "CO11": exampleStyles["edge"] = Parameters.cat("co_features", exampleStyles["edge"], msg) elif task == "BI11-FULL": exampleStyles["edge"] = Parameters.cat("bi_features", exampleStyles["edge"], msg) # Trigger style ####################################################### msg = "Trigger example style / " + fullTaskId if task in ["GE09", "GE11", "GE13"] and subTask == 1: exampleStyles["trigger"] = Parameters.cat("genia_task1", exampleStyles["trigger"], msg) elif task in ["EPI11", "PC13"]: exampleStyles["trigger"] = Parameters.cat("epi_merge_negated", exampleStyles["trigger"], msg) elif task == "BB11": # "bb_features:build_for_nameless:wordnet" exampleStyles["trigger"] = Parameters.cat("bb_features", exampleStyles["trigger"], msg) elif task == "BB13T3": # "bb_features:build_for_nameless:wordnet" exampleStyles["trigger"] = Parameters.cat("bb_features", exampleStyles["trigger"], msg) elif task == "REL11": exampleStyles["trigger"] = Parameters.cat("rel_features", exampleStyles["trigger"], msg) elif task in ["BI11-FULL", "DDI11-FULL"]: exampleStyles["trigger"] = "names:build_for_nameless" elif task == "DDI13-FULL": exampleStyles[ "trigger"] = 
"names:build_for_nameless:ddi13_features:drugbank_features" elif task == "BB_EVENT_16-FULL": exampleStyles["trigger"] = Parameters.cat( "bb_spans:bb_features:ontobiotope_features:build_for_nameless:all_tokens:only_types=Bacteria,Habitat,Geographical", exampleStyles["trigger"], msg) elif task in "BB_EVENT_NER_16": exampleStyles["trigger"] = Parameters.cat( "bb_spans:bb_features:ontobiotope_features:build_for_nameless:all_tokens", exampleStyles["trigger"], msg) ####################################################################### # Classifier parameters ####################################################################### if task == "DDI11": classifierParameters["examples"] = Parameters.cat( "c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold", classifierParameters["examples"], "Classifier parameters for single-stage examples" + fullTaskId) #elif task == "DDI13": # classifierParameters["examples"] = Parameters.cat("c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold", classifierParameters["examples"], "Classifier parameters for single-stage examples" + fullTaskId) elif task == "CO11": classifierParameters["edge"] = Parameters.cat( "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", classifierParameters["edge"], "Classifier parameters for edges / " + fullTaskId) classifierParameters["trigger"] = Parameters.cat( "c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000", classifierParameters["trigger"], "Classifier parameters for triggers / " + fullTaskId) classifierParameters["recall"] = Parameters.cat( "0.8,0.9,0.95,1.0", classifierParameters["recall"], "Recall adjust / " + fullTaskId) elif task == "BB_EVENT_16": classifierParameters["examples"] = Parameters.cat( "c=10,20,30,40,50,60,70,80,100,110,115,120,125,130,140,150,200,500,1000,2000,3000,4000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", classifierParameters["examples"], "Classifier parameters for edges / " + fullTaskId) elif task in ("BB_EVENT_16-FULL", "BB_EVENT_NER_16"): classifierParameters["edge"] = Parameters.cat( "c=10,20,50,80,100,110,115,120,125,130,140,150,200,500,1000", classifierParameters["edge"], "Classifier parameters for edges / " + fullTaskId) elif task == "SDB16": classifierParameters["examples"] = Parameters.cat( "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000,80000,100000,150000", classifierParameters["examples"], "Classifier parameters for single-stage examples / " + fullTaskId) # Training fold parameters ############################################ if task.startswith("DDI13"): folds["devel"] = ["train1", "train2", "train3", "train4"] folds["train"] = ["train5", "train6", "train7", "train8", "train9"] return detector, bioNLPSTParams, preprocessorParams, folds
def doGrid(self): print >> sys.stderr, "--------- Parameter grid search ---------" # Build trigger examples self.triggerDetector.buildExamples( self.model, [self.optData], [self.workDir + "grid-trigger-examples"]) if self.fullGrid: stepParams = { "trigger": Parameters.get(self.model.getStr(self.triggerDetector.tag + "classifier-parameters-train", defaultIfNotExist=""), valueListKey="c"), "booster": [float(i) for i in self.recallAdjustParameters.split(",")], "edge": Parameters.get(self.model.getStr(self.edgeDetector.tag + "classifier-parameters-train", defaultIfNotExist=""), valueListKey="c") } else: stepParams = { "trigger": Parameters.get(self.model.getStr(self.triggerDetector.tag + "classifier-parameter", defaultIfNotExist=""), valueListKey="c"), "booster": [float(i) for i in self.recallAdjustParameters.split(",")], "edge": Parameters.get(self.model.getStr(self.edgeDetector.tag + "classifier-parameter", defaultIfNotExist=""), valueListKey="c") } for step in ["trigger", "edge"]: stepParams[step] = Parameters.getCombinations(stepParams[step]) for i in range(len(stepParams[step])): stepParams[step][i] = Parameters.toString(stepParams[step][i]) print >> sys.stderr, "Parameters", [ stepParams[x] for x in ["trigger", "booster", "edge"] ] paramCombinations = combine( *[stepParams[x] for x in ["trigger", "booster", "edge"]]) print >> sys.stderr, "Combinations", paramCombinations for i in range(len(paramCombinations)): paramCombinations[i] = { "trigger": paramCombinations[i][0], "booster": paramCombinations[i][1], "edge": paramCombinations[i][2] } #paramCombinations = Parameters.getCombinations(ALL_PARAMS, ["trigger", "booster", "edge"]) prevParams = None EDGE_MODEL_STEM = os.path.join( self.edgeDetector.workDir, os.path.normpath(self.model.path) + "-edge-models/model") TRIGGER_MODEL_STEM = os.path.join( self.triggerDetector.workDir, os.path.normpath(self.model.path) + "-trigger-models/model") self.structureAnalyzer.load(self.model) bestResults = None for i in range(len(paramCombinations)): params = paramCombinations[i] print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" print >> sys.stderr, "Processing params", str(i + 1) + "/" + str( len(paramCombinations)), params print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" 
# Triggers and Boost (the trigger predictions are recalculated only when the relevant parameters change) if (prevParams == None) or ( prevParams["trigger"] != params["trigger"]) or ( prevParams["booster"] != params["booster"]): print >> sys.stderr, "Classifying trigger examples for parameters", "trigger:" + str( params["trigger"]), "booster:" + str(params["booster"]) xml = self.triggerDetector.classifyToXML( self.optData, self.model, self.workDir + "grid-trigger-examples", self.workDir + "grid-", classifierModel=TRIGGER_MODEL_STEM + Parameters.toId(params["trigger"]), recallAdjust=params["booster"], useExistingExamples=True) prevParams = params ## Build edge examples #self.edgeDetector.buildExamples(self.model, [xml], [self.workDir+"grid-edge-examples"], [self.optData]) # Classify with pre-defined model edgeClassifierModel = EDGE_MODEL_STEM + Parameters.toId( params["edge"]) xml = self.edgeDetector.classifyToXML( xml, self.model, self.workDir + "grid-edge-examples", self.workDir + "grid-", classifierModel=edgeClassifierModel, goldData=self.optData) bestResults = self.evaluateGrid(xml, params, bestResults) # Remove remaining intermediate grid files for tag1 in ["edge", "trigger", "unmerging"]: for tag2 in ["examples", "pred.xml.gz"]: if os.path.exists(self.workDir + "grid-" + tag1 + "-" + tag2): os.remove(self.workDir + "grid-" + tag1 + "-" + tag2) print >> sys.stderr, "Parameter grid search complete" print >> sys.stderr, "Tested", len(paramCombinations), "combinations" print >> sys.stderr, "Best parameters:", bestResults[0] print >> sys.stderr, "Best result:", bestResults[2] # f-score # Save grid model self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.model) self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.combinedModel, False) if self.fullGrid: # define best models self.triggerDetector.addClassifierModel( self.model, TRIGGER_MODEL_STEM + str(bestResults[0]["trigger"]), bestResults[0]["trigger"]) self.edgeDetector.addClassifierModel( self.model, EDGE_MODEL_STEM + str(bestResults[0]["edge"]), bestResults[0]["edge"]) # Remove work files for stepTag in [ self.workDir + "grid-trigger", self.workDir + "grid-edge", self.workDir + "grid-unmerging" ]: for fileStem in [ "-classifications", "-classifications.log", "examples.gz", "pred.xml.gz" ]: if os.path.exists(stepTag + fileStem): os.remove(stepTag + fileStem)
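# The grid loop above re-runs the trigger stage only when the trigger or booster value
# changes, while the edge stage is re-run for every combination. A standalone sketch of
# that caching pattern (the callables and names are placeholders, not TEES functions):
import itertools

def gridSearch(triggerValues, boosterValues, edgeValues, classifyTriggers, classifyEdges, evaluate):
    best = None
    prev = None
    triggerXML = None
    for trigger, booster, edge in itertools.product(triggerValues, boosterValues, edgeValues):
        if prev != (trigger, booster):  # trigger predictions are reused when possible
            triggerXML = classifyTriggers(trigger, booster)
            prev = (trigger, booster)
        score = evaluate(classifyEdges(triggerXML, edge))
        if best is None or score > best[0]:
            best = (score, {"trigger": trigger, "booster": booster, "edge": edge})
    return best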
def process(self, input, output, parameters=None, model=None, fromStep=None, toStep=None, omitSteps=None): self.initVariables(source=input, xml=input, outDir=os.path.dirname(output)) if os.path.basename(output) != "": self.intermediateFileTag = os.path.basename(output) else: self.intermediateFileTag = "" self.enterState(self.STATE_TOOLCHAIN, [x[0] for x in self.steps], fromStep, toStep, omitSteps) parameters = self.getParameters(parameters, model, defaultValue=NOTHING) self.applyParameters(parameters) # Run the tools print >> sys.stderr, "Tool chain parameters:", Parameters.toString( parameters, skipKeysWithValues=[NOTHING], skipDefaults=self.getDefaultParameters()) if os.path.exists(output) and not os.path.isdir(output): print >> sys.stderr, "Removing existing preprocessor output file", output os.remove(output) savedIntermediate = None # Output from a previous step if "fromStep" is used for step in self.steps: if self.checkStep(step[0]): if savedIntermediate != None: # A previous run of the program saved an intermediate file print >> sys.stderr, "Reading input from saved intermediate file", savedIntermediate self.xml = ETUtils.ETFromObj(savedIntermediate) savedIntermediate = None stepArgs = copy.copy( step[2] ) # make a copy of the arguments to which i/o can be added stepArgs[step[4]["input"]] = self.xml # the input if self.getIntermediateFilePath( step ) != None: # this step should save an intermediate file stepArgs[step[4]["output"]] = self.getIntermediateFilePath( step) print >> sys.stderr, "Running step", step[ 0], "with arguments", stepArgs step[1](**stepArgs) # call the tool elif self.getStepStatus( step[0]) == "BEFORE": # this step was run earlier savedIntermediate = self.getIntermediateFilePath(step) # End state and return xml = self.xml # state-specific member variable self.xml will be removed when exiting state self.exitState() if self.state == None: # if the whole toolchain has finished, return the final product if not os.path.isdir( output ): # if output is a directory, it was given only for storing intermediate files ... ETUtils.write(xml, output) # ... otherwise, save the final output return xml else: return None
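# process() walks an ordered list of steps, optionally resuming from a saved intermediate
# file and skipping omitted steps. The standalone sketch below shows only the
# resume-and-skip selection (names are illustrative; the real logic lives in StepSelector
# and checkStep, which also support a toStep limit):
def selectSteps(allSteps, fromStep=None, omitSteps=()):
    started = fromStep is None
    selected = []
    for step in allSteps:
        if step == fromStep:
            started = True
        if started and step not in omitSteps:
            selected.append(step)
    return selected

# Example: selectSteps(["NER", "PARSE", "DIVIDE-SETS"], fromStep="PARSE", omitSteps=["DIVIDE-SETS"])
#   -> ["PARSE"]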
def doGrid(self): print >> sys.stderr, "--------- Parameter grid search ---------" # Build trigger examples self.triggerDetector.buildExamples(self.model, [self.optData], [self.workDir+"grid-trigger-examples.gz"]) if self.fullGrid: stepParams = { "trigger":Parameters.get(self.model.getStr(self.triggerDetector.tag+"classifier-parameters-train", defaultIfNotExist=""), valueListKey="c"), "booster":[float(i) for i in self.recallAdjustParameters.split(",")], "edge":Parameters.get(self.model.getStr(self.edgeDetector.tag+"classifier-parameters-train", defaultIfNotExist=""), valueListKey="c")} else: stepParams = { "trigger":Parameters.get(self.model.getStr(self.triggerDetector.tag+"classifier-parameter", defaultIfNotExist=""), valueListKey="c"), "booster":[float(i) for i in self.recallAdjustParameters.split(",")], "edge":Parameters.get(self.model.getStr(self.edgeDetector.tag+"classifier-parameter", defaultIfNotExist=""), valueListKey="c")} for step in ["trigger", "edge"]: stepParams[step] = Parameters.getCombinations(stepParams[step]) for i in range(len(stepParams[step])): stepParams[step][i] = Parameters.toString(stepParams[step][i]) print >> sys.stderr, [stepParams[x] for x in ["trigger", "booster", "edge"]] paramCombinations = combine(*[stepParams[x] for x in ["trigger", "booster", "edge"]]) print >> sys.stderr, paramCombinations for i in range(len(paramCombinations)): paramCombinations[i] = {"trigger":paramCombinations[i][0], "booster":paramCombinations[i][1], "edge":paramCombinations[i][2]} #paramCombinations = Parameters.getCombinations(ALL_PARAMS, ["trigger", "booster", "edge"]) prevParams = None EDGE_MODEL_STEM = os.path.join(self.edgeDetector.workDir, os.path.normpath(self.model.path)+"-edge-models/model") TRIGGER_MODEL_STEM = os.path.join(self.triggerDetector.workDir, os.path.normpath(self.model.path)+"-trigger-models/model") self.structureAnalyzer.load(self.model) bestResults = None for i in range(len(paramCombinations)): params = paramCombinations[i] print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" print >> sys.stderr, "Processing params", str(i+1) + "/" + str(len(paramCombinations)), params print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" 
# Triggers and Boost if prevParams == None or prevParams["trigger"] != params["trigger"] or prevParams["booster"] != params["booster"]: print >> sys.stderr, "Classifying trigger examples for parameters", "trigger:" + str(params["trigger"]), "booster:" + str(params["booster"]) xml = self.triggerDetector.classifyToXML(self.optData, self.model, self.workDir+"grid-trigger-examples", self.workDir+"grid-", classifierModel=TRIGGER_MODEL_STEM + Parameters.toId(params["trigger"]), recallAdjust=params["booster"]) prevParams = params ## Build edge examples #self.edgeDetector.buildExamples(self.model, [xml], [self.workDir+"grid-edge-examples"], [self.optData]) # Classify with pre-defined model edgeClassifierModel = EDGE_MODEL_STEM + Parameters.toId(params["edge"]) xml = self.edgeDetector.classifyToXML(xml, self.model, self.workDir+"grid-edge-examples", self.workDir+"grid-", classifierModel=edgeClassifierModel, goldData=self.optData) bestResults = self.evaluateGrid(xml, params, bestResults) # Remove remaining intermediate grid files for tag1 in ["edge", "trigger", "unmerging"]: for tag2 in ["examples", "pred.xml.gz"]: if os.path.exists(self.workDir+"grid-"+tag1+"-"+tag2): os.remove(self.workDir+"grid-"+tag1+"-"+tag2) print >> sys.stderr, "Parameter grid search complete" print >> sys.stderr, "Tested", len(paramCombinations), "combinations" print >> sys.stderr, "Best parameters:", bestResults[0] print >> sys.stderr, "Best result:", bestResults[2] # f-score # Save grid model self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.model) self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.combinedModel, False) if self.fullGrid: # define best models self.triggerDetector.addClassifierModel(self.model, TRIGGER_MODEL_STEM+str(bestResults[0]["trigger"]), bestResults[0]["trigger"]) self.edgeDetector.addClassifierModel(self.model, EDGE_MODEL_STEM+str(bestResults[0]["edge"]), bestResults[0]["edge"]) # Remove work files for stepTag in [self.workDir+"grid-trigger", self.workDir+"grid-edge", self.workDir+"grid-unmerging"]: for fileStem in ["-classifications", "-classifications.log", "examples.gz", "pred.xml.gz"]: if os.path.exists(stepTag+fileStem): os.remove(stepTag+fileStem)
def train(self, examples, outDir, parameters, classifyExamples=None, finishBeforeReturn=False, replaceRemoteExamples=True, dummy=False): outDir = os.path.abspath(outDir) examples = self.getExampleFile(examples, replaceRemote=replaceRemoteExamples, dummy=dummy) classifyExamples = self.getExampleFile( classifyExamples, replaceRemote=replaceRemoteExamples, dummy=dummy) #parameters = Parameters.get(parameters, valueListKey="c") trainDir = os.path.normpath( self.connection.getSetting(self.trainDirSetting)) + os.path.sep # Return a new classifier instance for following the training process and using the model classifier = copy.copy(self) classifier.setState("TRAIN") classifier.parameters = parameters classifier._filesToRelease = [examples, classifyExamples] # Train if not os.path.exists(outDir): os.makedirs(outDir) #trainCommand = os.path.join(trainDir, self.trainCommand) trainCommand = self.trainCommand.replace("%d", trainDir) parameters = Parameters.get(parameters, self.parameterDefaults["train"], self.parameterAllowNew["train"], self.parameterValueListKey["train"], self.parameterValueLimits["train"], self.parameterValueTypes["train"]) paramString, idStr = self._getParameterString(parameters) classifier.parameterIdStr = idStr classifier.model = self.connection.getRemotePath( outDir + "/model" + idStr, True) modelPath = self.connection.getRemotePath(outDir + "/model" + idStr, False) trainCommand = trainCommand.replace("%p", paramString).replace( "%e", examples).replace("%m", modelPath).strip() self.connection.addCommand(trainCommand) # Classify with the trained model (optional) if classifyExamples != None: classifier.predictions = self.connection.getRemotePath( outDir + "/predictions" + idStr, True) predictionsPath = self.connection.getRemotePath( outDir + "/predictions" + idStr, False) classifyDir = os.path.normpath( self.connection.getSetting( self.classifyDirSetting)) + os.path.sep classifyCommand = self.classifyCommand.replace( "%d", classifyDir).replace("%e", classifyExamples).replace( "%m", modelPath).replace("%c", predictionsPath).strip() self.connection.addCommand(classifyCommand) # Run the process jobName = self.trainCommand.split()[0].replace("%d", "") + idStr logPath = outDir + "/" + jobName if dummy: # return a classifier that connects to an existing job self.connection.clearCommands() classifier._job = self.connection.getJob(jobDir=outDir, jobName=jobName) else: # submit the job classifier._job = self.connection.submit(jobDir=outDir, jobName=jobName, stdout=logPath + ".stdout") if finishBeforeReturn: self.connection.waitForJob(classifier._job) self.getStatus() return classifier
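# train() builds the external train/classify commands by filling placeholders in the
# classifier's command template (%d = tool directory, %p = parameter string, %e = example
# file, %m = model path; the classify command additionally fills %c = predictions path).
# A standalone sketch of the training-command substitution with a hypothetical template:
def fillCommand(template, toolDir, paramString, examples, modelPath):
    return (template.replace("%d", toolDir)
                    .replace("%p", paramString)
                    .replace("%e", examples)
                    .replace("%m", modelPath)).strip()

# Example: fillCommand("%dsvm_learn %p %e %m", "/opt/svm/", "-c 1000",
#                      "train-examples.gz", "model-c_1000")
#   -> "/opt/svm/svm_learn -c 1000 train-examples.gz model-c_1000"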
def train(output, task=None, detector=None, inputFiles=None, models=None, parse=None, processUnmerging=None, processModifiers=None, isSingleStage=False, bioNLPSTParams=None, preprocessorParams=None, exampleStyles=None, classifierParams=None, doFullGrid=False, deleteOutput=False, copyFrom=None, log="log.txt", step=None, omitSteps=None, debug=False, connection=None): """ Train a new model for event or relation detection. @param output: A directory where output files will appear. @param task: If defined, overridable default settings are used for many of the training parameters. Must be one of the supported TEES tasks. @param detector: A Detector object, or a string defining one to be imported @param inputFiles: A dictionary of file names, with keys "train", "devel", and "test" @param models: A dictionary of file names defining the place for the new models, with keys "devel" and "test" @param parse: The parse element name in the training interaction XML @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for task default. @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for task default. @param isSingleStage: False for EventDetector, True for a single stage detector. @param bioNLPSTParams: Parameters controlling BioNLP ST format output. @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying. @param exampleStyles: A parameter set for controlling example builders. @param classifierParams: A parameter set for controlling classifiers. @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search. @param deleteOutput: Remove an existing output directory @param copyFrom: Copy an existing output directory for use as a template @param log: An optional alternative name for the log file. None is for no logging. @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST" @param omitSteps: step=substep parameters, where multiple substeps can be defined.
@param debug: In debug mode, more output is shown, and some temporary intermediate files are saved @param connection: A parameter set defining a local or remote connection for training the classifier """ # Insert default arguments where needed inputFiles = Parameters.get(inputFiles, {"train":None, "devel":None, "test":None}) models = Parameters.get(models, {"devel":None, "test":None}) exampleStyles = Parameters.get(exampleStyles, {"examples":None, "trigger":None, "edge":None, "unmerging":None, "modifiers":None}) classifierParams = Parameters.get(classifierParams, {"examples":None, "trigger":None, "recall":None, "edge":None, "unmerging":None, "modifiers":None}) processUnmerging = getDefinedBool(processUnmerging) processModifiers = getDefinedBool(processModifiers) # Initialize working directory workdir(output, deleteOutput, copyFrom, log) # Get task specific parameters detector, processUnmerging, processModifiers, isSingleStage, bioNLPSTParams, preprocessorParams, exampleStyles, classifierParams, removeNamesFromEmpty = getTaskSettings(task, detector, processUnmerging, processModifiers, isSingleStage, bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParams) if task != None: task = task.replace("-MINI", "").replace("-FULL", "") # Define processing steps selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"]) # Initialize the detector detector, detectorName = getDetector(detector) detector = detector() # initialize object detector.debug = debug detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams) #detector.useBioNLPSTFormat = useBioNLPSTFormat # classify-output and grid evaluation in ST-format #detector.stWriteScores = True # write confidence scores into additional st-format files connection = getConnection(connection) detector.setConnection(connection) connection.debug = debug if deleteOutput: connection.clearWorkDir() # Train if selector.check("TRAIN"): print >> sys.stderr, "----------------------------------------------------" print >> sys.stderr, "------------------ Train Detector ------------------" print >> sys.stderr, "----------------------------------------------------" if isSingleStage: detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"], exampleStyles["examples"], classifierParams["examples"], parse, None, task, fromStep=detectorSteps["TRAIN"], workDir="training") else: detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"], exampleStyles["trigger"], exampleStyles["edge"], exampleStyles["unmerging"], exampleStyles["modifiers"], classifierParams["trigger"], classifierParams["edge"], classifierParams["unmerging"], classifierParams["modifiers"], classifierParams["recall"], processUnmerging, processModifiers, doFullGrid, task, parse, None, fromStep=detectorSteps["TRAIN"], workDir="training") # Save the detector type for model in [models["devel"], models["test"]]: if os.path.exists(model): model = Model(model, "a") model.addStr("detector", detectorName) if preprocessorParams != None: preprocessor = Preprocessor() model.addStr("preprocessorParams", Parameters.toString(preprocessor.getParameters(preprocessorParams))) model.save() model.close() if selector.check("DEVEL"): print >> sys.stderr, "----------------------------------------------------" print >> sys.stderr, "------------ Check devel classification ------------" print >> sys.stderr, "----------------------------------------------------" 
detector.classify(inputFiles["devel"], models["devel"], "classification-devel/devel", goldData=inputFiles["devel"], fromStep=detectorSteps["DEVEL"], workDir="classification-devel") if selector.check("EMPTY"): # By passing an emptied devel set through the prediction system, we can check that we get the same predictions # as in the DEVEL step, ensuring the model does not use leaked information. print >> sys.stderr, "----------------------------------------------------" print >> sys.stderr, "------------ Empty devel classification ------------" print >> sys.stderr, "----------------------------------------------------" detector.classify(getEmptyCorpus(inputFiles["devel"], removeNames=removeNamesFromEmpty), models["devel"], "classification-empty/devel-empty", fromStep=detectorSteps["EMPTY"], workDir="classification-empty") if selector.check("TEST"): print >> sys.stderr, "----------------------------------------------------" print >> sys.stderr, "------------- Test set classification --------------" print >> sys.stderr, "----------------------------------------------------" if inputFiles["test"] == None or not os.path.exists(inputFiles["test"]): print >> sys.stderr, "Skipping, test file", inputFiles["test"], "does not exist" else: detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files detector.classify(inputFiles["test"], models["test"], "classification-test/test", fromStep=detectorSteps["TEST"], workDir="classification-test") if detector.bioNLPSTParams["convert"]: Utils.STFormat.Compare.compare("classification-test/test-events.tar.gz", "classification-devel/devel-events.tar.gz", "a2")
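
# Hedged usage sketch for the train() entry point above (not called anywhere in this
# module). The corpus file names, model names and the "McCC" parse name are
# assumptions, not values taken from this code; "GE11" is one of the supported tasks.
def _exampleTrainUsage():
    train("output/GE11-model",                      # working directory for all output
          task="GE11",                              # pull in the GE'11 task defaults
          inputFiles={"train": "GE11-train.xml",    # training interaction XML
                      "devel": "GE11-devel.xml",
                      "test": None},                # skip test-set classification
          models={"devel": "model-devel", "test": "model-test"},
          parse="McCC",                             # parse element name in the interaction XML
          connection="connection=Unix:jobLimit=2")  # local connection, at most two parallel jobs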
def trainUnmergingDetector(self):
    xml = None
    if not self.unmerging:
        print >> sys.stderr, "No unmerging"
    if self.checkStep("SELF-TRAIN-EXAMPLES-FOR-UNMERGING", self.unmerging) and self.unmerging:
        # Self-classified train data for unmerging
        if self.doUnmergingSelfTraining:
            # This allows limiting to a subcorpus
            triggerStyle = copy.copy(Parameters.get(self.triggerExampleStyle))
            edgeStyle = copy.copy(Parameters.get(self.edgeExampleStyle))
            unmergingStyle = Parameters.get(self.unmergingExampleStyle)
            if "sentenceLimit" in unmergingStyle and unmergingStyle["sentenceLimit"]:
                triggerStyle["sentenceLimit"] = unmergingStyle["sentenceLimit"]
                edgeStyle["sentenceLimit"] = unmergingStyle["sentenceLimit"]
            # Build the examples
            xml = self.triggerDetector.classifyToXML(self.trainData, self.model, None, self.workDir + "unmerging-extra-", exampleStyle=triggerStyle) #, recallAdjust=0.5)
            xml = self.edgeDetector.classifyToXML(xml, self.model, None, self.workDir + "unmerging-extra-", exampleStyle=edgeStyle) #, recallAdjust=0.5)
            assert xml != None
            EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.trainData, self.parse)
        else:
            print >> sys.stderr, "No self-training for unmerging"
    if self.checkStep("UNMERGING-EXAMPLES", self.unmerging) and self.unmerging:
        # Unmerging example generation
        GOLD_TEST_FILE = self.optData.replace("-nodup", "")
        GOLD_TRAIN_FILE = self.trainData.replace("-nodup", "")
        if self.doUnmergingSelfTraining:
            if xml == None:
                xml = self.workDir + "unmerging-extra-edge-pred.xml.gz"
            self.unmergingDetector.buildExamples(self.model,
                                                 [self.optData.replace("-nodup", ""), [self.trainData.replace("-nodup", ""), xml]],
                                                 [self.workDir + "unmerging-opt-examples.gz", self.workDir + "unmerging-train-examples.gz"],
                                                 [GOLD_TEST_FILE, [GOLD_TRAIN_FILE, GOLD_TRAIN_FILE]],
                                                 exampleStyle=self.unmergingExampleStyle, saveIdsToModel=True)
            xml = None
        else:
            self.unmergingDetector.buildExamples(self.model,
                                                 [self.optData.replace("-nodup", ""), self.trainData.replace("-nodup", "")],
                                                 [self.workDir + "unmerging-opt-examples.gz", self.workDir + "unmerging-train-examples.gz"],
                                                 [GOLD_TEST_FILE, GOLD_TRAIN_FILE],
                                                 exampleStyle=self.unmergingExampleStyle, saveIdsToModel=True)
            xml = None
        #UnmergingExampleBuilder.run("/home/jari/biotext/EventExtension/TrainSelfClassify/test-predicted-edges.xml", GOLD_TRAIN_FILE, UNMERGING_TRAIN_EXAMPLE_FILE, PARSE, TOK, UNMERGING_FEATURE_PARAMS, UNMERGING_IDS, append=True)
    if self.checkStep("BEGIN-UNMERGING-MODEL", self.unmerging) and self.unmerging:
        self.unmergingDetector.beginModel(None, self.model, self.workDir + "unmerging-train-examples.gz", self.workDir + "unmerging-opt-examples.gz")
    if self.checkStep("END-UNMERGING-MODEL", self.unmerging) and self.unmerging:
        self.unmergingDetector.endModel(None, self.model, self.workDir + "unmerging-opt-examples.gz")
        print >> sys.stderr, "Adding unmerging classifier model to test-set event model"
        if self.combinedModel != None:
            self.combinedModel.addStr("unmerging-example-style", self.model.getStr("unmerging-example-style"))
            self.combinedModel.insert(self.model.get("unmerging-ids.classes"), "unmerging-ids.classes")
            self.combinedModel.insert(self.model.get("unmerging-ids.features"), "unmerging-ids.features")
            self.unmergingDetector.addClassifierModel(self.combinedModel, self.model.get("unmerging-classifier-model", True), self.model.getStr("unmerging-classifier-parameter"))
            self.combinedModel.save()
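
# Hedged illustration of the style-limiting logic used in trainUnmergingDetector()
# above: when the unmerging example style defines a "sentenceLimit", the same limit
# is copied into the trigger and edge styles so that self-training runs on the same
# subcorpus. The style strings below are made-up values for demonstration only.
def _exampleSentenceLimitPropagation():
    unmergingStyle = Parameters.get("trigger_features:sentenceLimit=100")
    triggerStyle = copy.copy(Parameters.get("typed:directed"))
    edgeStyle = copy.copy(Parameters.get("typed:directed:entities"))
    if "sentenceLimit" in unmergingStyle and unmergingStyle["sentenceLimit"]:
        triggerStyle["sentenceLimit"] = unmergingStyle["sentenceLimit"]
        edgeStyle["sentenceLimit"] = unmergingStyle["sentenceLimit"]
    return triggerStyle, edgeStyle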
def train(self, examples, outDir, parameters, classifyExamples=None, finishBeforeReturn=False, replaceRemoteExamples=True, dummy=False):
    outDir = os.path.abspath(outDir)

    examples = self.getExampleFile(examples, replaceRemote=replaceRemoteExamples, dummy=dummy)
    classifyExamples = self.getExampleFile(classifyExamples, replaceRemote=replaceRemoteExamples, dummy=dummy)
    parameters = Parameters.get(parameters, valueListKey="c")
    svmMulticlassDir = self.connection.getSetting("SVM_MULTICLASS_DIR")

    # Return a new classifier instance for following the training process and using the model
    classifier = copy.copy(self)
    classifier.setState("TRAIN")
    classifier.parameters = parameters

    # Train
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    trainCommand = svmMulticlassDir + "/svm_multiclass_learn "
    paramKeys = sorted(parameters.keys())
    idStr = ""
    for key in paramKeys:
        trainCommand += "-" + str(key) + " "
        idStr += "-" + str(key)
        if parameters[key] != None:
            trainCommand += str(parameters[key]) + " "
            idStr += "_" + str(parameters[key])
    classifier.parameterIdStr = idStr
    classifier.model = self.connection.getRemotePath(outDir + "/model" + idStr, True)
    modelPath = self.connection.getRemotePath(outDir + "/model" + idStr, False)
    trainCommand += examples + " " + modelPath
    self.connection.addCommand(trainCommand)
    # Classify with the trained model (optional)
    if classifyExamples != None:
        classifier.predictions = self.connection.getRemotePath(outDir + "/predictions" + idStr, True)
        predictionsPath = self.connection.getRemotePath(outDir + "/predictions" + idStr, False)
        classifyCommand = svmMulticlassDir + "/svm_multiclass_classify " + classifyExamples + " " + modelPath + " " + predictionsPath
        self.connection.addCommand(classifyCommand)
    # Run the process
    jobName = "svm_multiclass_learn" + idStr
    logPath = outDir + "/" + jobName
    if dummy: # return a classifier that connects to an existing job
        self.connection.clearCommands()
        classifier._job = self.connection.getJob(jobDir=outDir, jobName=jobName)
    else: # submit the job
        classifier._job = self.connection.submit(jobDir=outDir, jobName=jobName, stdout=logPath + ".stdout")
        if finishBeforeReturn:
            self.connection.waitForJob(classifier._job)
    return classifier
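
# Hedged usage sketch for the SVM multiclass train() wrapper above. It assumes
# "classifier" is an already-constructed instance of the classifier class this
# method belongs to, with its connection configured and SVM_MULTICLASS_DIR set on
# that machine; the example file names are placeholders.
def _exampleClassifierTrainUsage(classifier):
    trained = classifier.train("train-examples.gz",                  # training example file
                               "output/svm",                         # job, model and prediction directory
                               "c=1000",                             # parameters; bare values map to the "c" key
                               classifyExamples="devel-examples.gz", # also classify these examples with the new model
                               finishBeforeReturn=True)              # block until the local/remote job completes
    # The returned copy tracks the submitted job and the resulting model and prediction files
    return trained.model, trained.predictions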
def getTaskSettings(task, detector, bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParameters, folds, corpusDir=None, useKerasDetector=False):
    if task != None:
        print >> sys.stderr, "*** Defining training settings for task", task, "***"
        fullTaskId = task
        task, subTask = getSubTask(task)
        if corpusDir == None:
            corpusDir = Settings.CORPUS_DIR
        print >> sys.stderr, "Loading corpus", task, "from", corpusDir
        for dataset in ["devel", "train", "test"]:
            if inputFiles[dataset] == None:
                if task.startswith("DDI13") and task != "DDI13":
                    if dataset in ["devel", "train"]:
                        inputFiles[dataset] = os.path.join(corpusDir, "DDI13-train.xml")
                    elif dataset == "test":
                        if task.endswith("T91"):
                            inputFiles[dataset] = os.path.join(corpusDir, "DDI13-test-task9.1.xml")
                        elif task.endswith("T92") or task.endswith("FULL"):
                            inputFiles[dataset] = os.path.join(corpusDir, "DDI13-test-task9.2.xml")
                elif task == "ID11" and dataset == "train":
                    inputFiles[dataset] = Catenate.catenate([os.path.join(corpusDir, "ID11-train.xml"),
                                                             os.path.join(corpusDir, "GE11-devel.xml"),
                                                             os.path.join(corpusDir, "GE11-train.xml")],
                                                            "training/ID11-train-and-GE11-devel-and-train.xml.gz", fast=True)
                else:
                    inputFiles[dataset] = os.path.join(corpusDir, task.replace("-FULL", "") + "-" + dataset + ".xml")
            if inputFiles[dataset] == "skip":
                inputFiles[dataset] = None
            if inputFiles[dataset] != None and not os.path.exists(inputFiles[dataset]):
                fullPath = os.path.join(Settings.CORPUS_DIR, inputFiles[dataset])
                if os.path.exists(fullPath):
                    inputFiles[dataset] = fullPath
                else:
                    print >> sys.stderr, "Input file", inputFiles[dataset], "for set '" + dataset + "' does not exist, skipping."
                    inputFiles[dataset] = None
        assert inputFiles["train"] != None # at least the training set must exist

        # Detector selection
        if detector == None:
            if task == "CO11":
                detector = "Detectors.CODetector"
            elif task in ["BI11-FULL", "DDI11-FULL", "DDI13-FULL", "BB_EVENT_16-FULL"]:
                detector = "Detectors.EventDetector"
            elif task.startswith("DDI13"):
                if task.endswith("T91"):
                    detector = "Detectors.EntityDetector"
                elif task.endswith("T92") or task == "DDI13":
                    detector = "Detectors.EdgeDetector"

        #######################################################################
        # BioNLP Shared Task parameters
        #######################################################################
        if task == "BI11-FULL":
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:scores", "BioNLP Shared Task / " + fullTaskId, ["default"]) # the shared task evaluator is not designed for predicted entities
        elif task == "REL11":
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:evaluate:scores:a2Tag=rel", "BioNLP Shared Task / " + fullTaskId, ["default"])
        elif task in ("BB_EVENT_16", "BB_EVENT_16-FULL", "BB_EVENT_NER_16", "SDB16"):
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert=zip", "BioNLP Shared Task / " + fullTaskId, ["default"])
        elif task not in ["DDI11", "DDI11-FULL", "DDI13T91", "DDI13T92", "DDI13-FULL", "DDI13", "CP17", "SEMEVAL10T8"]:
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:evaluate:scores", "BioNLP Shared Task / " + fullTaskId, ["default"])
        else:
            bioNLPSTParams = "skip"

        #######################################################################
        # Preprocessing parameters
        #######################################################################
        if task in ["BI11", "BI11-FULL", "BB11", "DDI11", "DDI11-FULL", "DDI13T91", "DDI13T92", "DDI13-FULL", "DDI13"]:
            Parameters.cat("intermediateFiles:omitSteps=NER,DIVIDE-SETS", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])
        else: # parse only sentences where BANNER found an entity
            Parameters.cat("intermediateFiles:omitSteps=DIVIDE-SETS:PARSE.requireEntities", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])

        #######################################################################
        # Example style parameters
        #######################################################################
        if not useKerasDetector:
            # Example style parameters for single-stage tasks #################
            msg = "Single-stage example style / " + fullTaskId
            if task == "REN11":
                exampleStyles["examples"] = Parameters.cat("undirected:bacteria_renaming:maskTypeAsProtein=Gene", exampleStyles["examples"], msg)
            elif task == "DDI11":
                exampleStyles["examples"] = Parameters.cat("drugbank_features:ddi_mtmx:filter_shortest_path=conj_and", exampleStyles["examples"], msg)
            elif task.startswith("DDI13"):
                if task.endswith("T91"):
                    exampleStyles["examples"] = Parameters.cat("names:build_for_nameless:ddi13_features:drugbank_features", exampleStyles["examples"], msg)
                elif task.endswith("T92") or task == "DDI13":
                    exampleStyles["examples"] = Parameters.cat("keep_neg:drugbank_features:filter_shortest_path=conj_and", exampleStyles["examples"], msg)
            elif task == "BI11":
                exampleStyles["examples"] = Parameters.cat("bi_features", exampleStyles["examples"], msg)
            elif task == "BB_EVENT_16":
                exampleStyles["examples"] = Parameters.cat("keep_neg", exampleStyles["examples"], msg)
                #exampleStyles["examples"] = Parameters.cat("linear_features:keep_neg", exampleStyles["examples"], msg)
            elif task == "SDB16":
                exampleStyles["examples"] = Parameters.cat("sdb_merge:sdb_features", exampleStyles["examples"], msg)
            # Edge style ######################################################
            msg = "Edge example style / " + fullTaskId
            if task in ["GE09", "GE11", "GE13"] and subTask == 1:
                exampleStyles["edge"] = Parameters.cat("genia_features:genia_task1", exampleStyles["edge"], msg)
            elif task in ["GE09", "GE11", "GE13"]:
                exampleStyles["edge"] = Parameters.cat("genia_features", exampleStyles["edge"], msg)
            elif task == "REL11":
                exampleStyles["edge"] = Parameters.cat("rel_features", exampleStyles["edge"], msg)
            elif task == "DDI11-FULL":
                exampleStyles["edge"] = Parameters.cat("drugbank_features:filter_shortest_path=conj_and", exampleStyles["edge"], msg)
            elif task == "DDI13-FULL":
                exampleStyles["edge"] = Parameters.cat("keep_neg:drugbank_features:filter_shortest_path=conj_and", exampleStyles["edge"], msg)
            elif task == "CO11":
                exampleStyles["edge"] = Parameters.cat("co_features", exampleStyles["edge"], msg)
            elif task == "BI11-FULL":
                exampleStyles["edge"] = Parameters.cat("bi_features", exampleStyles["edge"], msg)
            # Trigger style ###################################################
            msg = "Trigger example style / " + fullTaskId
            if task in ["GE09", "GE11", "GE13"] and subTask == 1:
                exampleStyles["trigger"] = Parameters.cat("genia_task1", exampleStyles["trigger"], msg)
            elif task in ["EPI11", "PC13"]:
                exampleStyles["trigger"] = Parameters.cat("epi_merge_negated", exampleStyles["trigger"], msg)
            elif task == "BB11": # "bb_features:build_for_nameless:wordnet"
                exampleStyles["trigger"] = Parameters.cat("bb_features", exampleStyles["trigger"], msg)
            elif task == "BB13T3": # "bb_features:build_for_nameless:wordnet"
                exampleStyles["trigger"] = Parameters.cat("bb_features", exampleStyles["trigger"], msg)
            elif task == "REL11":
                exampleStyles["trigger"] = Parameters.cat("rel_features", exampleStyles["trigger"], msg)
            elif task in ["BI11-FULL", "DDI11-FULL"]:
                exampleStyles["trigger"] = "names:build_for_nameless"
            elif task == "DDI13-FULL":
                exampleStyles["trigger"] = "names:build_for_nameless:ddi13_features:drugbank_features"
            elif task == "BB_EVENT_16-FULL":
                exampleStyles["trigger"] = Parameters.cat("bb_spans:bb_features:ontobiotope_features:build_for_nameless:all_tokens:only_types=Bacteria,Habitat,Geographical", exampleStyles["trigger"], msg)
            elif task == "BB_EVENT_NER_16":
                exampleStyles["trigger"] = Parameters.cat("bb_spans:bb_features:ontobiotope_features:build_for_nameless:all_tokens", exampleStyles["trigger"], msg)

        #######################################################################
        # Classifier parameters
        #######################################################################
        if task == "DDI11":
            classifierParameters["examples"] = Parameters.cat("c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold", classifierParameters["examples"], "Classifier parameters for single-stage examples / " + fullTaskId)
        #elif task == "DDI13":
        #    classifierParameters["examples"] = Parameters.cat("c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold", classifierParameters["examples"], "Classifier parameters for single-stage examples / " + fullTaskId)
        elif task == "CO11":
            classifierParameters["edge"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", classifierParameters["edge"], "Classifier parameters for edges / " + fullTaskId)
            classifierParameters["trigger"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000", classifierParameters["trigger"], "Classifier parameters for triggers / " + fullTaskId)
            classifierParameters["recall"] = Parameters.cat("0.8,0.9,0.95,1.0", classifierParameters["recall"], "Recall adjust / " + fullTaskId)
        elif task == "BB_EVENT_16":
            classifierParameters["examples"] = Parameters.cat("c=10,20,30,40,50,60,70,80,100,110,115,120,125,130,140,150,200,500,1000,2000,3000,4000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", classifierParameters["examples"], "Classifier parameters for single-stage examples / " + fullTaskId)
        elif task in ("BB_EVENT_16-FULL", "BB_EVENT_NER_16"):
            classifierParameters["edge"] = Parameters.cat("c=10,20,50,80,100,110,115,120,125,130,140,150,200,500,1000", classifierParameters["edge"], "Classifier parameters for edges / " + fullTaskId)
        elif task == "SDB16":
            classifierParameters["examples"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000,80000,100000,150000", classifierParameters["examples"], "Classifier parameters for single-stage examples / " + fullTaskId)

        # Training fold parameters ############################################
        if task.startswith("DDI13") and task != "DDI13":
            #folds["devel"] = ["train1", "train2", "train3", "train4"]
            #folds["train"] = ["train5", "train6", "train7", "train8", "train9"]
            folds["devel"] = ["train1", "train2", "train3"]
            folds["train"] = ["train4", "train5", "train6", "train7", "train8", "train9"]

    return detector, bioNLPSTParams, preprocessorParams, folds
def train(output, task=None, detector=None, inputFiles=None, models=None, parse=None,
          processUnmerging=None, processModifiers=None,
          bioNLPSTParams=None, preprocessorParams=None, exampleStyles=None, classifierParams=None,
          doFullGrid=False, deleteOutput=False, copyFrom=None, log="log.txt",
          step=None, omitSteps=None, debug=False, connection=None, subset=None, folds=None):
    """
    Train a new model for event or relation detection.

    @param output: A directory where output files will appear.
    @param task: If defined, overridable default settings are used for many of the training parameters. Must be one of the supported TEES tasks.
    @param detector: A Detector object, or a string defining one to be imported.
    @param inputFiles: A dictionary of file names, with keys "train", "devel" and "test".
    @param models: A dictionary of file names defining the place for the new models, with keys "devel" and "test".
    @param parse: The parse element name in the training interaction XML.
    @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for the task default.
    @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for the task default.
    @param bioNLPSTParams: Parameters controlling BioNLP ST format output.
    @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying.
    @param exampleStyles: A parameter set for controlling example builders.
    @param classifierParams: A parameter set for controlling classifiers.
    @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search.
    @param deleteOutput: Remove an existing output directory.
    @param copyFrom: Copy an existing output directory for use as a template.
    @param log: An optional alternative name for the log file. None for no logging.
    @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST".
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved.
    @param connection: A parameter set defining a local or remote connection for training the classifier.
    @param subset: A parameter set for making subsets of the input files.
    @param folds: A parameter set for dividing the input files into training and development folds.
    """
    # Insert default arguments where needed
    inputFiles = setDictDefaults(inputFiles, {"train":None, "devel":None, "test":None})
    models = setDictDefaults(models, {"devel":None, "test":None})
    exampleStyles = setDictDefaults(exampleStyles, {"examples":None, "trigger":None, "edge":None, "unmerging":None, "modifiers":None})
    classifierParams = setDictDefaults(classifierParams, {"examples":None, "trigger":None, "recall":None, "edge":None, "unmerging":None, "modifiers":None})
    subset = setDictDefaults(Parameters.get(subset), {"train":None, "devel":None, "test":None, "seed":0, "all":None})
    folds = setDictDefaults(folds, {"train":None, "devel":None, "test":None})
    processUnmerging = getDefinedBool(processUnmerging)
    processModifiers = getDefinedBool(processModifiers)
    # Initialize working directory
    workdir(output, deleteOutput, copyFrom, log)
    # Get task specific parameters
    detector, bioNLPSTParams, preprocessorParams, folds = getTaskSettings(task, detector,
        bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParams, folds)
    # Learn training settings from input files
    detector = learnSettings(inputFiles, detector, classifierParams)
    # Get corpus subsets
    getFolds(inputFiles, folds)
    getSubsets(inputFiles, subset)
    if task != None:
        task = task.replace("-FULL", "")
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"])

    # Initialize the detector
    detector, detectorName = getDetector(detector)
    detector = detector() # initialize object
    detector.debug = debug
    detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams)
    #detector.useBioNLPSTFormat = useBioNLPSTFormat # classify-output and grid evaluation in ST-format
    #detector.stWriteScores = True # write confidence scores into additional st-format files
    connection = getConnection(connection)
    detector.setConnection(connection)
    connection.debug = debug
    if deleteOutput:
        connection.clearWorkDir()

    # Train
    if selector.check("TRAIN"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------------ Train Detector ------------------"
        print >> sys.stderr, "----------------------------------------------------"
        if isinstance(detector, SingleStageDetector):
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["examples"], classifierParams["examples"], parse, None, task,
                           fromStep=detectorSteps["TRAIN"], workDir="training")
        else:
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["trigger"], exampleStyles["edge"], exampleStyles["unmerging"], exampleStyles["modifiers"],
                           classifierParams["trigger"], classifierParams["edge"], classifierParams["unmerging"], classifierParams["modifiers"],
                           classifierParams["recall"], processUnmerging, processModifiers, doFullGrid, task, parse, None,
                           fromStep=detectorSteps["TRAIN"], workDir="training")
        # Save the detector type
        for model in [models["devel"], models["test"]]:
            if model != None and os.path.exists(model):
                model = Model(model, "a")
                model.addStr("detector", detectorName)
                if preprocessorParams != None:
                    preprocessor = Preprocessor()
                    model.addStr("preprocessorParams", Parameters.toString(preprocessor.getParameters(preprocessorParams)))
                model.save()
                model.close()
    if selector.check("DEVEL"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Check devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        detector.classify(inputFiles["devel"], models["devel"], "classification-devel/devel", goldData=inputFiles["devel"],
                          fromStep=detectorSteps["DEVEL"], workDir="classification-devel")
    if selector.check("EMPTY"):
        # By passing an emptied devel set through the prediction system, we can check that we get the same predictions
        # as in the DEVEL step, ensuring the model does not use leaked information.
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Empty devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        removeNames = ("names" in str(exampleStyles["examples"]) or "names" in str(exampleStyles["trigger"]))
        detector.classify(getEmptyCorpus(inputFiles["devel"], removeNames=removeNames),
                          models["devel"], "classification-empty/devel-empty",
                          fromStep=detectorSteps["EMPTY"], workDir="classification-empty")
    if selector.check("TEST"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------- Test set classification --------------"
        print >> sys.stderr, "----------------------------------------------------"
        if inputFiles["test"] == None or not os.path.exists(inputFiles["test"]):
            print >> sys.stderr, "Skipping, test file", inputFiles["test"], "does not exist"
        else:
            #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
            detector.classify(inputFiles["test"], models["test"], "classification-test/test",
                              fromStep=detectorSteps["TEST"], workDir="classification-test")
            if detector.bioNLPSTParams["convert"]:
                Utils.STFormat.Compare.compare("classification-test/test-events.tar.gz", "classification-devel/devel-events.tar.gz", "a2")
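
# Hedged usage sketch for the extended train() entry point above. The subset argument
# is given as a parameter string (parsed with Parameters.get); the exact interpretation
# of its values is up to getSubsets(). The task identifier and output paths are
# assumptions for illustration only.
def _exampleTrainWithSubsetUsage():
    train("output/DDI13T92-model",
          task="DDI13T92",                          # DDI'13 task 9.2 defaults (also defines the training folds)
          subset="train=0.5:seed=1",                # restrict the training set via getSubsets()
          connection="connection=Unix:jobLimit=1")  # local connection, one job at a time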