def getDetector(detector, model=None, evaluator=None):
    # Get the detector
    if detector == None:
        assert model != None
        model = Model(model, "r")
        detector = model.getStr("detector")
        model.close()
    return importClass(detector, "detector")

def getDetector(detector, model=None):
    # Get the detector
    if detector == None:
        assert model != None
        model = Model(model, "r")
        detector = model.getStr("detector")
        model.close()
    if type(detector) in types.StringTypes:
        print >> sys.stderr, "Importing detector", detector
        detectorName = detector
        if detector.startswith("from"):
            exec detector
            detector = eval(detector.split(".")[-1])
        else:
            exec "from " + detector + " import " + detector.split(".")[-1]
            detector = eval(detector.split(".")[-1])
    else: # assume it is a class
        detectorName = detector.__name__
        print >> sys.stderr, "Using detector", detectorName
        detector = detector
    return detector, detectorName

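# Illustrative usage sketch for the string form of getDetector() above. This is
# not part of the original module; the dotted path "Detectors.EventDetector" is
# an assumption used only for illustration (any "package.Module" path whose last
# component also names the class should resolve the same way).
def _exampleGetDetectorUsage():
    detectorClass, detectorName = getDetector("Detectors.EventDetector")
    detector = detectorClass() # getDetector returns the class, not an instance
    print >> sys.stderr, "Resolved detector", detectorName, "to class", detector.__class__.__name__
    return detector
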
def train(output, task=None, detector=None, inputFiles=None, models=None, parse=None,
          processUnmerging=None, processModifiers=None, isSingleStage=False,
          bioNLPSTParams=None, preprocessorParams=None, exampleStyles=None,
          classifierParams=None, doFullGrid=False, deleteOutput=False, copyFrom=None,
          log="log.txt", step=None, omitSteps=None, debug=False, connection=None):
    """
    Train a new model for event or relation detection.

    @param output: A directory where output files will appear.
    @param task: If defined, overridable default settings are used for many of the training parameters. Must be one of the supported TEES tasks.
    @param detector: a Detector object, or a string defining one to be imported
    @param inputFiles: A dictionary of file names, with keys "train", "devel" and "test"
    @param models: A dictionary of file names defining the place for the new models, with keys "devel" and "test"
    @param parse: The parse element name in the training interaction XML
    @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for task default.
    @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for task default.
    @param isSingleStage: False for EventDetector, True for a single stage detector.
    @param bioNLPSTParams: Parameters controlling BioNLP ST format output.
    @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying.
    @param exampleStyles: A parameter set for controlling example builders.
    @param classifierParams: A parameter set for controlling classifiers.
    @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search.
    @param deleteOutput: Remove an existing output directory
    @param copyFrom: Copy an existing output directory for use as a template
    @param log: An optional alternative name for the log file. None is for no logging.
    @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST"
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param connection: A parameter set defining a local or remote connection for training the classifier
    """
    # Insert default arguments where needed
    inputFiles = Parameters.get(inputFiles, {"train":None, "devel":None, "test":None})
    models = Parameters.get(models, {"devel":None, "test":None})
    exampleStyles = Parameters.get(exampleStyles, {"examples":None, "trigger":None, "edge":None, "unmerging":None, "modifiers":None})
    classifierParams = Parameters.get(classifierParams, {"examples":None, "trigger":None, "recall":None, "edge":None, "unmerging":None, "modifiers":None})
    processUnmerging = getDefinedBool(processUnmerging)
    processModifiers = getDefinedBool(processModifiers)
    # Initialize working directory
    workdir(output, deleteOutput, copyFrom, log)
    # Get task specific parameters
    detector, processUnmerging, processModifiers, isSingleStage, bioNLPSTParams, preprocessorParams, exampleStyles, classifierParams, removeNamesFromEmpty = getTaskSettings(
        task, detector, processUnmerging, processModifiers, isSingleStage,
        bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParams)
    if task != None:
        task = task.replace("-MINI", "").replace("-FULL", "")
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"])
    # Initialize the detector
    detector, detectorName = getDetector(detector)
    detector = detector() # initialize object
    detector.debug = debug
    detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams)
    #detector.useBioNLPSTFormat = useBioNLPSTFormat # classify-output and grid evaluation in ST-format
    #detector.stWriteScores = True # write confidence scores into additional st-format files
    connection = getConnection(connection)
    detector.setConnection(connection)
    connection.debug = debug
    if deleteOutput:
        connection.clearWorkDir()

    # Train
    if selector.check("TRAIN"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------------ Train Detector ------------------"
        print >> sys.stderr, "----------------------------------------------------"
        if isSingleStage:
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["examples"], classifierParams["examples"], parse, None, task,
                           fromStep=detectorSteps["TRAIN"], workDir="training")
        else:
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["trigger"], exampleStyles["edge"], exampleStyles["unmerging"], exampleStyles["modifiers"],
                           classifierParams["trigger"], classifierParams["edge"], classifierParams["unmerging"], classifierParams["modifiers"],
                           classifierParams["recall"], processUnmerging, processModifiers,
                           doFullGrid, task, parse, None,
                           fromStep=detectorSteps["TRAIN"], workDir="training")
        # Save the detector type
        for model in [models["devel"], models["test"]]:
            if os.path.exists(model):
                model = Model(model, "a")
                model.addStr("detector", detectorName)
                if preprocessorParams != None:
                    preprocessor = Preprocessor()
                    model.addStr("preprocessorParams", Parameters.toString(preprocessor.getParameters(preprocessorParams)))
                model.save()
                model.close()
    if selector.check("DEVEL"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Check devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        detector.classify(inputFiles["devel"], models["devel"], "classification-devel/devel",
                          goldData=inputFiles["devel"], fromStep=detectorSteps["DEVEL"], workDir="classification-devel")
    if selector.check("EMPTY"):
        # By passing an emptied devel set through the prediction system, we can check that we get the same predictions
        # as in the DEVEL step, ensuring the model does not use leaked information.
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Empty devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        detector.classify(getEmptyCorpus(inputFiles["devel"], removeNames=removeNamesFromEmpty),
                          models["devel"], "classification-empty/devel-empty",
                          fromStep=detectorSteps["EMPTY"], workDir="classification-empty")
    if selector.check("TEST"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------- Test set classification --------------"
        print >> sys.stderr, "----------------------------------------------------"
        if inputFiles["test"] == None or not os.path.exists(inputFiles["test"]):
            print >> sys.stderr, "Skipping, test file", inputFiles["test"], "does not exist"
        else:
            detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
            detector.classify(inputFiles["test"], models["test"], "classification-test/test",
                              fromStep=detectorSteps["TEST"], workDir="classification-test")
            if detector.bioNLPSTParams["convert"]:
                Utils.STFormat.Compare.compare("classification-test/test-events.tar.gz", "classification-devel/devel-events.tar.gz", "a2")

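# Illustrative usage sketch for the train() variant above. This is not part of
# the original module: the task name "GE09", the parse name "McCC" and all file
# and model paths are assumptions chosen for illustration only.
def _exampleTrainBasic():
    train("output-ge09",
          task="GE09",
          inputFiles={"train":"GE09-train.xml", "devel":"GE09-devel.xml", "test":"GE09-test.xml"},
          models={"devel":"model-devel", "test":"model-test"},
          parse="McCC",
          doFullGrid=False, # grid-search only the recall adjustment parameter
          deleteOutput=True, # start from a clean output directory
          debug=False)
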
def train(output, task=None, detector=None, inputFiles=None, models=None, parse=None,
          processUnmerging=None, processModifiers=None,
          bioNLPSTParams=None, preprocessorParams=None, exampleStyles=None,
          classifierParams=None, doFullGrid=False, deleteOutput=False, copyFrom=None,
          log="log.txt", step=None, omitSteps=None, debug=False, connection=None,
          subset=None, folds=None):
    """
    Train a new model for event or relation detection.

    @param output: A directory where output files will appear.
    @param task: If defined, overridable default settings are used for many of the training parameters. Must be one of the supported TEES tasks.
    @param detector: a Detector object, or a string defining one to be imported
    @param inputFiles: A dictionary of file names, with keys "train", "devel" and "test"
    @param models: A dictionary of file names defining the place for the new models, with keys "devel" and "test"
    @param parse: The parse element name in the training interaction XML
    @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for task default.
    @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for task default.
    @param bioNLPSTParams: Parameters controlling BioNLP ST format output.
    @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying.
    @param exampleStyles: A parameter set for controlling example builders.
    @param classifierParams: A parameter set for controlling classifiers.
    @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search.
    @param deleteOutput: Remove an existing output directory
    @param copyFrom: Copy an existing output directory for use as a template
    @param log: An optional alternative name for the log file. None is for no logging.
    @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST"
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param connection: A parameter set defining a local or remote connection for training the classifier
    @param subset: A parameter set for making subsets of input files
    @param folds: A parameter set for dividing the input files into folds
    """
    # Insert default arguments where needed
    inputFiles = setDictDefaults(inputFiles, {"train":None, "devel":None, "test":None})
    models = setDictDefaults(models, {"devel":None, "test":None})
    exampleStyles = setDictDefaults(exampleStyles, {"examples":None, "trigger":None, "edge":None, "unmerging":None, "modifiers":None})
    classifierParams = setDictDefaults(classifierParams, {"examples":None, "trigger":None, "recall":None, "edge":None, "unmerging":None, "modifiers":None})
    subset = setDictDefaults(Parameters.get(subset), {"train":None, "devel":None, "test":None, "seed":0, "all":None})
    folds = setDictDefaults(folds, {"train":None, "devel":None, "test":None})
    processUnmerging = getDefinedBool(processUnmerging)
    processModifiers = getDefinedBool(processModifiers)
    # Initialize working directory
    workdir(output, deleteOutput, copyFrom, log)
    # Get task specific parameters
    detector, bioNLPSTParams, preprocessorParams = getTaskSettings(
        task, detector, bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParams)
    # Learn training settings from input files
    detector = learnSettings(inputFiles, detector, classifierParams)
    # Get corpus subsets
    getFolds(inputFiles, folds)
    getSubsets(inputFiles, subset)
    if task != None:
        task = task.replace("-FULL", "")
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"])
    # Initialize the detector
    detector, detectorName = getDetector(detector)
    detector = detector() # initialize object
    detector.debug = debug
    detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams)
    #detector.useBioNLPSTFormat = useBioNLPSTFormat # classify-output and grid evaluation in ST-format
    #detector.stWriteScores = True # write confidence scores into additional st-format files
    connection = getConnection(connection)
    detector.setConnection(connection)
    connection.debug = debug
    if deleteOutput:
        connection.clearWorkDir()

    # Train
    if selector.check("TRAIN"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------------ Train Detector ------------------"
        print >> sys.stderr, "----------------------------------------------------"
        if isinstance(detector, SingleStageDetector):
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["examples"], classifierParams["examples"], parse, None, task,
                           fromStep=detectorSteps["TRAIN"], workDir="training")
        else:
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["trigger"], exampleStyles["edge"], exampleStyles["unmerging"], exampleStyles["modifiers"],
                           classifierParams["trigger"], classifierParams["edge"], classifierParams["unmerging"], classifierParams["modifiers"],
                           classifierParams["recall"], processUnmerging, processModifiers,
                           doFullGrid, task, parse, None,
                           fromStep=detectorSteps["TRAIN"], workDir="training")
        # Save the detector type
        for model in [models["devel"], models["test"]]:
            if model != None and os.path.exists(model):
                model = Model(model, "a")
                model.addStr("detector", detectorName)
                if preprocessorParams != None:
                    preprocessor = Preprocessor()
                    model.addStr("preprocessorParams", Parameters.toString(preprocessor.getParameters(preprocessorParams)))
                model.save()
                model.close()
    if selector.check("DEVEL"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Check devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        detector.classify(inputFiles["devel"], models["devel"], "classification-devel/devel",
                          goldData=inputFiles["devel"], fromStep=detectorSteps["DEVEL"], workDir="classification-devel")
    if selector.check("EMPTY"):
        # By passing an emptied devel set through the prediction system, we can check that we get the same predictions
        # as in the DEVEL step, ensuring the model does not use leaked information.
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Empty devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        detector.classify(getEmptyCorpus(inputFiles["devel"],
                                         removeNames=("names" in str(exampleStyles["examples"]) or "names" in str(exampleStyles["trigger"]))),
                          models["devel"], "classification-empty/devel-empty",
                          fromStep=detectorSteps["EMPTY"], workDir="classification-empty")
    if selector.check("TEST"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------- Test set classification --------------"
        print >> sys.stderr, "----------------------------------------------------"
        if inputFiles["test"] == None or not os.path.exists(inputFiles["test"]):
            print >> sys.stderr, "Skipping, test file", inputFiles["test"], "does not exist"
        else:
            #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
            detector.classify(inputFiles["test"], models["test"], "classification-test/test",
                              fromStep=detectorSteps["TEST"], workDir="classification-test")
            if detector.bioNLPSTParams["convert"]:
                Utils.STFormat.Compare.compare("classification-test/test-events.tar.gz", "classification-devel/devel-events.tar.gz", "a2")

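# Illustrative usage sketch for the step/omitSteps parameters documented in the
# train() variant above: resume a partially completed run from the DEVEL step
# and skip the empty-corpus leak check. Not part of the original module; the
# task name, parse name and paths are assumptions for illustration.
def _exampleResumeTraining():
    train("output-ge11",
          task="GE11",
          inputFiles={"train":"GE11-train.xml", "devel":"GE11-devel.xml", "test":None},
          models={"devel":"model-devel", "test":"model-test"},
          parse="McCC",
          step="DEVEL", # start processing from the DEVEL step
          omitSteps="EMPTY") # skip the empty devel classification check
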
def train(output, task=None, detector=None, inputFiles=None, models=None, parse=None,
          processUnmerging=None, processModifiers=None,
          bioNLPSTParams=None, preprocessorParams=None, exampleStyles=None,
          classifierParams=None, doFullGrid=False, deleteOutput=False, copyFrom=None,
          log="log.txt", step=None, omitSteps=None, debug=False, connection=None,
          subset=None, folds=None, corpusDir=None, corpusPreprocessing=None, evaluator=None):
    """
    Train a new model for event or relation detection.

    @param output: A directory where output files will appear.
    @param task: If defined, overridable default settings are used for many of the training parameters. Must be one of the supported TEES tasks.
    @param detector: a Detector object, or a string defining one to be imported
    @param inputFiles: A dictionary of file names, with keys "train", "devel" and "test"
    @param models: A dictionary of file names defining the place for the new models, with keys "devel" and "test"
    @param parse: The parse element name in the training interaction XML
    @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for task default.
    @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for task default.
    @param bioNLPSTParams: Parameters controlling BioNLP ST format output.
    @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying.
    @param exampleStyles: A parameter set for controlling example builders.
    @param classifierParams: A parameter set for controlling classifiers.
    @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search.
    @param deleteOutput: Remove an existing output directory
    @param copyFrom: Copy an existing output directory for use as a template
    @param log: An optional alternative name for the log file. None is for no logging.
    @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST"
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param connection: A parameter set defining a local or remote connection for training the classifier
    @param subset: A parameter set for making subsets of input files
    @param folds: A parameter set for dividing the input files into folds
    @param corpusDir: An optional directory from which task corpus files are loaded
    @param corpusPreprocessing: Preprocessor steps applied to the input corpus before training; must begin with MERGE_SETS and end with DIVIDE_SETS
    @param evaluator: An Evaluator class, or a string defining one to be imported, overriding the detector's default evaluator
    """
    # Insert default arguments where needed
    inputFiles = setDictDefaults(inputFiles, {"train":None, "devel":None, "test":None})
    models = setDictDefaults(models, {"devel":"model-devel", "test":"model-test"})
    exampleStyles = setDictDefaults(exampleStyles, {"examples":None, "trigger":None, "edge":None, "unmerging":None, "modifiers":None})
    classifierParams = setDictDefaults(classifierParams, {"examples":None, "trigger":None, "recall":None, "edge":None, "unmerging":None, "modifiers":None})
    subset = setDictDefaults(Parameters.get(subset), {"train":None, "devel":None, "test":None, "seed":0, "all":None})
    folds = setDictDefaults(folds, {"train":None, "devel":None, "test":None})
    processUnmerging = getDefinedBool(processUnmerging)
    processModifiers = getDefinedBool(processModifiers)
    # Initialize working directory
    workdir(output, deleteOutput, copyFrom, log)
    # Get task specific parameters
    useKerasDetector = False
    if detector != None and "keras" in detector.lower():
        print >> sys.stderr, "Using a Keras Detector"
        useKerasDetector = True
        if detector.lower() == "keras":
            detector = None
    detector, bioNLPSTParams, preprocessorParams, folds = getTaskSettings(
        task, detector, bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles,
        classifierParams, folds, corpusDir=corpusDir, useKerasDetector=useKerasDetector)
    # Learn training settings from input files
    detector = learnSettings(inputFiles, detector, classifierParams, task, exampleStyles, useKerasDetector=useKerasDetector)
    # Get corpus subsets
    getFolds(inputFiles, folds)
    getSubsets(inputFiles, subset)
    if task != None:
        task = task.replace("-FULL", "")
        if "." in task:
            _, subTask = getSubTask(task)
            if subTask != 3:
                processModifiers = False
    # Preprocess the corpus if required
    if corpusPreprocessing != None:
        preprocessor = Preprocessor(steps=corpusPreprocessing)
        assert preprocessor.steps[0].name == "MERGE_SETS"
        assert preprocessor.steps[-1].name == "DIVIDE_SETS"
        preprocessedCorpusDir = os.path.join(output, "corpus")
        #outputFiles = {x:os.path.join(preprocessedCorpusDir, os.path.basename(inputFiles[x])) for x in inputFiles}
        preprocessor.process(inputFiles, os.path.join(preprocessedCorpusDir, task))
        #inputFiles = outputFiles
        for setName in inputFiles.keys():
            if inputFiles[setName] != None:
                inputFiles[setName] = os.path.join(preprocessedCorpusDir, task + "-" + setName + ".xml")
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"])
    # Initialize the detector
    detector, detectorName = getDetector(detector, evaluator=evaluator)
    evaluator, evaluatorName = importClass(evaluator, "evaluator")
    detector = detector() # initialize object
    if evaluator != None:
        print >> sys.stderr, "Using evaluator", evaluator.__name__
        detector.evaluator = evaluator
    detector.debug = debug
    detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams)
    #detector.useBioNLPSTFormat = useBioNLPSTFormat # classify-output and grid evaluation in ST-format
    #detector.stWriteScores = True # write confidence scores into additional st-format files
    connection = getConnection(connection)
    detector.setConnection(connection)
    connection.debug = debug
    if deleteOutput:
        connection.clearWorkDir()

    # Train
    if selector.check("TRAIN"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------------ Train Detector ------------------"
        print >> sys.stderr, "----------------------------------------------------"
        if not isinstance(detector, EventDetector):
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["examples"], classifierParams["examples"], parse, None, task,
                           fromStep=detectorSteps["TRAIN"], workDir="training", testData=inputFiles["test"])
        else:
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["trigger"], exampleStyles["edge"], exampleStyles["unmerging"], exampleStyles["modifiers"],
                           classifierParams["trigger"], classifierParams["edge"], classifierParams["unmerging"], classifierParams["modifiers"],
                           classifierParams["recall"], processUnmerging, processModifiers,
                           doFullGrid, task, parse, None,
                           fromStep=detectorSteps["TRAIN"], workDir="training", testData=inputFiles["test"])
        # Save the detector and evaluator types
        for model in [models["devel"], models["test"]]:
            if model != None and os.path.exists(model):
                model = Model(model, "a")
                model.addStr("detector", detectorName)
                if evaluatorName != None:
                    model.addStr("evaluator", evaluatorName)
                if preprocessorParams != None:
                    preprocessor = Preprocessor()
                    model.addStr("preprocessorParams", Parameters.toString(preprocessor.getParameters(preprocessorParams)))
                model.save()
                model.close()
    if selector.check("DEVEL"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Check devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        detector.classify(inputFiles["devel"], models["devel"], "classification-devel/devel",
                          goldData=inputFiles["devel"], fromStep=detectorSteps["DEVEL"], workDir="classification-devel")
    if selector.check("EMPTY"):
        # By passing an emptied devel set through the prediction system, we can check that we get the same predictions
        # as in the DEVEL step, ensuring the model does not use leaked information.
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Empty devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        removalScope = "non-given"
        if "names" in str(exampleStyles["examples"]) or "names" in str(exampleStyles["trigger"]):
            removalScope = "all"
        elif "Edge" in detector.__class__.__name__:
            removalScope = "interactions"
        detector.classify(getEmptyCorpus(inputFiles["devel"], scope=removalScope),
                          models["devel"], "classification-empty/devel-empty",
                          fromStep=detectorSteps["EMPTY"], workDir="classification-empty")
        print >> sys.stderr, "*** Evaluate empty devel classification ***"
        if os.path.exists("classification-empty/devel-empty-pred.xml.gz"):
            EvaluateInteractionXML.run(detector.evaluator, "classification-empty/devel-empty-pred.xml.gz", inputFiles["devel"], parse)
        else:
            print >> sys.stderr, "No output file for evaluation"
    if selector.check("TEST"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------- Test set classification --------------"
        print >> sys.stderr, "----------------------------------------------------"
        if inputFiles["test"] == None or not os.path.exists(inputFiles["test"]):
            print >> sys.stderr, "Skipping, test file", inputFiles["test"], "does not exist"
        else:
            #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
            detector.classify(inputFiles["test"], models["test"] if models["test"] != None else models["devel"], "classification-test/test",
                              fromStep=detectorSteps["TEST"], workDir="classification-test")
            if detector.bioNLPSTParams["convert"]:
                extension = ".zip" if (detector.bioNLPSTParams["convert"] == "zip") else ".tar.gz"
                Utils.STFormat.Compare.compare("classification-test/test-events" + extension, "classification-devel/devel-events" + extension, "a2")
    # Stop logging
    if log != None:
        Stream.closeLog(log)

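# Illustrative usage sketch for the newest train() variant above, overriding the
# detector and evaluator. Not part of the original module: the "keras" detector
# shortcut follows the branch in the function body, while the corpus directory
# and the dotted evaluator path are assumptions for illustration. If
# corpusPreprocessing were supplied, the function asserts that its first step is
# MERGE_SETS and its last step is DIVIDE_SETS.
def _exampleTrainWithEvaluator():
    train("output-keras",
          task="GE11",
          detector="keras", # selects the Keras detector branch and the task's default Keras detector
          corpusDir="/path/to/corpora", # assumed location of the task corpus files
          evaluator="Evaluators.AveragingMultiClassEvaluator", # illustrative dotted class path
          parse="McCC",
          debug=True)
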