def train(self, trainData=None, optData=None, model=None, combinedModel=None, exampleStyle=None, classifierParameters=None, parse=None, tokenization=None, task=None, fromStep=None, toStep=None, workDir=None):
    """
    Train via EdgeDetector.train, then classify the training data itself.

    The example style is first passed through Parameters.cat together with
    "keep_neg:no_features". After EdgeDetector.train returns, 'trainData' is
    classified with 'model', using 'trainData' as the gold data as well, and
    the output stem "classification-train/train".

    NOTE(review): 'task' and 'workDir' are accepted but are not forwarded to
    EdgeDetector.train, and the classification step always uses the fixed
    work directory "classification-train" -- confirm this is intended.
    """
    classificationWorkDir = "classification-train"
    exampleStyle = Parameters.cat(exampleStyle, "keep_neg:no_features")
    # Positional arguments handed on to the parent implementation, in order.
    parentArgs = (trainData, optData, model, combinedModel, exampleStyle,
                  classifierParameters, parse, tokenization, fromStep, toStep)
    EdgeDetector.train(self, *parentArgs)
    self.classify(trainData, model, "classification-train/train",
                  goldData=trainData, workDir=classificationWorkDir)
def learnSettings(inputFiles, detector, classifierParameters):
    """
    Pick a detector (when none is given) and set default classifier parameters.

    When 'detector' is None, a StructureAnalyzer is used: if
    "training/structure.txt" does not exist, the non-empty "train" and "devel"
    entries of 'inputFiles' are analyzed and the analysis is saved to that
    file; otherwise the existing analysis is loaded. The detector is then
    chosen from the analyzer's targets:

      * "ENTITY" and "INTERACTION" -> "Detectors.EventDetector"
      * only "ENTITY"              -> "Detectors.EntityDetector"
      * only "INTERACTION"         -> "Detectors.EdgeDetector"

    For the three detectors above, the matching entries of
    'classifierParameters' are replaced in place with the result of
    Parameters.cat(default, currentValue, message).

    Args:
        inputFiles: mapping with at least the keys "train" and "devel"
            (only read when 'detector' is None).
        detector: detector name, or None to determine it from the data.
        classifierParameters: mapping updated in place; the keys read depend
            on the detector ("unmerging", "modifiers", "edge", "trigger",
            "recall" or "examples").

    Returns:
        The detector name (the given one, or the one chosen here).

    Raises:
        AssertionError: if the analysis contains neither "ENTITY" nor
            "INTERACTION" targets. Raised explicitly so that the check is
            not stripped when Python runs with -O.
    """
    if detector is None:
        print >> sys.stderr, "*** Analyzing input files to determine training settings ***"
        structureAnalyzer = StructureAnalyzer()
        if not os.path.exists("training/structure.txt"):
            datasets = sorted(filter(None, [inputFiles["train"], inputFiles["devel"]]))
            print >> sys.stderr, "input files:", datasets
            structureAnalyzer.analyze(datasets)
            print >> sys.stderr, structureAnalyzer.toString()
            structureAnalyzer.save(None, "training/structure.txt")
        else:
            print >> sys.stderr, "Using existing analysis from training/structure.txt"
            structureAnalyzer.load(None, "training/structure.txt")

        # Choose detector
        targets = structureAnalyzer.targets
        if "ENTITY" in targets and "INTERACTION" in targets:
            detector = "Detectors.EventDetector"
        elif "ENTITY" in targets:
            detector = "Detectors.EntityDetector"
        elif "INTERACTION" in targets:
            detector = "Detectors.EdgeDetector"
        else:
            raise AssertionError(targets)
    print >> sys.stderr, "Using detector '" + str(detector) + "'"

    # Set default parameters
    if detector == "Detectors.EventDetector":
        classifierParameters["unmerging"] = Parameters.cat(
            "c=1,10,100,500,1000,1500,2500,5000,10000,20000,50000,80000,100000",
            classifierParameters["unmerging"],
            "Classifier parameters for unmerging")
        classifierParameters["modifiers"] = Parameters.cat(
            "c=5000,10000,20000,50000,100000",
            classifierParameters["modifiers"],
            "Classifier parameters for modifiers")
        classifierParameters["edge"] = Parameters.cat(
            "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
            classifierParameters["edge"],
            "Classifier parameters for edges")
        classifierParameters["trigger"] = Parameters.cat(
            "c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000",
            classifierParameters["trigger"],
            "Classifier parameters for triggers")
        classifierParameters["recall"] = Parameters.cat(
            "0.5,0.6,0.65,0.7,0.85,1.0,1.1,1.2",
            classifierParameters["recall"],
            "Recall adjustment parameters")
    elif detector == "Detectors.EntityDetector":
        classifierParameters["examples"] = Parameters.cat(
            "c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000",
            classifierParameters["examples"],
            "Classifier parameters for entities")
    elif detector == "Detectors.EdgeDetector":
        classifierParameters["examples"] = Parameters.cat(
            "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
            classifierParameters["examples"],
            "Classifier parameters for edges")
    return detector
def learnSettings(inputFiles, detector, classifierParameters):
    """
    Pick a detector (when none is given) and set default classifier parameters.

    When 'detector' is None, a StructureAnalyzer is used: if
    "training/structure.txt" does not exist, the non-empty "train" and "devel"
    entries of 'inputFiles' are analyzed and the analysis is saved to that
    file; otherwise the existing analysis is loaded. The detector is then
    chosen from the analyzer's targets:

      * "ENTITY" and "INTERACTION" -> "Detectors.EventDetector"
      * only "ENTITY"              -> "Detectors.EntityDetector"
      * only "INTERACTION"         -> "Detectors.EdgeDetector"

    For the three detectors above, the matching entries of
    'classifierParameters' are replaced in place with the result of
    Parameters.cat(default, currentValue, message).

    Args:
        inputFiles: mapping with at least the keys "train" and "devel"
            (only read when 'detector' is None).
        detector: detector name, or None to determine it from the data.
        classifierParameters: mapping updated in place; the keys read depend
            on the detector ("unmerging", "modifiers", "edge", "trigger",
            "recall" or "examples").

    Returns:
        The detector name (the given one, or the one chosen here).

    Raises:
        AssertionError: if the analysis contains neither "ENTITY" nor
            "INTERACTION" targets. Raised explicitly so that the check is
            not stripped when Python runs with -O.
    """
    if detector is None:
        print >> sys.stderr, "*** Analyzing input files to determine training settings ***"
        structureAnalyzer = StructureAnalyzer()
        if not os.path.exists("training/structure.txt"):
            datasets = sorted(filter(None, [inputFiles["train"], inputFiles["devel"]]))
            print >> sys.stderr, "input files:", datasets
            structureAnalyzer.analyze(datasets)
            print >> sys.stderr, structureAnalyzer.toString()
            structureAnalyzer.save(None, "training/structure.txt")
        else:
            print >> sys.stderr, "Using existing analysis from training/structure.txt"
            structureAnalyzer.load(None, "training/structure.txt")

        # Choose detector
        targets = structureAnalyzer.targets
        if "ENTITY" in targets and "INTERACTION" in targets:
            detector = "Detectors.EventDetector"
        elif "ENTITY" in targets:
            detector = "Detectors.EntityDetector"
        elif "INTERACTION" in targets:
            detector = "Detectors.EdgeDetector"
        else:
            raise AssertionError(targets)
    print >> sys.stderr, "Using detector '" + str(detector) + "'"

    # Set default parameters
    if detector == "Detectors.EventDetector":
        classifierParameters["unmerging"] = Parameters.cat(
            "c=1,10,100,500,1000,1500,2500,5000,10000,20000,50000,80000,100000",
            classifierParameters["unmerging"],
            "Classifier parameters for unmerging")
        classifierParameters["modifiers"] = Parameters.cat(
            "c=5000,10000,20000,50000,100000",
            classifierParameters["modifiers"],
            "Classifier parameters for modifiers")
        classifierParameters["edge"] = Parameters.cat(
            "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
            classifierParameters["edge"],
            "Classifier parameters for edges")
        classifierParameters["trigger"] = Parameters.cat(
            "c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000",
            classifierParameters["trigger"],
            "Classifier parameters for triggers")
        classifierParameters["recall"] = Parameters.cat(
            "0.5,0.6,0.65,0.7,0.85,1.0,1.1,1.2",
            classifierParameters["recall"],
            "Recall adjustment parameters")
    elif detector == "Detectors.EntityDetector":
        classifierParameters["examples"] = Parameters.cat(
            "c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000",
            classifierParameters["examples"],
            "Classifier parameters for entities")
    elif detector == "Detectors.EdgeDetector":
        classifierParameters["examples"] = Parameters.cat(
            "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
            classifierParameters["examples"],
            "Classifier parameters for edges")
    return detector
def getTaskSettings(task, detector, bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParameters):
    """
    Apply the predefined training settings of a named task.

    When 'task' is None nothing is changed and the detector, Shared Task
    and preprocessor arguments are returned as given. Otherwise:

      * a "NAME.N" task id is split into the task name and an integer
        subtask (the subtask defaults to 2);
      * unset entries of 'inputFiles' ("devel", "train", "test") are filled
        with corpus paths under Settings.CORPUS_DIR, the string "None" is
        turned into None, and entries whose file does not exist are set to
        None after logging the missing path;
      * detector, Shared Task parameters, example styles and classifier
        parameters are set per task, mostly through Parameters.cat.

    'inputFiles', 'exampleStyles' and 'classifierParameters' are modified
    in place.

    Returns:
        Tuple (detector, bioNLPSTParams, preprocessorParams).

    Raises:
        AssertionError: if no training file is available for the task.
    """
    if task is not None:
        print >> sys.stderr, "*** Defining training settings for task", task, "***"
        fullTaskId = task
        subTask = 2
        if "." in task:
            task, subTask = task.split(".")
            subTask = int(subTask)
        dataPath = Settings.CORPUS_DIR
        for dataset in ["devel", "train", "test"]:
            if inputFiles[dataset] is None:
                if task == "ID11" and dataset == "train":
                    # ID11 is trained on its own training set plus the GE11 data
                    inputFiles[dataset] = Catenate.catenate(
                        [os.path.join(dataPath, "ID11-train.xml"),
                         os.path.join(dataPath, "GE11-devel.xml"),
                         os.path.join(dataPath, "GE11-train.xml")],
                        "training/ID11-train-and-GE11-devel-and-train.xml.gz",
                        fast=True)
                else:
                    inputFiles[dataset] = os.path.join(dataPath, task.replace("-FULL", "") + "-" + dataset + ".xml")
            if inputFiles[dataset] == "None":
                inputFiles[dataset] = None
            if inputFiles[dataset] is not None and not os.path.exists(inputFiles[dataset]):
                # Remember the path before clearing the entry so that the
                # message names the missing file instead of printing None.
                missingPath = inputFiles[dataset]
                inputFiles[dataset] = None
                print >> sys.stderr, "Input file", missingPath, "for set '" + dataset + "' does not exist, skipping."
        assert inputFiles["train"] is not None # at least training set must exist

        # Example generation parameters
        if task == "CO11":
            detector = "Detectors.CODetector"
        elif task in ["BI11-FULL", "DDI11-FULL"]:
            detector = "Detectors.EventDetector"

        # BioNLP Shared Task and preprocessing parameters
        sharedTaskMsg = "BioNLP Shared Task / " + fullTaskId
        if task == "BI11-FULL":
            # the shared task evaluator is not designed for predicted entities
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:scores", sharedTaskMsg, ["default"])
        elif task == "REL11":
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:evaluate:scores:a2Tag=rel", sharedTaskMsg, ["default"])
        elif task not in ["DDI11", "DDI11-FULL", "DDI13"]:
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:evaluate:scores", sharedTaskMsg, ["default"])

        # Preprocessing parameters
        # NOTE(review): the value returned by Parameters.cat is discarded
        # here, so 'preprocessorParams' is returned unchanged unless
        # Parameters.cat has side effects -- confirm this is intended.
        if task in ["BI11", "BI11-FULL", "BB11", "DDI11", "DDI11-FULL"]:
            Parameters.cat("intermediateFiles:omitSteps=NER,DIVIDE-SETS", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])
        else: # parse only sentences where BANNER found an entity
            Parameters.cat("intermediateFiles:omitSteps=DIVIDE-SETS:PARSE.requireEntities", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])

        # Example style parameters for single-stage tasks
        singleStageMsg = "Single-stage example style / " + fullTaskId
        edgeMsg = "Edge example style / " + fullTaskId
        if task == "REN11":
            exampleStyles["examples"] = Parameters.cat("undirected:bacteria_renaming:maskTypeAsProtein=Gene", exampleStyles["examples"], singleStageMsg)
        elif task == "DDI11":
            exampleStyles["examples"] = Parameters.cat("drugbank_features:ddi_mtmx:filter_shortest_path=conj_and", exampleStyles["examples"], singleStageMsg)
        elif task == "DDI13":
            exampleStyles["examples"] = Parameters.cat("keep_neg:drugbank_features:filter_shortest_path=conj_and", exampleStyles["examples"], singleStageMsg)
        elif task == "BI11":
            exampleStyles["edge"] = Parameters.cat("bi_features", exampleStyles["edge"], edgeMsg)

        # Edge style
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            exampleStyles["edge"] = Parameters.cat("genia_features:genia_task1", exampleStyles["edge"])
        elif task in ["GE09", "GE11", "GE13"]:
            exampleStyles["edge"] = Parameters.cat("genia_features", exampleStyles["edge"])
        elif task == "REL11":
            exampleStyles["edge"] = Parameters.cat("rel_features", exampleStyles["edge"], edgeMsg)
        elif task == "DDI11-FULL":
            exampleStyles["edge"] = Parameters.cat("drugbank_features:filter_shortest_path=conj_and", exampleStyles["edge"], edgeMsg)
        elif task == "CO11":
            exampleStyles["edge"] = Parameters.cat("co_features", exampleStyles["edge"], edgeMsg)
        elif task == "BI11-FULL":
            exampleStyles["edge"] = Parameters.cat("bi_features", exampleStyles["edge"], edgeMsg)

        # Trigger style
        triggerMsg = "Trigger example style / " + fullTaskId
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            exampleStyles["trigger"] = Parameters.cat("genia_task1", exampleStyles["trigger"], triggerMsg)
        elif task in ["EPI11", "PC13"]:
            exampleStyles["trigger"] = Parameters.cat("epi_merge_negated", exampleStyles["trigger"], triggerMsg)
        elif task == "BB11": # "bb_features:build_for_nameless:wordnet"
            exampleStyles["trigger"] = Parameters.cat("bb_features:build_for_nameless", exampleStyles["trigger"], triggerMsg)
        elif task == "BB13T3": # "bb_features:build_for_nameless:wordnet"
            exampleStyles["trigger"] = Parameters.cat("bb_features:build_for_nameless", exampleStyles["trigger"], triggerMsg)
        elif task == "REL11":
            exampleStyles["trigger"] = Parameters.cat("rel_features", exampleStyles["trigger"], triggerMsg)
        elif task in ["BI11-FULL", "DDI11-FULL"]:
            # Assigned directly: any caller-supplied trigger style is replaced.
            exampleStyles["trigger"] = "build_for_nameless:names"

        # Classifier parameters
        # (a dedicated DDI13 parameter grid existed here as disabled code)
        if task == "DDI11":
            classifierParameters["examples"] = Parameters.cat(
                "c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold",
                classifierParameters["examples"],
                "Classifier parameters for single-stage examples" + fullTaskId)
        elif task == "CO11":
            # Each default is combined with the caller's value for the SAME
            # key; previously "edge" and "trigger" were combined with the
            # unrelated "examples" entry.
            classifierParameters["edge"] = Parameters.cat(
                "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
                classifierParameters["edge"],
                "Classifier parameters for edges / " + fullTaskId)
            classifierParameters["trigger"] = Parameters.cat(
                "c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000",
                classifierParameters["trigger"],
                "Classifier parameters for triggers / " + fullTaskId)
            classifierParameters["recall"] = Parameters.cat(
                "0.8,0.9,0.95,1.0",
                classifierParameters["recall"],
                "Recall adjust / " + fullTaskId)

    return detector, bioNLPSTParams, preprocessorParams
def getTaskSettings(task, detector, bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParameters):
    """
    Apply the predefined training settings of a named task.

    When 'task' is None nothing is changed and the detector, Shared Task
    and preprocessor arguments are returned as given. Otherwise:

      * a "NAME.N" task id is split into the task name and an integer
        subtask (the subtask defaults to 2);
      * unset entries of 'inputFiles' ("devel", "train", "test") are filled
        with corpus paths under Settings.CORPUS_DIR, the string "None" is
        turned into None, and entries whose file does not exist are set to
        None after logging the missing path;
      * detector, Shared Task parameters, example styles and classifier
        parameters are set per task, mostly through Parameters.cat.

    'inputFiles', 'exampleStyles' and 'classifierParameters' are modified
    in place.

    Returns:
        Tuple (detector, bioNLPSTParams, preprocessorParams).

    Raises:
        AssertionError: if no training file is available for the task.
    """
    if task is not None:
        print >> sys.stderr, "*** Defining training settings for task", task, "***"
        fullTaskId = task
        subTask = 2
        if "." in task:
            task, subTask = task.split(".")
            subTask = int(subTask)
        dataPath = Settings.CORPUS_DIR
        for dataset in ["devel", "train", "test"]:
            if inputFiles[dataset] is None:
                if task == "ID11" and dataset == "train":
                    # ID11 is trained on its own training set plus the GE11 data
                    inputFiles[dataset] = Catenate.catenate(
                        [os.path.join(dataPath, "ID11-train.xml"),
                         os.path.join(dataPath, "GE11-devel.xml"),
                         os.path.join(dataPath, "GE11-train.xml")],
                        "training/ID11-train-and-GE11-devel-and-train.xml.gz",
                        fast=True)
                else:
                    inputFiles[dataset] = os.path.join(dataPath, task.replace("-FULL", "") + "-" + dataset + ".xml")
            if inputFiles[dataset] == "None":
                inputFiles[dataset] = None
            if inputFiles[dataset] is not None and not os.path.exists(inputFiles[dataset]):
                # Remember the path before clearing the entry so that the
                # message names the missing file instead of printing None.
                missingPath = inputFiles[dataset]
                inputFiles[dataset] = None
                print >> sys.stderr, "Input file", missingPath, "for set '" + dataset + "' does not exist, skipping."
        assert inputFiles["train"] is not None # at least training set must exist

        # Example generation parameters
        if task == "CO11":
            detector = "Detectors.CODetector"
        elif task in ["BI11-FULL", "DDI11-FULL"]:
            detector = "Detectors.EventDetector"

        # BioNLP Shared Task and preprocessing parameters
        sharedTaskMsg = "BioNLP Shared Task / " + fullTaskId
        if task == "BI11-FULL":
            # the shared task evaluator is not designed for predicted entities
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:scores", sharedTaskMsg, ["default"])
        elif task == "REL11":
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:evaluate:scores:a2Tag=rel", sharedTaskMsg, ["default"])
        elif task not in ["DDI11", "DDI11-FULL", "DDI13"]:
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:evaluate:scores", sharedTaskMsg, ["default"])

        # Preprocessing parameters
        # NOTE(review): the value returned by Parameters.cat is discarded
        # here, so 'preprocessorParams' is returned unchanged unless
        # Parameters.cat has side effects -- confirm this is intended.
        if task in ["BI11", "BI11-FULL", "BB11", "DDI11", "DDI11-FULL"]:
            Parameters.cat("intermediateFiles:omitSteps=NER,DIVIDE-SETS", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])
        else: # parse only sentences where BANNER found an entity
            Parameters.cat("intermediateFiles:omitSteps=DIVIDE-SETS:PARSE.requireEntities", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])

        # Example style parameters for single-stage tasks
        singleStageMsg = "Single-stage example style / " + fullTaskId
        edgeMsg = "Edge example style / " + fullTaskId
        if task == "REN11":
            exampleStyles["examples"] = Parameters.cat("undirected:bacteria_renaming:maskTypeAsProtein=Gene", exampleStyles["examples"], singleStageMsg)
        elif task == "DDI11":
            exampleStyles["examples"] = Parameters.cat("drugbank_features:ddi_mtmx:filter_shortest_path=conj_and", exampleStyles["examples"], singleStageMsg)
        elif task == "DDI13":
            exampleStyles["examples"] = Parameters.cat("keep_neg:drugbank_features:filter_shortest_path=conj_and", exampleStyles["examples"], singleStageMsg)
        elif task == "BI11":
            exampleStyles["edge"] = Parameters.cat("bi_features", exampleStyles["edge"], edgeMsg)

        # Edge style
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            exampleStyles["edge"] = Parameters.cat("genia_features:genia_task1", exampleStyles["edge"])
        elif task in ["GE09", "GE11", "GE13"]:
            exampleStyles["edge"] = Parameters.cat("genia_features", exampleStyles["edge"])
        elif task == "REL11":
            exampleStyles["edge"] = Parameters.cat("rel_features", exampleStyles["edge"], edgeMsg)
        elif task == "DDI11-FULL":
            exampleStyles["edge"] = Parameters.cat("drugbank_features:filter_shortest_path=conj_and", exampleStyles["edge"], edgeMsg)
        elif task == "CO11":
            exampleStyles["edge"] = Parameters.cat("co_features", exampleStyles["edge"], edgeMsg)
        elif task == "BI11-FULL":
            exampleStyles["edge"] = Parameters.cat("bi_features", exampleStyles["edge"], edgeMsg)

        # Trigger style
        triggerMsg = "Trigger example style / " + fullTaskId
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            exampleStyles["trigger"] = Parameters.cat("genia_task1", exampleStyles["trigger"], triggerMsg)
        elif task in ["EPI11", "PC13"]:
            exampleStyles["trigger"] = Parameters.cat("epi_merge_negated", exampleStyles["trigger"], triggerMsg)
        elif task == "BB11": # "bb_features:build_for_nameless:wordnet"
            exampleStyles["trigger"] = Parameters.cat("bb_features:build_for_nameless", exampleStyles["trigger"], triggerMsg)
        elif task == "BB13T3": # "bb_features:build_for_nameless:wordnet"
            exampleStyles["trigger"] = Parameters.cat("bb_features:build_for_nameless", exampleStyles["trigger"], triggerMsg)
        elif task == "REL11":
            exampleStyles["trigger"] = Parameters.cat("rel_features", exampleStyles["trigger"], triggerMsg)
        elif task in ["BI11-FULL", "DDI11-FULL"]:
            # Assigned directly: any caller-supplied trigger style is replaced.
            exampleStyles["trigger"] = "build_for_nameless:names"

        # Classifier parameters
        # (a dedicated DDI13 parameter grid existed here as disabled code)
        if task == "DDI11":
            classifierParameters["examples"] = Parameters.cat(
                "c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold",
                classifierParameters["examples"],
                "Classifier parameters for single-stage examples" + fullTaskId)
        elif task == "CO11":
            # Each default is combined with the caller's value for the SAME
            # key; previously "edge" and "trigger" were combined with the
            # unrelated "examples" entry.
            classifierParameters["edge"] = Parameters.cat(
                "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
                classifierParameters["edge"],
                "Classifier parameters for edges / " + fullTaskId)
            classifierParameters["trigger"] = Parameters.cat(
                "c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000",
                classifierParameters["trigger"],
                "Classifier parameters for triggers / " + fullTaskId)
            classifierParameters["recall"] = Parameters.cat(
                "0.8,0.9,0.95,1.0",
                classifierParameters["recall"],
                "Recall adjust / " + fullTaskId)

    return detector, bioNLPSTParams, preprocessorParams
def getTaskSettings(task, detector, bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParameters, folds, corpusDir=None):
    """
    Apply the predefined training settings of a named task.

    When 'task' is None nothing is changed and the detector, Shared Task,
    preprocessor and fold arguments are returned as given. Otherwise:

      * a "NAME.N" task id is split into the task name and an integer
        subtask (the subtask defaults to 2);
      * unset entries of 'inputFiles' ("devel", "train", "test") are filled
        with corpus paths under 'corpusDir' (Settings.CORPUS_DIR when
        'corpusDir' is None); the string "None" is turned into None; a path
        that does not exist is retried relative to Settings.CORPUS_DIR and,
        if still missing, logged and set to None;
      * detector, Shared Task parameters, example styles and classifier
        parameters are set per task, mostly through Parameters.cat;
      * for DDI13 tasks the "devel" and "train" entries of 'folds' are set.

    'inputFiles', 'exampleStyles', 'classifierParameters' and 'folds' are
    modified in place.

    Returns:
        Tuple (detector, bioNLPSTParams, preprocessorParams, folds).

    Raises:
        AssertionError: if no training file is available for the task.
    """
    if task is not None:
        print >> sys.stderr, "*** Defining training settings for task", task, "***"
        fullTaskId = task
        subTask = 2
        if "." in task:
            task, subTask = task.split(".")
            subTask = int(subTask)
        if corpusDir is None:
            corpusDir = Settings.CORPUS_DIR
        for dataset in ["devel", "train", "test"]:
            if inputFiles[dataset] is None:
                if task.startswith("DDI13"):
                    # DDI13 uses one training file for both devel and train
                    # (see the fold settings at the end of this function)
                    if dataset in ["devel", "train"]:
                        inputFiles[dataset] = os.path.join(corpusDir, "DDI13-train.xml")
                    elif dataset == "test":
                        if task.endswith("T91"):
                            inputFiles[dataset] = os.path.join(corpusDir, "DDI13-test-task9.1.xml")
                        elif task.endswith("T92") or task.endswith("FULL"):
                            inputFiles[dataset] = os.path.join(corpusDir, "DDI13-test-task9.2.xml")
                elif task == "ID11" and dataset == "train":
                    # ID11 is trained on its own training set plus the GE11 data
                    inputFiles[dataset] = Catenate.catenate(
                        [os.path.join(corpusDir, "ID11-train.xml"),
                         os.path.join(corpusDir, "GE11-devel.xml"),
                         os.path.join(corpusDir, "GE11-train.xml")],
                        "training/ID11-train-and-GE11-devel-and-train.xml.gz",
                        fast=True)
                else:
                    inputFiles[dataset] = os.path.join(corpusDir, task.replace("-FULL", "") + "-" + dataset + ".xml")
            if inputFiles[dataset] == "None":
                inputFiles[dataset] = None
            if inputFiles[dataset] is not None and not os.path.exists(inputFiles[dataset]):
                # NOTE(review): the fallback looks in Settings.CORPUS_DIR even
                # when 'corpusDir' was given -- confirm this is intended.
                fullPath = os.path.join(Settings.CORPUS_DIR, inputFiles[dataset])
                if os.path.exists(fullPath):
                    inputFiles[dataset] = fullPath
                else:
                    # Remember the path before clearing the entry so that the
                    # message names the missing file instead of printing None.
                    missingPath = inputFiles[dataset]
                    inputFiles[dataset] = None
                    print >> sys.stderr, "Input file", missingPath, "for set '" + dataset + "' does not exist, skipping."
        assert inputFiles["train"] is not None # at least training set must exist

        # Example generation parameters
        if task == "CO11":
            detector = "Detectors.CODetector"
        elif task in ["BI11-FULL", "DDI11-FULL", "DDI13-FULL", "BB_EVENT_16-FULL"]:
            detector = "Detectors.EventDetector"
        elif task.startswith("DDI13"):
            if task.endswith("T91"):
                detector = "Detectors.EntityDetector"
            elif task.endswith("T92"):
                detector = "Detectors.EdgeDetector"

        #######################################################################
        # BioNLP Shared Task and preprocessing parameters
        #######################################################################
        msg = "BioNLP Shared Task / " + fullTaskId
        if task == "BI11-FULL":
            # the shared task evaluator is not designed for predicted entities
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:scores", msg, ["default"])
        elif task == "REL11":
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:evaluate:scores:a2Tag=rel", msg, ["default"])
        elif task in ("BB_EVENT_16", "BB_EVENT_16-FULL", "BB_EVENT_NER_16", "SDB16"):
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert=zip", msg, ["default"])
        elif task not in ["DDI11", "DDI11-FULL", "DDI13T91", "DDI13T92", "DDI13-FULL"]:
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:evaluate:scores", msg, ["default"])

        #######################################################################
        # Preprocessing parameters
        #######################################################################
        # NOTE(review): the value returned by Parameters.cat is discarded
        # here, so 'preprocessorParams' is returned unchanged unless
        # Parameters.cat has side effects -- confirm this is intended.
        if task in ["BI11", "BI11-FULL", "BB11", "DDI11", "DDI11-FULL", "DDI13T91", "DDI13T92", "DDI13-FULL"]:
            Parameters.cat("intermediateFiles:omitSteps=NER,DIVIDE-SETS", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])
        else: # parse only sentences where BANNER found an entity
            Parameters.cat("intermediateFiles:omitSteps=DIVIDE-SETS:PARSE.requireEntities", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])

        #######################################################################
        # Example style parameters
        #######################################################################
        # Example style parameters for single-stage tasks #####################
        msg = "Single-stage example style / " + fullTaskId
        if task == "REN11":
            exampleStyles["examples"] = Parameters.cat("undirected:bacteria_renaming:maskTypeAsProtein=Gene", exampleStyles["examples"], msg)
        elif task == "DDI11":
            exampleStyles["examples"] = Parameters.cat("drugbank_features:ddi_mtmx:filter_shortest_path=conj_and", exampleStyles["examples"], msg)
        elif task.startswith("DDI13"):
            if task.endswith("T91"):
                exampleStyles["examples"] = Parameters.cat("names:build_for_nameless:ddi13_features:drugbank_features", exampleStyles["examples"], msg)
            elif task.endswith("T92"):
                exampleStyles["examples"] = Parameters.cat("keep_neg:drugbank_features:filter_shortest_path=conj_and", exampleStyles["examples"], msg)
        elif task == "BI11":
            exampleStyles["examples"] = Parameters.cat("bi_features", exampleStyles["examples"], msg)
        elif task == "BB_EVENT_16":
            # an alternative default, "linear_features:keep_neg", was disabled here
            exampleStyles["examples"] = Parameters.cat("keep_neg", exampleStyles["examples"], msg)
        elif task == "SDB16":
            exampleStyles["examples"] = Parameters.cat("sdb_merge:sdb_features", exampleStyles["examples"], msg)

        # Edge style ##########################################################
        msg = "Edge example style / " + fullTaskId
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            exampleStyles["edge"] = Parameters.cat("genia_features:genia_task1", exampleStyles["edge"], msg)
        elif task in ["GE09", "GE11", "GE13"]:
            exampleStyles["edge"] = Parameters.cat("genia_features", exampleStyles["edge"], msg)
        elif task == "REL11":
            exampleStyles["edge"] = Parameters.cat("rel_features", exampleStyles["edge"], msg)
        elif task == "DDI11-FULL":
            exampleStyles["edge"] = Parameters.cat("drugbank_features:filter_shortest_path=conj_and", exampleStyles["edge"], msg)
        elif task == "DDI13-FULL":
            exampleStyles["edge"] = Parameters.cat("keep_neg:drugbank_features:filter_shortest_path=conj_and", exampleStyles["edge"], msg)
        elif task == "CO11":
            exampleStyles["edge"] = Parameters.cat("co_features", exampleStyles["edge"], msg)
        elif task == "BI11-FULL":
            exampleStyles["edge"] = Parameters.cat("bi_features", exampleStyles["edge"], msg)

        # Trigger style #######################################################
        msg = "Trigger example style / " + fullTaskId
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            exampleStyles["trigger"] = Parameters.cat("genia_task1", exampleStyles["trigger"], msg)
        elif task in ["EPI11", "PC13"]:
            exampleStyles["trigger"] = Parameters.cat("epi_merge_negated", exampleStyles["trigger"], msg)
        elif task == "BB11": # "bb_features:build_for_nameless:wordnet"
            exampleStyles["trigger"] = Parameters.cat("bb_features", exampleStyles["trigger"], msg)
        elif task == "BB13T3": # "bb_features:build_for_nameless:wordnet"
            exampleStyles["trigger"] = Parameters.cat("bb_features", exampleStyles["trigger"], msg)
        elif task == "REL11":
            exampleStyles["trigger"] = Parameters.cat("rel_features", exampleStyles["trigger"], msg)
        elif task in ["BI11-FULL", "DDI11-FULL"]:
            # Assigned directly: any caller-supplied trigger style is replaced.
            exampleStyles["trigger"] = "names:build_for_nameless"
        elif task == "DDI13-FULL":
            exampleStyles["trigger"] = "names:build_for_nameless:ddi13_features:drugbank_features"
        elif task == "BB_EVENT_16-FULL":
            exampleStyles["trigger"] = Parameters.cat(
                "bb_spans:bb_features:ontobiotope_features:build_for_nameless:all_tokens:only_types=Bacteria,Habitat,Geographical",
                exampleStyles["trigger"], msg)
        elif task == "BB_EVENT_NER_16":
            # Equality, not 'in': 'task in "BB_EVENT_NER_16"' is a substring
            # test and would also match ids such as "BB" or "NER".
            exampleStyles["trigger"] = Parameters.cat(
                "bb_spans:bb_features:ontobiotope_features:build_for_nameless:all_tokens",
                exampleStyles["trigger"], msg)

        #######################################################################
        # Classifier parameters
        #######################################################################
        # (a dedicated DDI13 parameter grid existed here as disabled code)
        if task == "DDI11":
            classifierParameters["examples"] = Parameters.cat(
                "c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold",
                classifierParameters["examples"],
                "Classifier parameters for single-stage examples" + fullTaskId)
        elif task == "CO11":
            classifierParameters["edge"] = Parameters.cat(
                "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
                classifierParameters["edge"],
                "Classifier parameters for edges / " + fullTaskId)
            classifierParameters["trigger"] = Parameters.cat(
                "c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000",
                classifierParameters["trigger"],
                "Classifier parameters for triggers / " + fullTaskId)
            classifierParameters["recall"] = Parameters.cat(
                "0.8,0.9,0.95,1.0",
                classifierParameters["recall"],
                "Recall adjust / " + fullTaskId)
        elif task == "BB_EVENT_16":
            classifierParameters["examples"] = Parameters.cat(
                "c=10,20,30,40,50,60,70,80,100,110,115,120,125,130,140,150,200,500,1000,2000,3000,4000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
                classifierParameters["examples"],
                "Classifier parameters for edges / " + fullTaskId)
        elif task in ("BB_EVENT_16-FULL", "BB_EVENT_NER_16"):
            classifierParameters["edge"] = Parameters.cat(
                "c=10,20,50,80,100,110,115,120,125,130,140,150,200,500,1000",
                classifierParameters["edge"],
                "Classifier parameters for edges / " + fullTaskId)
        elif task == "SDB16":
            classifierParameters["examples"] = Parameters.cat(
                "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000,80000,100000,150000",
                classifierParameters["examples"],
                "Classifier parameters for single-stage examples / " + fullTaskId)

        # Training fold parameters ############################################
        if task.startswith("DDI13"):
            folds["devel"] = ["train1", "train2", "train3", "train4"]
            folds["train"] = ["train5", "train6", "train7", "train8", "train9"]

    return detector, bioNLPSTParams, preprocessorParams, folds
def getTaskSettings(task, detector, bioNLPSTParams, preprocessorParams,
                    inputFiles, exampleStyles, classifierParameters, folds,
                    corpusDir=None, useKerasDetector=False):
    """
    Fill in task-specific default training settings for a named corpus/task.

    When 'task' is None nothing is changed and the arguments are returned as
    they were given. Otherwise the function resolves the input files of the
    task, picks a detector when none was given, and combines task-specific
    defaults with the caller's values through Parameters.cat.

    The dictionaries 'inputFiles', 'exampleStyles', 'classifierParameters'
    and 'folds' are modified in place.

    @param task: task identifier, or None to skip all task-specific settings.
                 It is split into a task name and a subtask by getSubTask.
    @param detector: detector name, or None to let the task choose one.
    @param bioNLPSTParams: BioNLP Shared Task parameters given by the caller.
    @param preprocessorParams: preprocessor parameters given by the caller.
    @param inputFiles: dictionary with the keys "devel", "train" and "test".
                       A None value is replaced with the task's default file,
                       the value "skip" disables the dataset.
    @param exampleStyles: dictionary of example style parameters
                          (keys "examples", "edge" and "trigger" are used here).
    @param classifierParameters: dictionary of classifier parameters
                          (keys "examples", "edge", "trigger", "recall").
    @param folds: dictionary receiving the "devel" and "train" fold names
                  for the DDI13 subtasks.
    @param corpusDir: directory of the corpus files, Settings.CORPUS_DIR
                      when None.
    @param useKerasDetector: when true, the example style defaults of this
                      function are not applied.
    @return: the tuple (detector, bioNLPSTParams, preprocessorParams, folds)
    @raise AssertionError: if no training set input file is available.
    """
    if task is None:
        return detector, bioNLPSTParams, preprocessorParams, folds

    print >> sys.stderr, "*** Defining training settings for task", task, "***"
    fullTaskId = task
    task, subTask = getSubTask(task)
    if corpusDir is None:
        corpusDir = Settings.CORPUS_DIR
    print >> sys.stderr, "Loading corpus", task, "from", corpusDir
    for dataset in ["devel", "train", "test"]:
        if inputFiles[dataset] is None:
            if task.startswith("DDI13") and task != "DDI13":
                # The DDI13 subtasks share one training file (devel and train
                # are later separated with the folds defined at the end)
                if dataset in ["devel", "train"]:
                    inputFiles[dataset] = os.path.join(corpusDir, "DDI13-train.xml")
                elif dataset == "test":
                    if task.endswith("T91"):
                        inputFiles[dataset] = os.path.join(corpusDir, "DDI13-test-task9.1.xml")
                    elif task.endswith("T92") or task.endswith("FULL"):
                        inputFiles[dataset] = os.path.join(corpusDir, "DDI13-test-task9.2.xml")
            elif task == "ID11" and dataset == "train":
                inputFiles[dataset] = Catenate.catenate(
                    [os.path.join(corpusDir, "ID11-train.xml"),
                     os.path.join(corpusDir, "GE11-devel.xml"),
                     os.path.join(corpusDir, "GE11-train.xml")],
                    "training/ID11-train-and-GE11-devel-and-train.xml.gz", fast=True)
            else:
                inputFiles[dataset] = os.path.join(corpusDir, task.replace("-FULL", "") + "-"+dataset+".xml")
        if inputFiles[dataset] == "skip":
            inputFiles[dataset] = None
        if inputFiles[dataset] is not None and not os.path.exists(inputFiles[dataset]):
            # Try the path relative to the default corpus directory
            fullPath = os.path.join(Settings.CORPUS_DIR, inputFiles[dataset])
            if os.path.exists(fullPath):
                inputFiles[dataset] = fullPath
            else:
                # Report the missing path before discarding it (the message
                # used to be printed after the reset and always showed None)
                print >> sys.stderr, "Input file", inputFiles[dataset], "for set '" + dataset + "' does not exist, skipping."
                inputFiles[dataset] = None
    # At least the training set must exist. An explicit raise is used so that
    # the check is not removed when Python is run with -O.
    if inputFiles["train"] is None:
        raise AssertionError("No training set input file available for task " + str(fullTaskId))

    # Example generation parameters
    if detector is None:
        if task == "CO11":
            detector = "Detectors.CODetector"
        elif task in ["BI11-FULL", "DDI11-FULL", "DDI13-FULL", "BB_EVENT_16-FULL"]:
            detector = "Detectors.EventDetector"
        elif task.startswith("DDI13"):
            if task.endswith("T91"):
                detector = "Detectors.EntityDetector"
            elif task.endswith("T92") or task == "DDI13":
                detector = "Detectors.EdgeDetector"

    #######################################################################
    # BioNLP Shared Task and preprocessing parameters
    #######################################################################
    if task == "BI11-FULL":
        # the shared task evaluator is not designed for predicted entities
        bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:scores", "BioNLP Shared Task / " + fullTaskId, ["default"])
    elif task == "REL11":
        bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:evaluate:scores:a2Tag=rel", "BioNLP Shared Task / " + fullTaskId, ["default"])
    elif task in ("BB_EVENT_16", "BB_EVENT_16-FULL", "BB_EVENT_NER_16", "SDB16"):
        bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert=zip", "BioNLP Shared Task / " + fullTaskId, ["default"])
    elif task not in ["DDI11", "DDI11-FULL", "DDI13T91", "DDI13T92", "DDI13-FULL", "DDI13", "CP17", "SEMEVAL10T8"]:
        bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:evaluate:scores", "BioNLP Shared Task / " + fullTaskId, ["default"])
    else:
        bioNLPSTParams = "skip"

    #######################################################################
    # Preprocessing parameters
    #######################################################################
    # NOTE(review): the value returned by Parameters.cat is not stored in
    # either branch, so 'preprocessorParams' is returned as it was passed in
    # unless Parameters.cat works through a side effect. Every other call in
    # this function assigns the result -- confirm whether that is intended.
    if task in ["BI11", "BI11-FULL", "BB11", "DDI11", "DDI11-FULL", "DDI13T91", "DDI13T92", "DDI13-FULL", "DDI13"]:
        Parameters.cat("intermediateFiles:omitSteps=NER,DIVIDE-SETS", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])
    else: # parse only sentences where BANNER found an entity
        Parameters.cat("intermediateFiles:omitSteps=DIVIDE-SETS:PARSE.requireEntities", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])

    #######################################################################
    # Example style parameters
    #######################################################################
    if not useKerasDetector:
        # Example style parameters for single-stage tasks #####################
        msg = "Single-stage example style / " + fullTaskId
        if task == "REN11":
            exampleStyles["examples"] = Parameters.cat("undirected:bacteria_renaming:maskTypeAsProtein=Gene", exampleStyles["examples"], msg)
        elif task == "DDI11":
            exampleStyles["examples"] = Parameters.cat("drugbank_features:ddi_mtmx:filter_shortest_path=conj_and", exampleStyles["examples"], msg)
        elif task.startswith("DDI13"):
            if task.endswith("T91"):
                exampleStyles["examples"] = Parameters.cat("names:build_for_nameless:ddi13_features:drugbank_features", exampleStyles["examples"], msg)
            elif task.endswith("T92") or task == "DDI13":
                exampleStyles["examples"] = Parameters.cat("keep_neg:drugbank_features:filter_shortest_path=conj_and", exampleStyles["examples"], msg)
        elif task == "BI11":
            exampleStyles["examples"] = Parameters.cat("bi_features", exampleStyles["examples"], msg)
        elif task == "BB_EVENT_16":
            exampleStyles["examples"] = Parameters.cat("keep_neg", exampleStyles["examples"], msg)
            #exampleStyles["examples"] = Parameters.cat("linear_features:keep_neg", exampleStyles["examples"], msg)
        elif task == "SDB16":
            exampleStyles["examples"] = Parameters.cat("sdb_merge:sdb_features", exampleStyles["examples"], msg)

        # Edge style ##########################################################
        msg = "Edge example style / " + fullTaskId
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            exampleStyles["edge"] = Parameters.cat("genia_features:genia_task1", exampleStyles["edge"], msg)
        elif task in ["GE09", "GE11", "GE13"]:
            exampleStyles["edge"] = Parameters.cat("genia_features", exampleStyles["edge"], msg)
        elif task == "REL11":
            exampleStyles["edge"] = Parameters.cat("rel_features", exampleStyles["edge"], msg)
        elif task == "DDI11-FULL":
            exampleStyles["edge"] = Parameters.cat("drugbank_features:filter_shortest_path=conj_and", exampleStyles["edge"], msg)
        elif task == "DDI13-FULL":
            exampleStyles["edge"] = Parameters.cat("keep_neg:drugbank_features:filter_shortest_path=conj_and", exampleStyles["edge"], msg)
        elif task == "CO11":
            exampleStyles["edge"] = Parameters.cat("co_features", exampleStyles["edge"], msg)
        elif task == "BI11-FULL":
            exampleStyles["edge"] = Parameters.cat("bi_features", exampleStyles["edge"], msg)

        # Trigger style #######################################################
        msg = "Trigger example style / " + fullTaskId
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            exampleStyles["trigger"] = Parameters.cat("genia_task1", exampleStyles["trigger"], msg)
        elif task in ["EPI11", "PC13"]:
            exampleStyles["trigger"] = Parameters.cat("epi_merge_negated", exampleStyles["trigger"], msg)
        elif task == "BB11": # "bb_features:build_for_nameless:wordnet"
            exampleStyles["trigger"] = Parameters.cat("bb_features", exampleStyles["trigger"], msg)
        elif task == "BB13T3": # "bb_features:build_for_nameless:wordnet"
            exampleStyles["trigger"] = Parameters.cat("bb_features", exampleStyles["trigger"], msg)
        elif task == "REL11":
            exampleStyles["trigger"] = Parameters.cat("rel_features", exampleStyles["trigger"], msg)
        elif task in ["BI11-FULL", "DDI11-FULL"]:
            # Assigned directly: replaces any trigger style given by the caller
            exampleStyles["trigger"] = "names:build_for_nameless"
        elif task == "DDI13-FULL":
            # Assigned directly: replaces any trigger style given by the caller
            exampleStyles["trigger"] = "names:build_for_nameless:ddi13_features:drugbank_features"
        elif task == "BB_EVENT_16-FULL":
            exampleStyles["trigger"] = Parameters.cat("bb_spans:bb_features:ontobiotope_features:build_for_nameless:all_tokens:only_types=Bacteria,Habitat,Geographical", exampleStyles["trigger"], msg)
        elif task == "BB_EVENT_NER_16":
            # Equality test: the former 'task in "BB_EVENT_NER_16"' was a
            # substring test that also matched fragments of the task name
            exampleStyles["trigger"] = Parameters.cat("bb_spans:bb_features:ontobiotope_features:build_for_nameless:all_tokens", exampleStyles["trigger"], msg)

    #######################################################################
    # Classifier parameters
    #######################################################################
    if task == "DDI11":
        classifierParameters["examples"] = Parameters.cat("c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold", classifierParameters["examples"], "Classifier parameters for single-stage examples" + fullTaskId)
    #elif task == "DDI13":
    #    classifierParameters["examples"] = Parameters.cat("c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold", classifierParameters["examples"], "Classifier parameters for single-stage examples" + fullTaskId)
    elif task == "CO11":
        classifierParameters["edge"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", classifierParameters["edge"], "Classifier parameters for edges / " + fullTaskId)
        classifierParameters["trigger"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000", classifierParameters["trigger"], "Classifier parameters for triggers / " + fullTaskId)
        classifierParameters["recall"] = Parameters.cat("0.8,0.9,0.95,1.0", classifierParameters["recall"], "Recall adjust / " + fullTaskId)
    elif task == "BB_EVENT_16":
        classifierParameters["examples"] = Parameters.cat("c=10,20,30,40,50,60,70,80,100,110,115,120,125,130,140,150,200,500,1000,2000,3000,4000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", classifierParameters["examples"], "Classifier parameters for edges / " + fullTaskId)
    elif task in ("BB_EVENT_16-FULL", "BB_EVENT_NER_16"):
        classifierParameters["edge"] = Parameters.cat("c=10,20,50,80,100,110,115,120,125,130,140,150,200,500,1000", classifierParameters["edge"], "Classifier parameters for edges / " + fullTaskId)
    elif task == "SDB16":
        classifierParameters["examples"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000,80000,100000,150000", classifierParameters["examples"], "Classifier parameters for single-stage examples / " + fullTaskId)

    # Training fold parameters ############################################
    if task.startswith("DDI13") and task != "DDI13":
        #folds["devel"]=["train1", "train2", "train3", "train4"]
        #folds["train"]=["train5", "train6", "train7", "train8", "train9"]
        folds["devel"] = ["train1", "train2", "train3"]
        folds["train"] = ["train4", "train5", "train6", "train7", "train8", "train9"]

    return detector, bioNLPSTParams, preprocessorParams, folds
def learnSettings(inputFiles, detector, classifierParameters, task, exampleStyles, useKerasDetector=False):
    """
    Choose the detector and fill in default classifier and Keras settings.

    When no detector is given, the training and development files are
    analyzed with a StructureAnalyzer (or a previous analysis is loaded from
    "training/structure.txt") and the detector is chosen from the targets it
    reports. Default classifier parameters for the chosen detector are then
    combined with the caller's values through Parameters.cat. When
    'useKerasDetector' is true, default Keras example styles are defined and
    the "override"/"override_all" styles given by the caller are applied.

    The dictionaries 'classifierParameters' and 'exampleStyles' are modified
    in place.

    @param inputFiles: dictionary with at least the keys "train" and "devel"
                       (only read when 'detector' is None).
    @param detector: detector name, or None to choose it from the analysis.
    @param classifierParameters: dictionary of classifier parameters.
    @param task: task identifier, only used for the Keras example styles.
    @param exampleStyles: dictionary of example styles, only used for the
                          Keras example styles.
    @param useKerasDetector: when true, a "Detectors.X" name is changed to
                          "Detectors.KerasX" and Keras example styles are set.
    @return: the name of the detector
    @raise AssertionError: if the analysis shows neither "ENTITY" nor
                           "INTERACTION" targets.
    """
    if detector is None:
        print >> sys.stderr, "*** Analyzing input files to determine training settings ***"
        structureAnalyzer = StructureAnalyzer()
        if not os.path.exists("training/structure.txt"):
            datasets = sorted(filter(None, [inputFiles["train"], inputFiles["devel"]]))
            print >> sys.stderr, "input files:", datasets
            structureAnalyzer.analyze(datasets)
            print >> sys.stderr, structureAnalyzer.toString()
            structureAnalyzer.save(None, "training/structure.txt")
        else:
            print >> sys.stderr, "Using existing analysis from training/structure.txt"
            structureAnalyzer.load(None, "training/structure.txt")

        # Choose detector
        targets = structureAnalyzer.targets
        if "ENTITY" in targets and "INTERACTION" in targets:
            detector = "Detectors.EventDetector"
        elif "ENTITY" in targets:
            detector = "Detectors.EntityDetector"
        elif "INTERACTION" in targets:
            detector = "Detectors.EdgeDetector"
        else:
            # Explicit raise, so the check also works when run with -O
            raise AssertionError(targets)

    if useKerasDetector and "Keras" not in detector:
        detector = detector.replace("Detectors.", "Detectors.Keras")
    print >> sys.stderr, "Using detector '" + str(detector) + "'"

    # Set default parameters. The comparisons below are exact, so they do not
    # match the "Detectors.Keras..." names produced above.
    cp = classifierParameters
    unmergingGrid = "c=1,10,100,500,1000,1500,2500,5000,10000,20000,50000,80000,100000"
    edgeGrid = "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000"
    entityGrid = "c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000"
    if detector == "Detectors.EventDetector":
        # Add common classifier parameters
        if cp["examples"] is not None:
            for key in ("unmerging", "modifiers", "edge", "trigger"):
                cp[key] = Parameters.cat(cp["examples"], cp[key])
        cp["unmerging"] = Parameters.cat(unmergingGrid, cp["unmerging"], "Classifier parameters for unmerging")
        cp["modifiers"] = Parameters.cat("c=5000,10000,20000,50000,100000", cp["modifiers"], "Classifier parameters for modifiers")
        cp["edge"] = Parameters.cat(edgeGrid, cp["edge"], "Classifier parameters for edges")
        cp["trigger"] = Parameters.cat(entityGrid, cp["trigger"], "Classifier parameters for triggers")
        cp["recall"] = Parameters.cat("0.5,0.6,0.65,0.7,0.85,1.0,1.1,1.2", cp["recall"], "Recall adjustment parameters")
    elif detector == "Detectors.EntityDetector":
        cp["examples"] = Parameters.cat(entityGrid, cp["examples"], "Classifier parameters for entities")
    elif detector == "Detectors.EdgeDetector":
        cp["examples"] = Parameters.cat(edgeGrid, cp["examples"], "Classifier parameters for edges")
    elif detector == "Detectors.UnmergingDetector":
        cp["examples"] = Parameters.cat(unmergingGrid, cp["examples"], "Classifier parameters for unmerging")

    #######################################################################
    # Keras example styles
    #######################################################################
    if useKerasDetector:
        task, subTask = getSubTask(task)
        # Collect the caller's override styles. A style containing "override"
        # or "override_all" is emptied here and its parameters are applied on
        # top of the default Keras style in the last loop of this function.
        overrideStyles = {"all":{}}
        for key in exampleStyles:
            overrideStyles[key] = {}
            params = Parameters.get(exampleStyles[key])
            if "override" in params:
                exampleStyles[key] = None
                overrideStyles[key] = params
                overrideStyles[key].pop("override")
            elif "override_all" in params:
                exampleStyles[key] = None
                overrideStyles["all"] = params
                overrideStyles["all"].pop("override_all")
        print >> sys.stderr, "Override styles:", overrideStyles
        if "EventDetector" in detector:
            if task == "EPI11":
                exampleStyles["trigger"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20:epi_merge_negated", exampleStyles["trigger"])
            else:
                exampleStyles["trigger"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20", exampleStyles["trigger"])
            if task in ["GE09", "GE11", "GE13"] and subTask == 1:
                exampleStyles["edge"] = Parameters.cat("keras:genia_task1:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["edge"])
            else:
                exampleStyles["edge"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["edge"])
            exampleStyles["unmerging"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["unmerging"])
            exampleStyles["modifiers"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:el=41:mods=20", exampleStyles["modifiers"])
        elif "EntityDetector" in detector:
            if task == "DDI13T91":
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20:names:build_for_nameless", exampleStyles["examples"])
            else:
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20", exampleStyles["examples"])
        elif "EdgeDetector" in detector:
            if "DDI" in task:
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=0:do=0.2:dense=800:ol=50:mods=20", exampleStyles["examples"])
            elif task == "CP17":
                # The backslashes are part of the value (same string as the
                # former "\:" spelling, without the invalid escape sequence)
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=0:do=0.2:ol=50:skip_labels=CPR\\:0,CPR\\:1,CPR\\:2,CPR\\:7,CPR\\:8,CPR\\:10:mods=20", exampleStyles["examples"])
            else:
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["examples"])
        print >> sys.stderr, "Keras initial example styles:", exampleStyles
        # Apply the overrides: first the style's own, then the "all" overrides
        for key in exampleStyles:
            if exampleStyles[key] is not None:
                style = Parameters.get(exampleStyles[key])
                style.update(overrideStyles[key])
                style.update(overrideStyles["all"])
                exampleStyles[key] = Parameters.toString(style)
            print >> sys.stderr, "Keras final example style for " + key + ": ", exampleStyles[key]
    return detector