def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """
    Set up the example builder and the feature builders selected by style.

    style -- list of style flags, or a comma-separated string of flags.
             Defaults to ["typed", "directed"]. Flags containing "negFrac"
             or "posPairGaz" carry a value after an underscore.
    length -- stored as self.pathLengths; must be None.
    types -- stored as self.types; defaults to a new empty list.
    featureSet, classSet -- id sets; new IdSet objects are created when
             these are None.
    """
    # None sentinels replace the former mutable list defaults, which were
    # shared by every instance created without explicit arguments.
    if style is None:
        style = ["typed", "directed"]
    elif isinstance(style, basestring):
        # Always split string input. The old code called style.find() on
        # any input (AttributeError for lists, including the default) and
        # left a comma-free string unsplit, turning the membership tests
        # below into substring searches.
        style = style.split(",")
    if types is None:
        types = []
    if featureSet is None:
        featureSet = IdSet()
    if classSet is None:
        classSet = IdSet(1)
    # Resolve the id outside the assert: the lookup is a method call that
    # may register "neg" (TODO confirm against IdSet), and anything inside
    # an assert is skipped entirely under ``python -O``.
    negId = classSet.getId("neg")
    assert negId == 1

    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    self.styles = style

    self.negFrac = None
    self.posPairGaz = POSPairGazetteer()
    for s in style:
        if "negFrac" in s:
            # The fraction is the text after the last underscore.
            self.negFrac = float(s.split("_")[-1])
            print >> sys.stderr, "Downsampling negatives to", self.negFrac
            self.negRand = random.Random(15)  # fixed seed
        elif "posPairGaz" in s:
            # Everything after the first underscore is passed as loadFrom.
            self.posPairGaz = POSPairGazetteer(loadFrom=s.split("_", 1)[-1])

    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    if "graph_kernel" in self.styles:
        from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
        self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
    if "noAnnType" in self.styles:
        self.multiEdgeFeatureBuilder.noAnnType = True
    if "noMasking" in self.styles:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if "maxFeatures" in self.styles:
        self.multiEdgeFeatureBuilder.maximum = True
    self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    if "ontology" in self.styles:
        self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
    if "nodalida" in self.styles:
        self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
    #IF LOCAL
    if "bioinfer_limits" in self.styles:
        self.bioinferOntologies = OntologyUtils.getBioInferTempOntology()
        #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
    #ENDIF
    self.pathLengths = length
    assert self.pathLengths is None
    self.types = types
    if "random" in self.styles:
        from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
        self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
    """
    Set up the builder: id sets, speculation word lists, optional gazetteer
    and style parameters.

    style -- passed to self.getParameters together with the defaults
             {"classification": "multiclass", "speculation_words": True}.
    classSet, featureSet -- id sets; new IdSet objects are created when
             these are None.
    gazetteerFileName -- if given, a gazetteer is loaded from this file;
             otherwise self.gazetteer is None.
    """
    # The module-level speculationWords is only read here, so no ``global``
    # declaration is needed.
    if classSet is None:
        classSet = IdSet(1)
    # Resolve the id outside the assert: the lookup is a method call that
    # may register "neg" (TODO confirm against IdSet), and anything inside
    # an assert is skipped entirely under ``python -O``.
    negId = classSet.getId("neg")
    assert negId == 1
    if featureSet is None:
        featureSet = IdSet()

    self.specWords, self.specWordStems = readWords(speculationWords)

    ExampleBuilder.__init__(self, classSet, featureSet)
    #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
    if gazetteerFileName is not None:
        self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
        print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
    else:
        self.gazetteer = None
    # Second dict: default values; third dict: the values allowed for
    # "classification".
    self.styles = self.getParameters(
        style,
        {"classification": "multiclass", "speculation_words": True},
        {"classification": ("multiclass", "speculation", "negation")})
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """
    Set up the builder with its default style parameters and feature
    builders.

    style -- parsed by self.getParameters into self.styles.
    length -- stored as self.pathLengths; must be None.
    types -- stored as self.types; defaults to a new empty list.
    featureSet, classSet -- id sets; new IdSet objects are created when
             these are None.
    """
    # reset style regardless of input
    #style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures"
    if featureSet is None:
        featureSet = IdSet()
    if classSet is None:
        classSet = IdSet(1)
    # A None sentinel replaces the former ``types=[]`` default, which was
    # one list object shared by every instance.
    if types is None:
        types = []
    # Resolve the id outside the assert: the lookup is a method call that
    # may register "neg" (TODO confirm against IdSet), and anything inside
    # an assert is skipped entirely under ``python -O``.
    negId = classSet.getId("neg")
    assert negId == 1

    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)

    # Every flag below defaults to None; the two keep_intersentence
    # options have explicit boolean defaults.
    defaultNone = ["binary", "trigger_features", "typed", "directed", "no_linear", "entities", "genia_limits",
                   "noAnnType", "noMasking", "maxFeatures", "no_merge", "disable_entity_features",
                   "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features"]
    defaultParameters = dict.fromkeys(defaultNone)
    defaultParameters["keep_intersentence"] = False
    defaultParameters["keep_intersentence_gold"] = True
    # NOTE(review): this assignment is overwritten by the next statement;
    # kept in case getParameters reads self.styles -- TODO confirm.
    self.styles = self._setDefaultParameters(defaultParameters)
    self.styles = self.getParameters(style)

    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"]
    self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles["noMasking"]
    self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"]
    #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    self.pathLengths = length
    assert self.pathLengths is None
    self.types = types

    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    self.triggerFeatureBuilder.useNonNameEntities = True
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """
    Set up the builder and its MultiEdgeFeatureBuilder.

    style -- collection of style flags; defaults to
             ["typed", "directed", "headsOnly"].
    length -- stored as self.pathLengths; must be None.
    types -- stored as self.types; defaults to a new empty list.
    featureSet, classSet -- id sets; new IdSet objects are created when
             these are None.
    """
    # None sentinels replace the former mutable list defaults, which were
    # shared by every instance created without explicit arguments.
    if style is None:
        style = ["typed", "directed", "headsOnly"]
    if types is None:
        types = []
    if featureSet is None:
        featureSet = IdSet()
    if classSet is None:
        classSet = IdSet(1)
    # Resolve the id outside the assert: the lookup is a method call that
    # may register "neg" (TODO confirm against IdSet), and anything inside
    # an assert is skipped entirely under ``python -O``.
    negId = classSet.getId("neg")
    assert negId == 1

    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    self.styles = style

    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    if "noAnnType" in self.styles:
        self.multiEdgeFeatureBuilder.noAnnType = True
    if "noMasking" in self.styles:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if "maxFeatures" in self.styles:
        self.multiEdgeFeatureBuilder.maximum = True
    #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    #if "ontology" in self.styles:
    #    self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
    self.pathLengths = length
    assert self.pathLengths is None
    self.types = types
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
    """
    Set up the builder: id sets, optional gazetteer, style parameters,
    skiplist and the feature builders enabled by the style.

    style -- parsed by self.getParameters into self.styles.
    classSet, featureSet -- id sets; new IdSet objects are created when
             these are None.
    gazetteerFileName -- if given, a gazetteer is loaded from this file.
    skiplist -- optional path to a text file; each stripped line is added
             to self.skiplist.
    """
    if classSet is None:
        classSet = IdSet(1)
    if featureSet is None:
        featureSet = IdSet()

    ExampleBuilder.__init__(self, classSet, featureSet)
    # Resolve the id outside the assert: the lookup is a method call that
    # may register "neg" (TODO confirm against IdSet), and anything inside
    # an assert is skipped entirely under ``python -O``.
    negId = classSet.getId("neg")
    assert negId == 1
    #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
    if gazetteerFileName is not None:
        self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
        print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
    else:
        print >> sys.stderr, "No gazetteer loaded"
        self.gazetteer = None
    self._setDefaultParameters(["rel_features", "wordnet", "bb_features", "giuliano",
                                "epi_merge_negated", "limit_merged_types", "genia_task1",
                                "build_for_nameless", "pos_only", "all_tokens",
                                "names", "pos_pairs", "linear_ngrams", "phospho"])
    self.styles = self.getParameters(style)
#    if "selftrain_group" in self.styles:
#        self.selfTrainGroups = set()
#        if "selftrain_group-1" in self.styles:
#            self.selfTrainGroups.add("-1")
#        if "selftrain_group0" in self.styles:
#            self.selfTrainGroups.add("0")
#        if "selftrain_group1" in self.styles:
#            self.selfTrainGroups.add("1")
#        if "selftrain_group2" in self.styles:
#            self.selfTrainGroups.add("2")
#        if "selftrain_group3" in self.styles:
#            self.selfTrainGroups.add("3")
#        print >> sys.stderr, "Self-train-groups:", self.selfTrainGroups

    self.skiplist = set()
    if skiplist is not None:
        # ``with`` closes the file even if reading fails part-way.
        with open(skiplist, "rt") as f:
            for line in f:
                self.skiplist.add(line.strip())

    # The feature builders below are created only when their style flag
    # is set.
    if self.styles["rel_features"]:
        self.relFeatureBuilder = RELFeatureBuilder(featureSet)
    if self.styles["wordnet"]:
        self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
    if self.styles["bb_features"]:
        self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens()
        #self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens(PhraseTriggerExampleBuilder.getBacteriaNames())
    if self.styles["giuliano"]:
        self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
def run(EvaluatorClass, inputCorpusFile, goldCorpusFile, parse, tokenization=None, target="both", entityMatchFunction=compareEntitiesSimple, removeIntersentenceInteractions=False, errorMatrix=False, verbose=False):
    """
    Load a predicted corpus and (optionally) a gold corpus and hand them
    to processCorpora for evaluation.

    The class sets given to the evaluator depend on EvaluatorClass.type:
    "binary" uses locked sets mapping "True"/"False" to 1/-1 with -1 as
    the negative class id; "multiclass" uses unlocked sets with "neg"
    mapped to 1, which is then the negative class id. Any other type
    exits the program.

    Returns whatever processCorpora returns.
    """
    print >> sys.stderr, "##### EvaluateInteractionXML #####"
    print >> sys.stderr, "Comparing input", inputCorpusFile, "to gold", goldCorpusFile

    # Class sets are used to convert the types to ids that the evaluator can use
    evaluatorType = EvaluatorClass.type
    if evaluatorType == "binary":
        idDict, locked, negativeClassId = {"True": 1, "False": -1}, True, -1
    elif evaluatorType == "multiclass":
        idDict, locked, negativeClassId = {"neg": 1}, False, 1
    else:
        sys.exit("Unknown evaluator type")
    classSets = {}
    for elementName in ("entity", "interaction"):
        # Each set gets its own dictionary object.
        classSets[elementName] = IdSet(idDict=dict(idDict), locked=locked)

    # Load corpus and make sentence graphs (gold first, then predicted)
    goldCorpusElements = None
    if goldCorpusFile != None:
        goldCorpusElements = SentenceGraph.loadCorpus(
            goldCorpusFile, parse, tokenization, False, removeIntersentenceInteractions)
    predictedCorpusElements = SentenceGraph.loadCorpus(
        inputCorpusFile, parse, tokenization, False, removeIntersentenceInteractions)

    # Compare the corpora and print results on screen
    return processCorpora(EvaluatorClass, predictedCorpusElements, goldCorpusElements,
                          target, classSets, negativeClassId, entityMatchFunction,
                          errorMatrix=errorMatrix, verbose=verbose)
def __init__(self, style=None, classSet=None, featureSet=None):
    """
    Set up the builder with its id sets, style and timers.

    style -- stored unchanged as self.styles.
    classSet, featureSet -- id sets; new IdSet objects are created when
             these are None.
    """
    if classSet is None:
        classSet = IdSet(1)
    # Resolve the id outside the assert: the lookup is a method call that
    # may register "neg" (TODO confirm against IdSet), and anything inside
    # an assert is skipped entirely under ``python -O``.
    negId = classSet.getId("neg")
    assert negId == 1
    if featureSet is None:
        featureSet = IdSet()

    ExampleBuilder.__init__(self, classSet, featureSet)
    self.styles = style
    # One timer per phase, each constructed with False.
    self.timerBuildExamples = Timer(False)
    self.timerCrawl = Timer(False)
    self.timerCrawlPrecalc = Timer(False)
    self.timerMatrix = Timer(False)
    self.timerMatrixPrecalc = Timer(False)
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
    """
    Set up the builder with its id sets, style and trigger feature builder.

    style -- stored unchanged as self.styles.
    classSet, featureSet -- id sets; new IdSet objects are created when
             these are None.
    gazetteerFileName -- accepted but not used by this constructor.
    """
    if classSet is None:
        classSet = IdSet(1)
    # Resolve the id outside the assert: the lookup is a method call that
    # may register "neg" (TODO confirm against IdSet), and anything inside
    # an assert is skipped entirely under ``python -O``.
    negId = classSet.getId("neg")
    assert negId == 1
    if featureSet is None:
        featureSet = IdSet()

    ExampleBuilder.__init__(self, classSet, featureSet)
    self.styles = style
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    self.triggerFeatureBuilder.useNonNameEntities = False
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
    """
    Set up the builder with its id sets, optional gazetteer and style.

    style -- stored unchanged as self.styles.
    classSet, featureSet -- id sets; new IdSet objects are created when
             these are None.
    gazetteerFileName -- if given, a gazetteer is loaded from this file;
             otherwise self.gazetteer is None.
    """
    if classSet is None:
        classSet = IdSet(1)
    # Resolve the id outside the assert: the lookup is a method call that
    # may register "neg" (TODO confirm against IdSet), and anything inside
    # an assert is skipped entirely under ``python -O``.
    negId = classSet.getId("neg")
    assert negId == 1
    if featureSet is None:
        featureSet = IdSet()

    ExampleBuilder.__init__(self, classSet, featureSet)
    if gazetteerFileName is not None:
        self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
        print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
    else:
        print >> sys.stderr, "No gazetteer loaded"
        self.gazetteer = None
    self.styles = style
def __init__(self, examples=None, predictions=None, classSet=None):
    """
    Evaluate predictions for a set of examples.

    examples, predictions, classSet -- either already loaded objects or
        file names (str or unicode), in which case they are loaded here.

    When predictions are given, self._calculateExamples is run if the
    first example carries extra info (example[3]), and an
    AveragingMultiClassEvaluator is always built as self.internal and its
    concise summary printed to stderr.
    """
    # isinstance with types.StringTypes also accepts unicode file names
    # and str subclasses, which the exact type() comparison rejected.
    if isinstance(classSet, types.StringTypes):  # class names are in file
        classSet = IdSet(filename=classSet)
    if isinstance(predictions, types.StringTypes):  # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if isinstance(examples, types.StringTypes):  # examples are in file
        examples = ExampleUtils.readExamples(examples, False)

    self.keep = set(["CPR:3", "CPR:4", "CPR:5", "CPR:6", "CPR:9"])

    self.classSet = classSet
    self.results = None
    self.internal = None
    if predictions is not None:
        # Only the first example is inspected: the loop always breaks
        # after one iteration.
        # NOTE(review): if examples is a one-shot iterator this consumes
        # its first item before the calls below -- confirm against
        # ExampleUtils.readExamples.
        for example in examples:
            if example[3] is not None:
                print >> sys.stderr, "ChemProt Evaluator:"
                self._calculateExamples(examples, predictions)
            else:
                print >> sys.stderr, "No example extra info, skipping ChemProt evaluation"
            break
        self.internal = AveragingMultiClassEvaluator(examples, predictions, classSet)
        print >> sys.stderr, "AveragingMultiClassEvaluator:"
        print >> sys.stderr, self.internal.toStringConcise()
def __init__(self, style):
    """
    Set up the builder with an EdgeFeatureBuilder and the given style.

    style -- collection (or string) of style flags, stored as self.style.
             Unless it contains "binary", self.classSet is replaced by a
             new IdSet(1) whose "neg" id must be 1.
    """
    ExampleBuilder.__init__(self)
    self.featureBuilder = EdgeFeatureBuilder(self.featureSet)
    self.style = style
    if "binary" not in style:
        self.classSet = IdSet(1)
        # Resolve the id outside the assert: the lookup is a method call
        # that may register "neg" (TODO confirm against IdSet), and
        # anything inside an assert is skipped under ``python -O``.
        negId = self.classSet.getId("neg")
        assert negId == 1
def __init__(self, examples, predictions=None, classSet=None):
    """
    Prepare per-class evaluation data and, if predictions are given,
    run the evaluation.

    examples, predictions, classSet -- either already loaded objects or
        file names (str or unicode), in which case they are loaded here.
    """
    # isinstance with types.StringTypes also accepts unicode file names
    # and str subclasses, which the exact type() comparison rejected.
    if isinstance(classSet, types.StringTypes):  # class names are in file
        classSet = IdSet(filename=classSet)
    if isinstance(predictions, types.StringTypes):  # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if isinstance(examples, types.StringTypes):  # examples are in file
        examples = ExampleUtils.readExamples(examples, False)

    self.classSet = classSet
    # define class ids in alphabetical order of the class names
    if classSet is not None:
        classNames = sorted(classSet.Ids.keys())
    else:
        classNames = []
    # make an ordered list of class ids
    self.classes = [classSet.getId(className) for className in classNames]
    # create data structures for per-class evaluation
    self.dataByClass = {}
    for cls in self.classes:
        self.dataByClass[cls] = EvaluationData()
    # hack for unnamed classes
    if len(self.dataByClass) == 0:
        self.dataByClass[1] = EvaluationData()
        self.dataByClass[2] = EvaluationData()

    #self.untypedUndirected = None
    self.untypedCurrentMajorId = None
    self.untypedPredictionQueue = []
    self.untypedUndirected = EvaluationData()
    #self.AUC = None
    if predictions is not None:
        self._calculate(examples, predictions)
def __init__(self, examples, predictions=None, classSet=None):
    """
    Evaluate predictions with the GENIA shared task evaluation tool.

    The examples and predictions are written to interaction XML using the
    corpus configured in the SharedTaskEvaluator class attributes,
    converted to GENIA format, and evaluated; the outcome is stored in
    self.results.

    examples, predictions, classSet -- either already loaded objects or
        file names (str or unicode), in which case they are loaded here.
    """
    # isinstance with types.StringTypes also accepts unicode file names
    # and str subclasses, which the exact type() comparison rejected.
    if isinstance(classSet, types.StringTypes):  # class names are in file
        classSet = IdSet(filename=classSet)
    if isinstance(predictions, types.StringTypes):  # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if isinstance(examples, types.StringTypes):  # examples are in file
        examples = ExampleUtils.readExamples(examples, False)

    # The loaded corpus is stored on the class, not on the instance.
    SharedTaskEvaluator.corpusElements = Core.SentenceGraph.loadCorpus(
        SharedTaskEvaluator.corpusFilename, SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
    # Build interaction xml
    xml = BioTextExampleWriter.write(
        examples, predictions, SharedTaskEvaluator.corpusElements, None,
        SharedTaskEvaluator.ids + ".class_names",
        SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
    #xml = ExampleUtils.writeToInteractionXML(examples, predictions, SharedTaskEvaluator.corpusElements, None, "genia-direct-event-ids.class_names", SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
    # Convert to GENIA format
    gifxmlToGenia(xml, SharedTaskEvaluator.geniaDir, task=SharedTaskEvaluator.task, verbose=False)
    # Use GENIA evaluation tool
    self.results = evaluateSharedTask(
        SharedTaskEvaluator.geniaDir, task=SharedTaskEvaluator.task,
        evaluations=["approximate"], verbose=False)
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
    """
    Set up the builder: id sets, optional gazetteer, skiplist and the
    feature builders for a fixed style.

    style -- accepted but not used: self.styles is always set to the
             fixed list below.
    classSet, featureSet -- id sets; new IdSet objects are created when
             these are None.
    gazetteerFileName -- if given, a gazetteer is loaded from this file.
    skiplist -- optional path to a text file; each stripped line is added
             to self.skiplist.
    """
    if classSet is None:
        classSet = IdSet(1)
    # Resolve the id outside the assert: the lookup is a method call that
    # may register "neg" (TODO confirm against IdSet), and anything inside
    # an assert is skipped entirely under ``python -O``.
    negId = classSet.getId("neg")
    assert negId == 1
    if featureSet is None:
        featureSet = IdSet()

    ExampleBuilder.__init__(self, classSet, featureSet)
    #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
    if gazetteerFileName is not None:
        self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
        print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
    else:
        print >> sys.stderr, "No gazetteer loaded"
        self.gazetteer = None

    self.skiplist = set()
    if skiplist is not None:
        # ``with`` closes the file even if reading fails part-way.
        with open(skiplist, "rt") as f:
            for line in f:
                self.skiplist.add(line.strip())

    # The style is hard-coded. The former ``self.styles = style`` was
    # immediately overwritten by this assignment and has been dropped.
    self.styles = ["trigger_features", "typed", "directed", "no_linear",
                   "entities", "genia_limits", "noMasking", "maxFeatures"]
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    if "graph_kernel" in self.styles:
        from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
        self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
    if "noAnnType" in self.styles:
        self.multiEdgeFeatureBuilder.noAnnType = True
    if "noMasking" in self.styles:
        self.multiEdgeFeatureBuilder.maskNamedEntities = False
    if "maxFeatures" in self.styles:
        self.multiEdgeFeatureBuilder.maximum = True

    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
def readARFF(filename):
    """
    Read a two-class ARFF file and return its data rows as examples.

    Each returned example is a list [id, classId, features, {}] where id
    is "BreastCancer.d<N>.s0.x0" (N = running example index), classId is
    1 for the first declared class value and -1 for the second, and
    features maps the 1-based column index to the column's float value.

    Lines that are empty or start with "%" are skipped. "@attribute"
    lines must come before "@data"; data rows must come after it.
    """
    featureSet = IdSet(1)
    classSet = IdSet(0)
    # Read everything up front (the progress counter needs the line
    # count) and close the file right away; it used to be left open.
    with open(filename, "rt") as f:
        lines = f.readlines()
    inData = False
    counter = ProgressCounter(len(lines), "ARFFLine")
    examples = []
    for line in lines:
        counter.update(string="Processing line " + str(counter.current + 1) + ": ")
        line = line.strip()
        if len(line) == 0 or line[0] == "%":
            continue
        if line[0] == "@":
            category = line.split()[0].lower()
            if category == "@attribute":
                # Split into at most three fields so that a value list
                # containing spaces, e.g. "{yes, no}", stays in one piece.
                _, name, valueSpec = line.split(None, 2)
                assert not inData
                if name.lower() == "class":
                    name = name.lower()
                    # Drop the surrounding braces, then split the values.
                    classNames = valueSpec[1:-1].split(",")
                    assert len(classNames) == 2
                    classSet.defineId(classNames[0].strip(), 1)
                    classSet.defineId(classNames[1].strip(), -1)
                # Called for its effect of registering the attribute name.
                featureSet.getId(name)
            elif category == "@relation":
                assert not inData
            elif category == "@data":
                inData = True
        else:
            assert inData
            features = {}
            # Reset per row so a row can never inherit the previous
            # row's class id.
            classId = None
            for count, column in enumerate(line.split(","), 1):
                if featureSet.getName(count) != "class":
                    features[count] = float(column)
                else:
                    # Strip to match the stripped names defined above.
                    classId = classSet.getId(column.strip(), False)
                    assert classId is not None
            # Every data row must have produced a class id.
            assert classId is not None
            exampleId = "BreastCancer.d" + str(len(examples)) + ".s0.x0"
            examples.append([exampleId, classId, features, {}])
    return examples
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
    """
    Set up the builder with its id sets, style parameters and trigger
    feature builder.

    style -- parsed by self.getParameters into self.styles; the only
             default parameter declared here is "co_limits".
    classSet, featureSet -- id sets; new IdSet objects are created when
             these are None.
    gazetteerFileName -- accepted but not used by this constructor.
    """
    if classSet is None:
        classSet = IdSet(1)
    # Resolve the id outside the assert: the lookup is a method call that
    # may register "neg" (TODO confirm against IdSet), and anything inside
    # an assert is skipped entirely under ``python -O``.
    negId = classSet.getId("neg")
    assert negId == 1
    if featureSet is None:
        featureSet = IdSet()

    ExampleBuilder.__init__(self, classSet, featureSet)
    self._setDefaultParameters(["co_limits"])
    self.styles = self.getParameters(style)
    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    self.triggerFeatureBuilder.useNonNameEntities = False
    self.phraseTypeCounts = {}
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
    """
    Set up the builder with its id sets, optional gazetteer, style and
    the list of excluded POS tags.

    style -- stored unchanged as self.styles.
    classSet, featureSet -- id sets; new IdSet objects are created when
             these are None.
    gazetteerFileName -- if given, a gazetteer is loaded from this file;
             otherwise self.gazetteer is None.
    """
    if classSet is None:
        classSet = IdSet(1)
    # Resolve the id outside the assert: the lookup is a method call that
    # may register "neg" (TODO confirm against IdSet), and anything inside
    # an assert is skipped entirely under ``python -O``.
    negId = classSet.getId("neg")
    assert negId == 1
    if featureSet is None:
        featureSet = IdSet()

    ExampleBuilder.__init__(self, classSet, featureSet)
    #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
    if gazetteerFileName is not None:
        self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
        print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
    else:
        print >> sys.stderr, "No gazetteer loaded"
        self.gazetteer = None
    self.styles = style

    self.excludedPOS = ["", "(", ")", ",", ".", "CC", "EX", "FW", "LS", "MD", "PDT",
                        "POS", "PRP", "PRP$", "RBR", "RBS", "RP", "WDT", "WP", "WP$", "``"]
def getIdSets(self, classIds=None, featureIds=None, allowNewIds=True):
    """
    Load predefined class and feature id sets from files, when available.

    classIds, featureIds -- paths to id files, or None.
    allowNewIds -- passed to the IdSet constructor of each loaded set.

    Returns a (classSet, featureSet) tuple; an element is None when its
    path is None or does not exist. A message saying which case applied
    is printed to stderr for each of the two sets.
    """
    # Class ids
    classSet = None
    haveClassFile = classIds != None and os.path.exists(classIds)
    if not haveClassFile:
        print >> sys.stderr, "No predefined class names"
    else:
        print >> sys.stderr, "Using predefined class names from", classIds
        classSet = IdSet(allowNewIds=allowNewIds)
        classSet.load(classIds)

    # Feature ids
    featureSet = None
    haveFeatureFile = featureIds != None and os.path.exists(featureIds)
    if not haveFeatureFile:
        print >> sys.stderr, "No predefined feature names"
    else:
        print >> sys.stderr, "Using predefined feature names from", featureIds
        featureSet = IdSet(allowNewIds=allowNewIds)
        featureSet.load(featureIds)

    return classSet, featureSet
def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
    """
    Set up the builder: id sets, style parameters, optional word vector
    indices and the empty matrix/token containers.

    style -- parsed by self.getParameters into self.styles; a truthy
             "cutoff" value is converted to int.
    classSet, featureSet -- id sets; new IdSet(0) objects are created
             when these are None.
    gazetteerFileName, skiplist -- accepted but not used here.
    """
    if classSet == None:
        classSet = IdSet(0)
    if featureSet == None:
        featureSet = IdSet(0)
    ExampleBuilder.__init__(self, classSet, featureSet)

    # Aliases for the two id sets held by the base class.
    self.featureIds = self.featureSet
    self.labelIds = self.classSet

    parameterNames = ["directed", "undirected", "cutoff", "annotated_only", "all_positive",
                      "wv", "epochs", "html", "autoencode", "lr", "patience"]
    self._setDefaultParameters(parameterNames)
    self.styles = self.getParameters(style)
    cutoff = self.styles["cutoff"]
    if cutoff:
        self.styles["cutoff"] = int(cutoff)

    self.wvIndices = None
    self.embeddingMatrices = None
    wvName = self.styles.get("wv")
    if wvName != None:
        # Try the name as given first, then under Settings.DATAPATH/wv.
        indexPath = wvName + "-indices.json.gz"
        if not os.path.exists(indexPath):
            indexPath = os.path.join(Settings.DATAPATH, "wv", indexPath)
        print >> sys.stderr, "Loading word vector indices from", indexPath
        with gzip.open(indexPath, "rt") as indexFile:
            self.wvIndices = json.load(indexFile)["indices"]
        self.embeddingMatrices = []

    self.dimMatrix = 32
    self.rangeMatrix = range(self.dimMatrix)
    self.featureMatrices = []
    self.labelMatrices = []
    self.tokenLists = []
def addExamples(exampleFile, predictionFile, classFile, matrix):
    """
    Add the predictions for one example file to a class confusion matrix.

    exampleFile -- examples, read with ExampleUtils.readExamples.
    predictionFile -- text file with one prediction per example; the
        first whitespace-separated field of each line is the predicted
        class id.
    classFile -- file the class IdSet is loaded from.
    matrix -- nested mapping indexed as matrix[className][className];
        updated in place.

    Class names joined with "---" are split and every predicted/gold
    pair is counted.
    """
    classSet = IdSet(filename=classFile)
    # ``with`` closes the prediction file even if parsing raises.
    with open(predictionFile, "rt") as f:
        for example in ExampleUtils.readExamples(exampleFile, False):
            pred = int(f.readline().split()[0])
            predClasses = classSet.getName(pred)
            goldClasses = classSet.getName(example[1])
            for predClass in predClasses.split("---"):
                for goldClass in goldClasses.split("---"):
                    # Bare lookup kept from the original code.
                    # NOTE(review): presumably it makes a
                    # defaultdict-style matrix create the
                    # [pred][gold] cell -- confirm against the caller.
                    matrix[predClass][goldClass]
                    matrix[goldClass][predClass] += 1
def __init__(self, style=None, featureSet=None, classSet=None):
    """
    Set up the builder with its id sets, style and feature builders.

    style -- collection of style flags, stored as self.styles; defaults
             to ["typed", "directed", "headsOnly"].
    featureSet, classSet -- id sets; new IdSet objects are created when
             these are None.
    """
    # A None sentinel replaces the former mutable list default, which was
    # shared by every instance created without an explicit style.
    if style is None:
        style = ["typed", "directed", "headsOnly"]
    if featureSet is None:
        featureSet = IdSet()
    if classSet is None:
        classSet = IdSet(1)
    # Resolve the id outside the assert: the lookup is a method call that
    # may register "neg" (TODO confirm against IdSet), and anything inside
    # an assert is skipped entirely under ``python -O``.
    negId = classSet.getId("neg")
    assert negId == 1

    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    self.styles = style

    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    #if "noAnnType" in self.styles:
    #    self.multiEdgeFeatureBuilder.noAnnType = True
    #if "noMasking" in self.styles:
    #    self.multiEdgeFeatureBuilder.maskNamedEntities = False
    #if "maxFeatures" in self.styles:
    #    self.multiEdgeFeatureBuilder.maximum = True

    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    self.triggerFeatureBuilder.useNonNameEntities = False
def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
    """
    Set up the builder with its id sets, style, feature builder and
    empty count tables.

    style -- collection of style flags, stored as self.styles; defaults
             to ["typed", "directed", "headsOnly"].
    length, types -- accepted but not used by this constructor.
    featureSet, classSet -- id sets; new IdSet objects are created when
             these are None.
    """
    # None sentinels replace the former mutable list defaults, which were
    # shared by every instance created without explicit arguments.
    if style is None:
        style = ["typed", "directed", "headsOnly"]
    if featureSet is None:
        featureSet = IdSet()
    if classSet is None:
        classSet = IdSet(1)
    # Resolve the id outside the assert: the lookup is a method call that
    # may register "neg" (TODO confirm against IdSet), and anything inside
    # an assert is skipped entirely under ``python -O``.
    negId = classSet.getId("neg")
    assert negId == 1

    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    self.styles = style

    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    self.counts = {}
    self.countsPerType = {}
    self.untypedCounts = {}
    self.tokenCounts = {}
def __init__(self, classSet=None, featureSet=None):
    """
    Initialise the common example builder state.

    classSet, featureSet -- IdSet objects, None, or file names (str or
        unicode); a file name is loaded into a new IdSet.
    """
    # isinstance with types.StringTypes also accepts unicode file names
    # and str subclasses, which the exact type() comparison rejected.
    if isinstance(classSet, types.StringTypes):
        self.classSet = IdSet(filename=classSet)
    else:
        self.classSet = classSet

    if isinstance(featureSet, types.StringTypes):
        self.featureSet = IdSet(filename=featureSet)
    else:
        self.featureSet = featureSet

    self.featureTag = ""
    self.exampleStats = ExampleStats()
    self.parse = None
    self.tokenization = None
    #self.idFileTag = None
    self.classIdFilename = None
    self.featureIdFilename = None

    # Style parameter state; "sentenceLimit" is the one default parameter
    # declared at this level.
    self.styles = {}
    self._defaultParameters = None
    self._parameterValueLimits = None
    self._setDefaultParameters(["sentenceLimit"])
    self.debug = False
def __init__(self, examples, predictions=None, classSet=None):
    """
    Prepare evaluation data and, if predictions are given, run the
    evaluation.

    examples, predictions, classSet -- either already loaded objects or
        file names (str or unicode), in which case they are loaded here.
    """
    # isinstance with types.StringTypes also accepts unicode file names
    # and str subclasses, which the exact type() comparison rejected.
    if isinstance(classSet, types.StringTypes):  # class names are in file
        classSet = IdSet(filename=classSet)
    if isinstance(predictions, types.StringTypes):  # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if isinstance(examples, types.StringTypes):  # examples are in file
        examples = ExampleUtils.readExamples(examples, False)

    self.classSet = classSet
    # Per-class data is created on first access.
    self.dataByClass = defaultdict(EvaluationData)

    #self.untypedUndirected = None
    self.untypedCurrentMajorId = None
    self.untypedPredictionQueue = []
    self.untypedUndirected = EvaluationData()
    #self.AUC = None
    if predictions is not None:
        self._calculate(examples, predictions)
def __init__(self, examples=None, predictions=None, classSet=None):
    """
    Initialise the binary evaluation counters and, if predictions are
    given, run the evaluation.

    examples, predictions, classSet -- either already loaded objects or
        file names (str or unicode), in which case they are loaded here.
    """
    # isinstance with types.StringTypes also accepts unicode file names
    # and str subclasses, which the exact type() comparison rejected.
    if isinstance(classSet, types.StringTypes):  # class names are in file
        classSet = IdSet(filename=classSet)
    if isinstance(predictions, types.StringTypes):  # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if isinstance(examples, types.StringTypes):  # examples are in file
        examples = ExampleUtils.readExamples(examples, False)

    #self.examples = examples
    #self.predictions = predictions
    self.truePositives = 0
    self.falsePositives = 0
    self.trueNegatives = 0
    self.falseNegatives = 0
    self.precision = None
    self.recall = None
    self.fScore = None
    self.AUC = None
    self.type = "binary"
    if predictions is not None:
        self._calculate(examples, predictions)
def __init__(self, examples, predictions=None, classSet=None):
    """
    Evaluate predictions with BioNLP11GeniaTools.evaluateBX.

    The examples and predictions are written to interaction XML using the
    corpus configured in the BXEvaluator class attributes, post-processed,
    converted to the shared task format and evaluated; the outcome is
    stored in self.results.

    examples, predictions, classSet -- either already loaded objects or
        file names (str or unicode), in which case they are loaded here.
    """
    # isinstance with types.StringTypes also accepts unicode file names
    # and str subclasses, which the exact type() comparison rejected.
    if isinstance(classSet, types.StringTypes):  # class names are in file
        classSet = IdSet(filename=classSet)
    if isinstance(predictions, types.StringTypes):  # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if isinstance(examples, types.StringTypes):  # examples are in file
        examples = ExampleUtils.readExamples(examples, False)

    corpusElements = Core.SentenceGraph.loadCorpus(
        BXEvaluator.corpusFilename, BXEvaluator.parse, BXEvaluator.tokenization)
    # Build interaction xml
    xml = BioTextExampleWriter.write(
        examples, predictions, corpusElements, None,
        BXEvaluator.ids + ".class_names", BXEvaluator.parse, BXEvaluator.tokenization)
    xml = ix.splitMergedElements(xml, None)
    xml = ix.recalculateIds(xml, None, True)
    #xml = ExampleUtils.writeToInteractionXML(examples, predictions, SharedTaskEvaluator.corpusElements, None, "genia-direct-event-ids.class_names", SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
    # Convert to GENIA format
    STFormat.ConvertXML.toSTFormat(xml, BXEvaluator.geniaDir, outputTag="a2")
    #gifxmlToGenia(xml, BXEvaluator.geniaDir, task=SharedTaskEvaluator.task, verbose=False)
    # Use GENIA evaluation tool
    self.results = BioNLP11GeniaTools.evaluateBX(BXEvaluator.geniaDir, corpusName=BXEvaluator.corpusTag)
    # The former trailing ``corpusElements = None`` only rebound a local
    # that goes out of scope here anyway, so it has been dropped.
def getClassSet(rows, classSet=None):
    """
    Return the class id set to use for the given result rows.

    rows -- iterable of mappings with "class" and "prediction" keys.
    classSet -- an existing class set; returned unchanged when given.

    When classSet is None a new IdSet is built. Id 1 goes to "1" if that
    name occurs in the rows, otherwise to "neg" (exactly one of the two
    must occur). All other class names are then registered in sorted
    order.
    """
    classNames = set()
    for row in rows:
        classNames.add(row["class"])
        classNames.add(row["prediction"])

    if classSet is not None:
        return classSet

    # Imported only when a new set has to be built, so callers that pass
    # their own classSet do not need Core.IdSet to be importable.
    from Core.IdSet import IdSet
    # In the case of multiclass, give integer id:s for the classes
    classSet = IdSet()
    assert not ("1" in classNames and "neg" in classNames)
    assert "1" in classNames or "neg" in classNames
    if "1" in classNames:
        classSet.defineId("1", 1)
    else:
        classSet.defineId("neg", 1)
    for className in sorted(classNames):
        if className != "1" and className != "neg":
            # Called for its effect of registering the name.
            classSet.getId(className)
    return classSet
def writeXML(self, examples, predictions, corpus, outputFile, classSet=None, parse=None, tokenization=None, goldCorpus=None, exampleStyle=None, structureAnalyzer=None):
    """
    Writes task 3 examples to interaction XML. Assumes task 3 classification
    is done with SVMMulticlass Classifier, used for two classes.

    The subtask ("multiclass", "speculation" or "negation") is taken from
    the "t3type" extra of the first example. The matching "speculation" /
    "negation" attributes of every entity element are reset to "False"
    and then set from the predictions, together with a confidence
    attribute ("modConf", "specConf" or "negConf").

    classSet may be an id set or a file name (str or unicode). The corpus
    is written to outputFile unless that is None. Returns the corpus
    element tree. parse, tokenization, goldCorpus, exampleStyle and
    structureAnalyzer are not used here.
    """
    print >> sys.stderr, "Adding task 3 to Interaction XML"
    examples, predictions = self.loadExamples(examples, predictions)

    # isinstance with types.StringTypes also accepts unicode file names
    # and str subclasses, which the exact type() comparison rejected.
    if isinstance(classSet, types.StringTypes):  # class names are in file
        classSet = IdSet(filename=classSet)
    classIds = None
    if classSet is not None:
        classIds = classSet.getIds()

    corpusTree = ETUtils.ETFromObj(corpus)
    corpusRoot = corpusTree.getroot()

    # Determine subtask from the first example only
    task3Type = None
    for example in examples:
        assert "t3type" in example[3]
        task3Type = example[3]["t3type"]
        break
    if task3Type is None:
        # No examples: write the corpus through unchanged.
        if outputFile is not None:
            print >> sys.stderr, "Writing corpus to", outputFile
            ETUtils.write(corpusRoot, outputFile)
        return corpusTree
    assert task3Type in ["multiclass", "speculation", "negation"]

    # Remove the task 3 subtask information if it already exists
    for entity in corpusRoot.getiterator("entity"):
        if task3Type == "multiclass":
            entity.set("speculation", "False")
            entity.set("negation", "False")
        elif task3Type == "speculation":
            entity.set("speculation", "False")
        else:  # task3Type == "negation"
            entity.set("negation", "False")

    # entity id -> (modifier predicted?, prediction)
    specMap = {}
    negMap = {}
    for example, prediction in itertools.izip(examples, predictions):
        extra = example[3]
        assert extra["xtype"] == "task3"
        if extra["t3type"] == "multiclass":
            if isinstance(prediction, dict):
                # Every position of the encoded prediction that equals 1
                # names one predicted modifier.
                encoded = prediction["prediction"]
                predictedModifiers = [classSet.getName(i) for i in range(len(encoded)) if encoded[i] == 1]
            else:
                predictedClassName = classSet.getName(prediction[0])
                predictedModifiers = ""
                if predictedClassName != "neg":
                    predictedModifiers = predictedClassName.split("---")
            if "negation" in predictedModifiers:
                assert extra["entity"] not in negMap
                negMap[extra["entity"]] = (True, prediction)
            if "speculation" in predictedModifiers:
                assert extra["entity"] not in specMap
                specMap[extra["entity"]] = (True, prediction)
        else:
            # (The local used to be called ``map``, hiding the builtin.)
            if extra["t3type"] == "speculation":
                targetMap = specMap
            else:
                targetMap = negMap
            # Class id 1 is the negative class; anything else counts as
            # the modifier being predicted.
            assert extra["entity"] not in targetMap
            targetMap[extra["entity"]] = (prediction[0] != 1, prediction)

    for entity in corpusRoot.getiterator("entity"):
        eId = entity.get("id")
        if task3Type == "multiclass":
            # Both modifiers write to the same "modConf" attribute, so
            # the negation value wins when both are present.
            if eId in specMap:
                entity.set("speculation", str(specMap[eId][0]))
                entity.set("modConf", self.getPredictionStrengthString(specMap[eId][1], classSet, classIds))
            if eId in negMap:
                entity.set("negation", str(negMap[eId][0]))
                entity.set("modConf", self.getPredictionStrengthString(negMap[eId][1], classSet, classIds))
        elif task3Type == "speculation":
            if eId in specMap:
                entity.set("speculation", str(specMap[eId][0]))
                entity.set("specConf", self.getPredictionStrengthString(specMap[eId][1], classSet, classIds, [""]))
        elif task3Type == "negation":
            if eId in negMap:
                entity.set("negation", str(negMap[eId][0]))
                entity.set("negConf", self.getPredictionStrengthString(negMap[eId][1], classSet, classIds, ["", "speculation"]))

    # Write corpus
    if outputFile is not None:
        print >> sys.stderr, "Writing corpus to", outputFile
        ETUtils.write(corpusRoot, outputFile)
    return corpusTree
def __init__(self, style=None, types=None, featureSet=None, classSet=None):
    """
    Set up the builder: id sets, style parameters and every feature
    builder enabled by the style.

    style -- parsed by self.getParameters into self.styles.
    types -- stored as self.types; defaults to a new empty list.
    featureSet, classSet -- id sets; new IdSet objects are created when
             these are None. The class set must map "neg" to 1, or be a
             two-class set mapping "neg" to -1.
    """
    if featureSet is None:
        featureSet = IdSet()
    if classSet is None:
        classSet = IdSet(1)
    # A None sentinel replaces the former ``types=[]`` default, which was
    # one list object shared by every instance.
    if types is None:
        types = []

    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
    # Resolve the id outside the assert: the lookup is a method call that
    # may register "neg" (TODO confirm against IdSet), and anything inside
    # an assert is skipped entirely under ``python -O``.
    negId = classSet.getId("neg")
    assert negId == 1 or (len(classSet.Ids) == 2 and negId == -1)

    # Basic style = trigger_features:typed:directed:no_linear:entities:auto_limits:noMasking:maxFeatures
    # NOTE(review): "headsOnly" and "graph_kernel" appear twice in this
    # list; left untouched -- confirm _setDefaultParameters tolerates it.
    self._setDefaultParameters([
        "directed", "undirected", "headsOnly", "graph_kernel", "noAnnType", "mask_nodes", "limit_features",
        "no_auto_limits", "co_features", "genia_features", "bi_features", #"genia_limits", "epi_limits", "id_limits", "rel_limits", "bb_limits", "bi_limits", "co_limits",
        "genia_task1", "ontology", "nodalida", "bacteria_renaming", "no_trigger_features", "rel_features",
        "drugbank_features", "ddi_mtmx", "evex", "giuliano", "random", "themeOnly", "causeOnly", "no_path", "token_nodes",
        "skip_extra_triggers", "headsOnly", "graph_kernel", "no_task", "no_dependency",
        "disable_entity_features", "disable_terminus_features", "disable_single_element_features",
        "disable_ngram_features", "disable_path_edge_features", "linear_features", "subset", "binary", "pos_only",
        "entity_type", "filter_shortest_path", "maskTypeAsProtein", "keep_neg", "metamap",
        "sdb_merge", "sdb_features", "ontobiotope_features", "no_self_loops", "full_entities",
        "no_features", "wordnet", "wordvector", "se10t8_undirected", "filter_types", "doc_extra",
        "entity_extra"])
    self.styles = self.getParameters(style)
    #if style == None: # no parameters given
    #    style["typed"] = style["directed"] = style["headsOnly"] = True

    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet, self.styles)
    # NOTE Temporarily re-enabling predicted range
    #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None)
    if self.styles["graph_kernel"]:
        from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
        self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
    if self.styles["noAnnType"]:
        self.multiEdgeFeatureBuilder.noAnnType = True
    # Entity masking is on only when "mask_nodes" is set.
    self.multiEdgeFeatureBuilder.maskNamedEntities = bool(self.styles["mask_nodes"])
    if not self.styles["limit_features"]:
        self.multiEdgeFeatureBuilder.maximum = True
    if self.styles["genia_task1"]:
        self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity")
    self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
    if self.styles["ontology"]:
        self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
    if self.styles["ontobiotope_features"]:
        self.ontobiotopeFeatureBuilder = OntoBiotopeFeatureBuilder(self.featureSet)
    if self.styles["nodalida"]:
        self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
    if self.styles["bacteria_renaming"]:
        self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(self.featureSet)
    if not self.styles["no_trigger_features"]:
        # Trigger features are built unless explicitly disabled.
        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet, self.styles)
        self.triggerFeatureBuilder.useNonNameEntities = True
        if self.styles["noAnnType"]:
            self.triggerFeatureBuilder.noAnnType = True
        if self.styles["genia_task1"]:
            self.triggerFeatureBuilder.filterAnnTypes.add("Entity")
    #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
    if self.styles["rel_features"]:
        self.relFeatureBuilder = RELFeatureBuilder(featureSet)
    if self.styles["drugbank_features"]:
        self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
    if self.styles["evex"]:
        self.evexFeatureBuilder = EVEXFeatureBuilder(featureSet)
    if self.styles["wordnet"]:
        self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
    if self.styles["wordvector"]:
        self.wordVectorFeatureBuilder = WordVectorFeatureBuilder(featureSet, self.styles)
    if self.styles["giuliano"]:
        self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
    self.types = types
    if self.styles["random"]:
        from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
        self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
def __init__(self):
    """
    Set up the builder with a new IdSet(1) class set whose "neg" id
    must be 1.
    """
    ExampleBuilder.__init__(self)
    self.classSet = IdSet(1)
    # Resolve the id outside the assert: the lookup is a method call that
    # may register "neg" (TODO confirm against IdSet), and anything inside
    # an assert is skipped entirely under ``python -O``.
    negId = self.classSet.getId("neg")
    assert negId == 1