def addExamples(exampleFile, predictionFile, classFile, matrix):
    classSet = IdSet(filename=classFile)
    f = open(predictionFile, "rt")
    for example in ExampleUtils.readExamples(exampleFile, False):
        pred = int(f.readline().split()[0])
        predClasses = classSet.getName(pred)
        goldClasses = classSet.getName(example[1])
        for predClass in predClasses.split("---"):
            for goldClass in goldClasses.split("---"):
                matrix[predClass][goldClass] # touch the cell so it exists; a no-op unless matrix auto-creates entries (e.g. a defaultdict)
                matrix[goldClass][predClass] += 1
    f.close()
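# --- Usage sketch (not part of the original module) ---
# A minimal, hypothetical example of calling addExamples() above. It assumes the
# TEES ExampleUtils/IdSet modules are importable and that the example, prediction
# and class-name files exist; the file names below are placeholders. A
# defaultdict-of-defaultdicts is used so that indexing a missing cell creates it
# with a zero count, which the bare matrix[predClass][goldClass] access above
# appears to rely on.
from collections import defaultdict

confusionMatrix = defaultdict(lambda: defaultdict(int))
addExamples("test.examples", "test.predictions", "class_names.txt", confusionMatrix) # hypothetical paths
for goldClass in sorted(confusionMatrix.keys()):
    for predClass in sorted(confusionMatrix[goldClass].keys()):
        print goldClass, "->", predClass, confusionMatrix[goldClass][predClass]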
def devectorizePredictions(self, predictions):
    """
    Converts a dense Numpy array of [examples][width][height][features] into
    the corresponding Python list matrices where features are stored in a
    key-value dictionary.
    """
    targetIds = IdSet(filename=self.model.get(self.tag+"ids.classes"), locked=True)
    dimMatrix = int(self.model.getStr("dimMatrix"))
    dimLabels = int(self.model.getStr("dimLabels"))
    predictions = reshape(predictions, (predictions.shape[0], dimMatrix, dimMatrix, dimLabels))
    rangeMatrix = range(dimMatrix)
    labels = np.argmax(predictions, axis=-1)
    values = np.max(predictions, axis=-1)
    minValue = np.min(values)
    maxValue = np.max(values)
    valRange = maxValue - minValue
    print "MINMAX", minValue, maxValue
    devectorized = []
    for exampleIndex in range(predictions.shape[0]):
        #print predictions[exampleIndex]
        devectorized.append([])
        for i in rangeMatrix:
            devectorized[-1].append([])
            for j in rangeMatrix:
                features = {}
                devectorized[-1][-1].append(features)
                maxFeature = labels[exampleIndex][i][j]
                predValue = predictions[exampleIndex][i][j][maxFeature]
                features[targetIds.getName(maxFeature)] = float(predValue)
                features["color"] = self.getColor((predValue - minValue) / valRange)
    return devectorized
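# --- Standalone sketch of the devectorization step (not part of the original class) ---
# Illustrates, with plain NumPy and no model or IdSet, how the method above turns a
# dense [examples][width][height][labels] score array into per-cell dictionaries keyed
# by the highest-scoring label. The label names and dimensions are made up.
import numpy as np

scores = np.random.rand(2, 3, 3, 4)     # 2 examples, 3x3 grid, 4 labels
labelNames = ["neg", "A", "B", "C"]     # hypothetical label id -> name mapping
labels = np.argmax(scores, axis=-1)     # best label index per cell
values = np.max(scores, axis=-1)        # score of that label
devectorized = []
for exampleIndex in range(scores.shape[0]):
    grid = []
    for i in range(scores.shape[1]):
        row = []
        for j in range(scores.shape[2]):
            row.append({labelNames[labels[exampleIndex][i][j]]: float(values[exampleIndex][i][j])})
        grid.append(row)
    devectorized.append(grid)
print devectorized[0][0][0]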
def readARFF(filename):
    featureSet = IdSet(1)
    classSet = IdSet(0)
    f = open(filename, "rt")
    inData = False
    lines = f.readlines()
    counter = ProgressCounter(len(lines), "ARFFLine")
    examples = []
    for line in lines:
        counter.update(string="Processing line " + str(counter.current + 1) + ": ")
        line = line.strip()
        if len(line) == 0 or line[0] == "%":
            continue
        elif line[0] == "@":
            #print line
            category = line.split()[0].lower()
            if category == "@attribute":
                category, name, type = line.split()
                assert(not inData)
                if name.lower() == "class":
                    name = name.lower()
                    classNames = type[1:-1].split(",")
                    assert(len(classNames) == 2)
                    classSet.defineId(classNames[0].strip(), 1)
                    classSet.defineId(classNames[1].strip(), -1)
                featureSet.getId(name)
            elif category.lower() == "@relation":
                assert(not inData)
            elif category == "@data":
                inData = True
        else:
            assert(inData)
            count = 1
            features = {}
            for column in line.split(","):
                if featureSet.getName(count) != "class":
                    features[count] = float(column)
                else:
                    classId = classSet.getId(column, False)
                    assert(classId != None)
                count += 1
            exampleCount = str(len(examples))
            exampleId = "BreastCancer.d" + exampleCount + ".s0.x0"
            examples.append([exampleId, classId, features, {}])
    return examples
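# --- Usage sketch (not part of the original module) ---
# A minimal, hypothetical call to readARFF() above: load a two-class ARFF file and
# print the class distribution. The file name is a placeholder and the class ids
# follow the +1/-1 convention defined inside the function.
examples = readARFF("breast-cancer.arff") # hypothetical path
classCounts = {1: 0, -1: 0}
for example in examples:
    classCounts[example[1]] += 1
print len(examples), "examples,", classCounts[1], "positive and", classCounts[-1], "negative"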
def writeXML(self, examples, predictions, corpus, outputFile, classSet=None, parse=None, tokenization=None, goldCorpus=None, exampleStyle=None, structureAnalyzer=None):
    """
    Writes task 3 examples to interaction XML. Assumes task 3 classification
    is done with SVMMulticlass Classifier, used for two classes.
    """
    print >> sys.stderr, "Adding task 3 to Interaction XML"
    examples, predictions = self.loadExamples(examples, predictions)

    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    classIds = None
    if classSet != None:
        classIds = classSet.getIds()

    corpusTree = ETUtils.ETFromObj(corpus)
    corpusRoot = corpusTree.getroot()

    # Determine subtask
    task3Type = None
    for example in examples:
        assert example[3].has_key("t3type")
        task3Type = example[3]["t3type"]
        break
    if task3Type == None:
        if outputFile != None:
            print >> sys.stderr, "Writing corpus to", outputFile
            ETUtils.write(corpusRoot, outputFile)
        return corpusTree
    assert task3Type in ["multiclass", "speculation", "negation"]

    # Remove the task 3 subtask information if it already exists
    for entity in corpusRoot.getiterator("entity"):
        if task3Type == "multiclass":
            entity.set("speculation", "False")
            entity.set("negation", "False")
        elif task3Type == "speculation":
            entity.set("speculation", "False")
        else: # task3Type == "negation"
            entity.set("negation", "False")

    specMap = {}
    negMap = {}
    for example, prediction in itertools.izip(examples, predictions):
        assert example[3]["xtype"] == "task3"
        if example[3]["t3type"] == "multiclass":
            if isinstance(prediction, dict):
                encoded = prediction["prediction"]
                predictedModifiers = [classSet.getName(i) for i in range(len(encoded)) if encoded[i] == 1]
            else:
                predictedClassName = classSet.getName(prediction[0])
                predictedModifiers = ""
                if predictedClassName != "neg":
                    predictedModifiers = predictedClassName.split("---")
            if "negation" in predictedModifiers:
                assert not negMap.has_key(example[3]["entity"])
                negMap[example[3]["entity"]] = (True, prediction)
            if "speculation" in predictedModifiers:
                assert not specMap.has_key(example[3]["entity"])
                specMap[example[3]["entity"]] = (True, prediction)
        else:
            if example[3]["t3type"] == "speculation":
                map = specMap
            else:
                map = negMap
            if prediction[0] != 1:
                assert not map.has_key(example[3]["entity"])
                map[example[3]["entity"]] = (True, prediction)
            else:
                assert not map.has_key(example[3]["entity"])
                map[example[3]["entity"]] = (False, prediction)

    for entity in corpusRoot.getiterator("entity"):
        eId = entity.get("id")
        if task3Type == "multiclass":
            if specMap.has_key(eId):
                entity.set("speculation", str(specMap[eId][0]))
                entity.set("modConf", self.getPredictionStrengthString(specMap[eId][1], classSet, classIds))
            if negMap.has_key(eId):
                entity.set("negation", str(negMap[eId][0]))
                entity.set("modConf", self.getPredictionStrengthString(negMap[eId][1], classSet, classIds))
        else:
            if task3Type == "speculation":
                if specMap.has_key(eId):
                    entity.set("speculation", str(specMap[eId][0]))
                    entity.set("specConf", self.getPredictionStrengthString(specMap[eId][1], classSet, classIds, [""]))
            elif task3Type == "negation":
                if negMap.has_key(eId):
                    entity.set("negation", str(negMap[eId][0]))
                    entity.set("negConf", self.getPredictionStrengthString(negMap[eId][1], classSet, classIds, ["", "speculation"]))

    # Write corpus
    if outputFile != None:
        print >> sys.stderr, "Writing corpus to", outputFile
        ETUtils.write(corpusRoot, outputFile)
    return corpusTree
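# --- Standalone sketch of the modifier-writing step (not part of the original class) ---
# Shows, with plain ElementTree and a toy document, how the writer above marks
# speculation/negation on <entity> elements: maps keyed by entity id are built from
# the predictions and then applied as string attributes. The entity ids and map
# contents here are fabricated for illustration only.
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

root = ET.fromstring('<document><entity id="d0.e1"/><entity id="d0.e2"/></document>')
specMap = {"d0.e1": (True, None)} # hypothetical speculation predictions
negMap = {"d0.e2": (True, None)}  # hypothetical negation predictions
for entity in root.getiterator("entity"):
    eId = entity.get("id")
    entity.set("speculation", str(specMap.get(eId, (False, None))[0]))
    entity.set("negation", str(negMap.get(eId, (False, None))[0]))
print ET.tostring(root)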
defaultAnalysisFilename = "/usr/share/biotext/ComplexPPI/BioInferForComplexPPIVisible.xml"
optparser = OptionParser(usage="%prog [options]\nCreate an html visualization for a corpus.")
optparser.add_option("-i", "--invariant", default=None, dest="invariant", help="Corpus in analysis format", metavar="FILE")
optparser.add_option("-v", "--variant", default=None, dest="variant", help="Corpus in analysis format", metavar="FILE")
(options, args) = optparser.parse_args()

#invariantExamples = ExampleUtils.readExamples(os.path.join(options.invariant, "examples.txt"))
variantExamples = ExampleUtils.readExamples(os.path.join(options.variant, "test-triggers.examples"))

invariantFeatureSet = IdSet()
invariantFeatureSet.load(os.path.join(options.invariant, "feature_names.txt"))
invariantClassSet = IdSet()
invariantClassSet.load(os.path.join(options.invariant, "class_names.txt"))

variantFeatureSet = IdSet()
variantFeatureSet.load(os.path.join(options.variant, "test-triggers.examples.feature_names"))
variantClassSet = IdSet()
variantClassSet.load(os.path.join(options.variant, "test-triggers.examples.class_names"))

counter = ProgressCounter(len(variantExamples))
for example in variantExamples:
    counter.update()
    # Map the class id and feature ids of each variant example onto the invariant id space
    example[1] = invariantClassSet.getId(variantClassSet.getName(example[1]))
    newFeatures = {}
    for k, v in example[2].iteritems():
        newFeatures[invariantFeatureSet.getId(variantFeatureSet.getName(k))] = v
    example[2] = newFeatures

ExampleUtils.writeExamples(variantExamples, os.path.join(options.variant, "realignedExamples.txt"))
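# --- Standalone sketch of the realignment idea (not part of the original script) ---
# Illustrates with plain dicts what the loop above does via IdSet: a feature id from
# the "variant" id space is translated to its name and then looked up in the
# "invariant" id space, so both example sets end up referring to the same numeric ids.
# All names and ids below are made up.
variantIdToName = {1: "bow_protein", 2: "bow_binds"}
invariantNameToId = {"bow_binds": 7, "bow_protein": 12}
variantFeatures = {1: 1.0, 2: 0.5}
realigned = {}
for k, v in variantFeatures.iteritems():
    realigned[invariantNameToId[variantIdToName[k]]] = v
print realigned # {12: 1.0, 7: 0.5}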
def writeXML(self, examples, predictions, corpus, outputFile, classSet=None, parse=None, tokenization=None, goldCorpus=None, exampleStyle=None):
    """
    Writes task 3 examples to interaction XML. Assumes task 3 classification
    is done with SVMMulticlass Classifier, used for two classes.
    """
    print >> sys.stderr, "Adding task 3 to Interaction XML"
    examples, predictions = self.loadExamples(examples, predictions)

    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    classIds = None
    if classSet != None:
        classIds = classSet.getIds()

    corpusTree = ETUtils.ETFromObj(corpus)
    corpusRoot = corpusTree.getroot()

    # Determine subtask
    task3Type = None
    for example in examples:
        assert example[3].has_key("t3type")
        task3Type = example[3]["t3type"]
        break
    if task3Type == None:
        if outputFile != None:
            print >> sys.stderr, "Writing corpus to", outputFile
            ETUtils.write(corpusRoot, outputFile)
        return corpusTree
    assert task3Type in ["multiclass", "speculation", "negation"]

    # Remove the task 3 subtask information if it already exists
    for entity in corpusRoot.getiterator("entity"):
        if task3Type == "multiclass":
            entity.set("speculation", "False")
            entity.set("negation", "False")
        elif task3Type == "speculation":
            entity.set("speculation", "False")
        else: # task3Type == "negation"
            entity.set("negation", "False")

    specMap = {}
    negMap = {}
    for example, prediction in itertools.izip(examples, predictions):
        assert example[3]["xtype"] == "task3"
        if example[3]["t3type"] == "multiclass":
            predictedClassName = classSet.getName(prediction[0])
            if predictedClassName != "neg":
                predictedModifiers = predictedClassName.split("---")
                if "negation" in predictedModifiers:
                    assert not negMap.has_key(example[3]["entity"])
                    negMap[example[3]["entity"]] = (True, prediction)
                if "speculation" in predictedModifiers:
                    assert not specMap.has_key(example[3]["entity"])
                    specMap[example[3]["entity"]] = (True, prediction)
        else:
            if example[3]["t3type"] == "speculation":
                map = specMap
            else:
                map = negMap
            if prediction[0] != 1:
                assert not map.has_key(example[3]["entity"])
                map[example[3]["entity"]] = (True, prediction)
            else:
                assert not map.has_key(example[3]["entity"])
                map[example[3]["entity"]] = (False, prediction)

    for entity in corpusRoot.getiterator("entity"):
        eId = entity.get("id")
        if task3Type == "multiclass":
            if specMap.has_key(eId):
                entity.set("speculation", str(specMap[eId][0]))
                entity.set("modPred", self.getPredictionStrengthString(specMap[eId][1], classSet, classIds))
            if negMap.has_key(eId):
                entity.set("negation", str(negMap[eId][0]))
                entity.set("modPred", self.getPredictionStrengthString(negMap[eId][1], classSet, classIds))
        else:
            if task3Type == "speculation":
                if specMap.has_key(eId):
                    entity.set("speculation", str(specMap[eId][0]))
                    entity.set("specPred", self.getPredictionStrengthString(specMap[eId][1], classSet, classIds, [""]))
            elif task3Type == "negation":
                if negMap.has_key(eId):
                    entity.set("negation", str(negMap[eId][0]))
                    entity.set("negPred", self.getPredictionStrengthString(negMap[eId][1], classSet, classIds, ["", "speculation"]))

    # Write corpus
    if outputFile != None:
        print >> sys.stderr, "Writing corpus to", outputFile
        ETUtils.write(corpusRoot, outputFile)
    return corpusTree
def threshold(examples, predictionsDir=None, classSet=None):
    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    classIds = set()
    if type(examples) == types.StringType: # examples are in file
        examplesTemp = ExampleUtils.readExamples(examples, False)
        examples = []
        for example in examplesTemp:
            examples.append(example)
            classIds.add(example[1])
    classIds = list(classIds)
    classIds.sort()

    #multilabel = MultiLabelMultiClassEvaluator(None, None, classSet)
    #multilabel._calculate(examples, predictions)
    #print multilabel.toStringConcise(title="multilabel")

    bestThrF = [0]
    bestBaseF = [0]
    predFileNames = []
    for filename in os.listdir(predictionsDir):
        if "predictions" in filename:
            predFileNames.append((int(filename.rsplit("_")[-1]), filename))
    predFileNames.sort()
    for predFileName in predFileNames:
        predictionsTemp = ExampleUtils.loadPredictions(os.path.join(predictionsDir, predFileName[1]))
        predictions = []
        for prediction in predictionsTemp:
            predictions.append(prediction)
        baseEv = AveragingMultiClassEvaluator(None, None, classSet)
        baseEv._calculate(examples, predictions)
        print "============================"
        print predFileName[1]
        print "============================"
        #print baseEv.toStringConcise(title="baseline")
        baseLineF = baseEv.microF.fscore
        for step in [0]:
            for classId in [1]: #classIds:
                cls = None
                if classSet != None:
                    cls = classSet.getName(classId)
                else:
                    cls = str(classId)
                bestF = thresholdClass(examples, predictions, classId, baseLineF)
                # Shift the scores of this class by the best threshold and re-assign
                # each prediction to its highest-scoring class
                for prediction in predictions:
                    prediction[classId] -= bestF[2][0] + 0.00000001
                changed = 0
                for prediction in predictions:
                    maxVal = -999999
                    maxClass = None
                    for i in range(1, len(prediction)):
                        if prediction[i] > maxVal:
                            maxVal = prediction[i]
                            maxClass = i
                    if maxClass != prediction[0]:
                        prediction[0] = maxClass
                        changed += 1
                print step, cls, "changed", changed, bestF[0]
                baseLineF = bestF[0]
        if bestF[0] > bestThrF[0]:
            bestThrF = (bestF[0], predFileName[1], bestF[1], bestF[2], bestF[3])
        if baseEv.microF.fscore > bestBaseF[0]:
            bestBaseF = (baseEv.microF.fscore, predFileName[1], baseEv.microF.toStringConcise())
        print "-------- Baseline ------------"
        print baseEv.toStringConcise()
        print "-------- Best ------------"
        print bestF[0], bestF[1], bestF[2]
        print bestF[3]
        thEv = AveragingMultiClassEvaluator(None, None, classSet)
        thEv._calculate(examples, predictions)
        print thEv.toStringConcise()
    print "=============== All Best ==============="
    print "Threshold", bestThrF
    print "Base", bestBaseF
    sys.exit()

    # NOTE: the code below is unreachable (sys.exit() above) and refers to an
    # undefined 'pairs' variable; it is kept as in the original.
    memPredictions = []
    bestEv = baseEv
    bestPair = [None, None, None]
    for p in predictions:
        memPredictions.append(p)
    for pair in pairs:
        modifier = pair[0] + 0.00000001
        changedClass = 0
        for pred in memPredictions:
            negPred = pred[1] - modifier
            maxVal = negPred
            maxClass = 1
            for i in range(2, len(pred)):
                if pred[i] > maxVal:
                    maxVal = pred[i]
                    maxClass = i
            if pred[0] != maxClass:
                changedClass += 1
                pred[0] = maxClass
        ev = AveragingMultiClassEvaluator(None)
        ev._calculate(examples, memPredictions)
        print pair[0], pair[2], changedClass
        print ev.toStringConcise()
        if ev.compare(bestEv) == 1:
            print "Improved"
            bestPair = pair
            bestEv = ev
    print "---------------------------------------------"
    print baseEv.toStringConcise(title="baseline")
    print bestPair[0], bestPair[2]
    print bestEv.toStringConcise(title="best")
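# --- Standalone sketch of the thresholding idea (not part of the original function) ---
# Demonstrates the core operation of threshold() above on toy data: subtract a
# per-class offset from one class's score and re-assign each prediction to its
# highest-scoring class (element 0 holds the predicted class id, elements 1..n the
# class scores). The offset and scores are fabricated.
predictions = [[1, 0.9, 0.8, 0.1], [3, 0.2, 0.1, 0.6]]
offset = 0.3 # hypothetical threshold learned for class 1
changed = 0
for prediction in predictions:
    prediction[1] -= offset
    maxVal, maxClass = -999999, None
    for i in range(1, len(prediction)):
        if prediction[i] > maxVal:
            maxVal, maxClass = prediction[i], i
    if maxClass != prediction[0]:
        prediction[0] = maxClass
        changed += 1
print changed, "prediction(s) changed class:", predictions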