示例#1
0
def addExamples(exampleFile, predictionFile, classFile, matrix):
    """
    Accumulate predicted-vs-gold class co-occurrence counts into matrix.

    Examples are read from exampleFile and paired, in order, with the lines
    of predictionFile. The first whitespace-separated token of each
    prediction line is converted to int and used as the predicted class id.
    Class ids are turned into names through the IdSet loaded from classFile,
    and names joined with "---" are split into their parts before counting.

    exampleFile -- passed to ExampleUtils.readExamples()
    predictionFile -- text file with one prediction per line
    classFile -- file name given to IdSet
    matrix -- updated in place; must accept matrix[a][b] for class names it
        has not seen yet (presumably a nested defaultdict -- confirm against
        the caller)

    Raises IndexError if the prediction file has fewer usable lines than
    there are examples.
    """
    classSet = IdSet(filename=classFile)
    # The with-block closes the prediction file even when a line fails to
    # parse or the file runs out before the examples do.
    with open(predictionFile, "rt") as f:
        for example in ExampleUtils.readExamples(exampleFile, False):
            pred = int(f.readline().split()[0])
            predClasses = classSet.getName(pred)
            goldClasses = classSet.getName(example[1])
            for predClass in predClasses.split("---"):
                for goldClass in goldClasses.split("---"):
                    # Bare lookup kept as in the original: it has an effect
                    # only if matrix creates missing cells on access.
                    matrix[predClass][goldClass]
                    matrix[goldClass][predClass] += 1
示例#2
0
def addExamples(exampleFile, predictionFile, classFile, matrix):
    """
    Add the class co-occurrences of one example/prediction file pair to matrix.

    The n-th example read from exampleFile is matched with the n-th line of
    predictionFile, whose first whitespace-separated token is parsed as the
    predicted class id. Both the predicted id and the example's own class id
    (example[1]) are mapped to names with the IdSet loaded from classFile;
    each name is split on "---" and every (gold part, predicted part)
    combination increments matrix[gold][predicted] by one.

    matrix is modified in place. It has to tolerate lookups of unseen class
    names (presumably a nested defaultdict -- verify against the caller).

    Raises IndexError when a prediction line is empty or missing.
    """
    classSet = IdSet(filename=classFile)
    # Context manager instead of open()/close(): the file handle is released
    # on every exit path, including exceptions raised while parsing.
    with open(predictionFile, "rt") as f:
        for example in ExampleUtils.readExamples(exampleFile, False):
            pred = int(f.readline().split()[0])
            predClasses = classSet.getName(pred)
            goldClasses = classSet.getName(example[1])
            for predClass in predClasses.split("---"):
                for goldClass in goldClasses.split("---"):
                    # This expression statement is preserved deliberately; it
                    # only matters if matrix auto-creates entries on lookup.
                    matrix[predClass][goldClass]
                    matrix[goldClass][predClass] += 1
示例#3
0
 def devectorizePredictions(self, predictions):
     """
     Converts a dense Numpy array of [examples][width][height][features] into
     the corresponding Python list matrices where features are stored in a key-value
     dictionary.
     """
     targetIds = IdSet(filename=self.model.get(self.tag+"ids.classes"), locked=True)
     dimMatrix = int(self.model.getStr("dimMatrix"))
     dimLabels = int(self.model.getStr("dimLabels"))
     predictions = reshape(predictions, (predictions.shape[0], dimMatrix, dimMatrix, dimLabels))
     rangeMatrix = range(dimMatrix)
     labels = np.argmax(predictions, axis=-1)
     values = np.max(predictions, axis=-1)
     minValue = np.min(values)
     maxValue = np.max(values)
     valRange = maxValue - minValue
     print "MINMAX", minValue, maxValue
     devectorized = []
     for exampleIndex in range(predictions.shape[0]):
         #print predictions[exampleIndex]
         devectorized.append([])
         for i in rangeMatrix:
             devectorized[-1].append([])
             for j in rangeMatrix:
                 features = {}
                 devectorized[-1][-1].append(features)
                 maxFeature = labels[exampleIndex][i][j]
                 predValue = predictions[exampleIndex][i][j][maxFeature]
                 features[targetIds.getName(maxFeature)] = float(predValue)
                 features["color"] = self.getColor((predValue - minValue) / valRange)
     return devectorized
示例#4
0
def readARFF(filename):
    """
    Read a two-class ARFF file into a list of examples.

    Header lines ("@relation", "@attribute", "@data") must all come before
    the data rows; blank lines and lines starting with "%" are skipped. Each
    "@attribute" registers its name in a feature IdSet. The attribute named
    "class" (case-insensitive) must list exactly two class names in the form
    "{first,second}"; the first is given class id 1 and the second -1.

    Each comma-separated data row becomes one example of the form
    [exampleId, classId, features, {}], where features maps the 1-based
    column number to the column's float value (the class column is left
    out) and exampleId is "BreastCancer.d<N>.s0.x0" with N the 0-based row
    number.

    filename -- path of the ARFF file
    Returns the list of examples.
    Raises AssertionError on misplaced header/data lines or an unknown class
    name, and ValueError on an "@attribute" line with fewer than three
    fields or a non-numeric feature value.
    """
    featureSet = IdSet(1)
    classSet = IdSet(0)
    # Read everything up front so the file is closed before parsing starts,
    # whatever happens while parsing.
    with open(filename, "rt") as f:
        lines = f.readlines()
    inData = False
    counter = ProgressCounter(len(lines),"ARFFLine")
    examples = []
    for line in lines:
        counter.update(string="Processing line " + str(counter.current + 1) + ": ")
        line = line.strip()
        if len(line) == 0 or line[0] == "%":
            continue
        elif line[0] == "@":
            category = line.split()[0].lower()
            if category == "@attribute":
                # Split into at most three fields so that a class list
                # written with spaces, e.g. "{a, b}", stays in one piece.
                category, name, attrType = line.split(None, 2)
                assert(not inData)
                if name.lower() == "class":
                    name = name.lower()
                    # Drop the surrounding braces, then separate the names
                    classNames = attrType[1:-1].split(",")
                    assert(len(classNames)==2)
                    classSet.defineId(classNames[0].strip(),1)
                    classSet.defineId(classNames[1].strip(),-1)
                featureSet.getId(name)
            elif category.lower() == "@relation":
                assert(not inData)
            elif category == "@data":
                inData = True
        else:
            assert(inData)
            # Column numbers are looked up in featureSet starting from 1
            count = 1
            features = {}
            for column in line.split(","):
                if featureSet.getName(count) != "class":
                    features[count] = float(column)
                else:
                    classId = classSet.getId(column, False)
                    assert(classId != None)
                count += 1
            exampleCount = str(len(examples))
            exampleId = "BreastCancer.d" + exampleCount + ".s0.x0"
            examples.append([exampleId,classId,features,{}])

    return examples
示例#5
0
    def writeXML(self,
                 examples,
                 predictions,
                 corpus,
                 outputFile,
                 classSet=None,
                 parse=None,
                 tokenization=None,
                 goldCorpus=None,
                 exampleStyle=None,
                 structureAnalyzer=None):
        """
        Store task 3 (speculation / negation) predictions as entity
        attributes of an interaction XML corpus.

        As noted by the original author, task 3 classification is assumed to
        be done with the SVMMulticlass classifier, used for two classes.

        The "t3type" of the first example selects the subtask ("multiclass",
        "speculation" or "negation"). The matching attributes are reset to
        "False" on every entity element and then overwritten, together with
        a confidence string, for the entities that have a prediction. When
        there are no examples the corpus is passed through unchanged.

        examples, predictions -- handed to self.loadExamples() before use
        corpus -- anything ETUtils.ETFromObj() accepts
        outputFile -- where to write the corpus; None skips writing
        classSet -- class name IdSet, or a string naming a file to load it from
        parse, tokenization, goldCorpus, exampleStyle, structureAnalyzer --
            not used by this method

        Returns the corpus tree obtained from ETUtils.ETFromObj().
        """
        print >> sys.stderr, "Adding task 3 to Interaction XML"
        examples, predictions = self.loadExamples(examples, predictions)

        # A string classSet is the name of a file holding the class names
        if type(classSet) == types.StringType:
            classSet = IdSet(filename=classSet)
        classIds = classSet.getIds() if classSet != None else None

        corpusTree = ETUtils.ETFromObj(corpus)
        corpusRoot = corpusTree.getroot()

        # The first example decides the subtask for the whole batch
        task3Type = None
        for firstExample in examples:
            assert firstExample[3].has_key("t3type")
            task3Type = firstExample[3]["t3type"]
            break
        if task3Type == None:
            # No examples at all: nothing to add to the corpus
            if outputFile != None:
                print >> sys.stderr, "Writing corpus to", outputFile
                ETUtils.write(corpusRoot, outputFile)
            return corpusTree
        assert task3Type in ["multiclass", "speculation", "negation"]

        # Clear any earlier task 3 result for the attributes of this subtask
        if task3Type == "multiclass":
            staleAttributes = ["speculation", "negation"]
        elif task3Type == "speculation":
            staleAttributes = ["speculation"]
        else:  # task3Type == "negation"
            staleAttributes = ["negation"]
        for entity in corpusRoot.getiterator("entity"):
            for attribute in staleAttributes:
                entity.set(attribute, "False")

        # Collect (flag, prediction) per entity id, separately per modifier
        specMap = {}
        negMap = {}
        for example, prediction in itertools.izip(examples, predictions):
            extra = example[3]
            assert extra["xtype"] == "task3"
            if extra["t3type"] == "multiclass":
                if isinstance(prediction, dict):
                    # One flag per class id; keep the names of the set ones
                    encoded = prediction["prediction"]
                    predictedModifiers = [
                        classSet.getName(index)
                        for index, flag in enumerate(encoded)
                        if flag == 1
                    ]
                else:
                    predictedClassName = classSet.getName(prediction[0])
                    if predictedClassName != "neg":
                        predictedModifiers = predictedClassName.split("---")
                    else:
                        predictedModifiers = ""
                if "negation" in predictedModifiers:
                    assert extra["entity"] not in negMap
                    negMap[extra["entity"]] = (True, prediction)
                if "speculation" in predictedModifiers:
                    assert extra["entity"] not in specMap
                    specMap[extra["entity"]] = (True, prediction)
            else:
                if extra["t3type"] == "speculation":
                    targetMap = specMap
                else:
                    targetMap = negMap
                # Any predicted class other than 1 marks the modifier as present
                isModified = bool(prediction[0] != 1)
                assert extra["entity"] not in targetMap
                targetMap[extra["entity"]] = (isModified, prediction)

        # Copy the collected results onto the entity elements
        for entity in corpusRoot.getiterator("entity"):
            eId = entity.get("id")
            if task3Type == "multiclass":
                # Both modifiers write "modConf"; for an entity with both,
                # the negation value is written last.
                if eId in specMap:
                    flag, strengthSource = specMap[eId]
                    entity.set("speculation", str(flag))
                    entity.set(
                        "modConf",
                        self.getPredictionStrengthString(
                            strengthSource, classSet, classIds))
                if eId in negMap:
                    flag, strengthSource = negMap[eId]
                    entity.set("negation", str(flag))
                    entity.set(
                        "modConf",
                        self.getPredictionStrengthString(
                            strengthSource, classSet, classIds))
            elif task3Type == "speculation":
                if eId in specMap:
                    flag, strengthSource = specMap[eId]
                    entity.set("speculation", str(flag))
                    entity.set(
                        "specConf",
                        self.getPredictionStrengthString(
                            strengthSource, classSet, classIds, [""]))
            elif task3Type == "negation":
                if eId in negMap:
                    flag, strengthSource = negMap[eId]
                    entity.set("negation", str(flag))
                    entity.set(
                        "negConf",
                        self.getPredictionStrengthString(
                            strengthSource, classSet, classIds,
                            ["", "speculation"]))

        # Write corpus
        if outputFile != None:
            print >> sys.stderr, "Writing corpus to", outputFile
            ETUtils.write(corpusRoot, outputFile)
        return corpusTree
示例#6
0
    
    # Re-map the class and feature ids of the "variant" examples onto the id
    # sets of the "invariant" data, then write the result to
    # realignedExamples.txt in the variant directory.
    defaultAnalysisFilename = "/usr/share/biotext/ComplexPPI/BioInferForComplexPPIVisible.xml"
    optparser = OptionParser(usage="%prog [options]\nCreate an html visualization for a corpus.")
    optparser.add_option("-i", "--invariant", default=None, dest="invariant", help="Corpus in analysis format", metavar="FILE")
    optparser.add_option("-v", "--variant", default=None, dest="variant", help="Corpus in analysis format", metavar="FILE")
    (options, args) = optparser.parse_args()
    # Both options are joined with file names below; without this check a
    # missing option only fails inside os.path.join with a traceback.
    if options.invariant is None or options.variant is None:
        optparser.error("both --invariant and --variant must be given")
    
    #invariantExamples = ExampleUtils.readExamples(os.path.join(options.invariant, "examples.txt"))
    variantExamples = ExampleUtils.readExamples(os.path.join(options.variant, "test-triggers.examples"))
    
    # Id sets that define the target numbering
    invariantFeatureSet = IdSet()
    invariantFeatureSet.load(os.path.join(options.invariant, "feature_names.txt"))
    invariantClassSet = IdSet()
    invariantClassSet.load(os.path.join(options.invariant, "class_names.txt"))

    # Id sets the variant examples were written with
    variantFeatureSet = IdSet()
    variantFeatureSet.load(os.path.join(options.variant, "test-triggers.examples.feature_names"))
    variantClassSet = IdSet()
    variantClassSet.load(os.path.join(options.variant, "test-triggers.examples.class_names"))
    
    counter = ProgressCounter(len(variantExamples))
    for example in variantExamples:
        counter.update()
        # Translate id -> name with the variant set, name -> id with the invariant set
        example[1] = invariantClassSet.getId(variantClassSet.getName(example[1]))
        newFeatures = {}
        for k,v in example[2].iteritems():
            newFeatures[ invariantFeatureSet.getId(variantFeatureSet.getName(k)) ] = v
        example[2] = newFeatures
        
    ExampleUtils.writeExamples(variantExamples, os.path.join(options.variant, "realignedExamples.txt"))
示例#7
0
    def writeXML(self, examples, predictions, corpus, outputFile, classSet=None, parse=None, tokenization=None, goldCorpus=None, exampleStyle=None):
        """
        Add task 3 (speculation / negation) predictions to the entity
        elements of an interaction XML corpus.

        As noted by the original author, task 3 classification is assumed to
        be done with the SVMMulticlass classifier, used for two classes.

        The subtask ("multiclass", "speculation" or "negation") is taken from
        the "t3type" of the first example. The attributes of that subtask are
        set to "False" on every entity and then replaced, along with a
        prediction strength string, where a prediction exists. Without any
        examples the corpus is only written out and returned.

        examples, predictions -- handed to self.loadExamples() before use
        corpus -- anything ETUtils.ETFromObj() accepts
        outputFile -- where to write the corpus; None skips writing
        classSet -- class name IdSet, or a string naming a file to load it from
        parse, tokenization, goldCorpus, exampleStyle -- not used by this method

        Returns the corpus tree obtained from ETUtils.ETFromObj().
        """
        print >> sys.stderr, "Adding task 3 to Interaction XML"
        examples, predictions = self.loadExamples(examples, predictions)

        # A string classSet names the file the class names are stored in
        if type(classSet) == types.StringType:
            classSet = IdSet(filename=classSet)
        classIds = classSet.getIds() if classSet != None else None

        corpusTree = ETUtils.ETFromObj(corpus)
        corpusRoot = corpusTree.getroot()

        # Only the first example is inspected to find the subtask
        task3Type = None
        for firstExample in examples:
            assert firstExample[3].has_key("t3type")
            task3Type = firstExample[3]["t3type"]
            break
        if task3Type == None:
            # Empty example set: leave the corpus as it is
            if outputFile != None:
                print >> sys.stderr, "Writing corpus to", outputFile
                ETUtils.write(corpusRoot, outputFile)
            return corpusTree
        assert task3Type in ["multiclass", "speculation", "negation"]

        # Reset the attributes this subtask is about to fill in
        if task3Type == "multiclass":
            staleAttributes = ["speculation", "negation"]
        elif task3Type == "speculation":
            staleAttributes = ["speculation"]
        else: # task3Type == "negation"
            staleAttributes = ["negation"]
        for entity in corpusRoot.getiterator("entity"):
            for attribute in staleAttributes:
                entity.set(attribute, "False")

        # entity id -> (modifier present?, prediction), one map per modifier
        specMap = {}
        negMap = {}
        for example, prediction in itertools.izip(examples, predictions):
            extra = example[3]
            assert extra["xtype"] == "task3"
            if extra["t3type"] == "multiclass":
                predictedClassName = classSet.getName(prediction[0])
                if predictedClassName == "neg":
                    # Nothing predicted for this entity
                    continue
                # A class name may combine several modifiers with "---"
                predictedModifiers = predictedClassName.split("---")
                if "negation" in predictedModifiers:
                    assert extra["entity"] not in negMap
                    negMap[extra["entity"]] = (True, prediction)
                if "speculation" in predictedModifiers:
                    assert extra["entity"] not in specMap
                    specMap[extra["entity"]] = (True, prediction)
            else:
                if extra["t3type"] == "speculation":
                    targetMap = specMap
                else:
                    targetMap = negMap
                # Every predicted class except 1 counts as "modifier present"
                isModified = bool(prediction[0] != 1)
                assert extra["entity"] not in targetMap
                targetMap[extra["entity"]] = (isModified, prediction)

        # Transfer the collected results to the XML
        for entity in corpusRoot.getiterator("entity"):
            eId = entity.get("id")
            if task3Type == "multiclass":
                # "modPred" is shared by both modifiers; the negation value
                # is written last when an entity has both.
                if eId in specMap:
                    flag, strengthSource = specMap[eId]
                    entity.set("speculation", str(flag))
                    entity.set("modPred", self.getPredictionStrengthString(strengthSource, classSet, classIds))
                if eId in negMap:
                    flag, strengthSource = negMap[eId]
                    entity.set("negation", str(flag))
                    entity.set("modPred", self.getPredictionStrengthString(strengthSource, classSet, classIds))
            elif task3Type == "speculation":
                if eId in specMap:
                    flag, strengthSource = specMap[eId]
                    entity.set("speculation", str(flag))
                    entity.set("specPred", self.getPredictionStrengthString(strengthSource, classSet, classIds, [""]))
            elif task3Type == "negation":
                if eId in negMap:
                    flag, strengthSource = negMap[eId]
                    entity.set("negation", str(flag))
                    entity.set("negPred", self.getPredictionStrengthString(strengthSource, classSet, classIds, ["","speculation"]))

        # Write corpus
        if outputFile != None:
            print >> sys.stderr, "Writing corpus to", outputFile
            ETUtils.write(corpusRoot, outputFile)
        return corpusTree
示例#8
0
def threshold(examples, predictionsDir=None, classSet=None):
    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    classIds = set()
    if type(examples) == types.StringType: # examples are in file
        examplesTemp = ExampleUtils.readExamples(examples, False)
        examples = []
        for example in examplesTemp:
            examples.append(example)
            classIds.add(example[1])
    classIds = list(classIds)
    classIds.sort()
    
    #multilabel = MultiLabelMultiClassEvaluator(None, None, classSet)
    #multilabel._calculate(examples, predictions)
    #print multilabel.toStringConcise(title="multilabel")
    
    bestThrF = [0]
    bestBaseF = [0]
    predFileNames = []
    for filename in os.listdir(predictionsDir):
        if "predictions" in filename:
            predFileNames.append( (int(filename.rsplit("_")[-1]), filename) )
    predFileNames.sort()
    for predFileName in predFileNames:
        predictionsTemp = ExampleUtils.loadPredictions(os.path.join(predictionsDir, predFileName[1]))
        predictions = []
        for prediction in predictionsTemp:
            predictions.append(prediction)
    
        baseEv = AveragingMultiClassEvaluator(None, None, classSet)
        baseEv._calculate(examples, predictions)
        print "============================"
        print predFileName[1]
        print "============================"
        #print baseEv.toStringConcise(title="baseline")
        
        baseLineF = baseEv.microF.fscore
        for step in [0]:
            for classId in [1]: #classIds:
                cls = None
                if classSet != None:
                    cls = classSet.getName(classId)
                else:
                    cls = str(classId)
                bestF = thresholdClass(examples, predictions, classId, baseLineF)
                for prediction in predictions:
                    prediction[classId] -= bestF[2][0] + 0.00000001
                changed = 0
                for prediction in predictions:
                    maxVal = -999999
                    maxClass = None
                    for i in range(1, len(prediction)):
                        if prediction[i] > maxVal:
                            maxVal = prediction[i]
                            maxClass = i
                    if maxClass != prediction[0]:
                        prediction[0] = maxClass
                        changed += 1
                print step, cls, "changed", changed, bestF[0]
                baseLineF = bestF[0]
        
        if bestF[0] > bestThrF[0]:
            bestThrF = (bestF[0], predFileName[1], bestF[1], bestF[2], bestF[3])
        if baseEv.microF.fscore > bestBaseF[0]:
            bestBaseF = (baseEv.microF.fscore, predFileName[1], baseEv.microF.toStringConcise())
    
        print "-------- Baseline ------------"
        print baseEv.toStringConcise()
        print "-------- Best ------------"
        print bestF[0], bestF[1], bestF[2]
        print bestF[3]
        thEv = AveragingMultiClassEvaluator(None, None, classSet)
        thEv._calculate(examples, predictions)
        print thEv.toStringConcise()
    
    print "=============== All Best ==============="
    print "Threshold", bestThrF
    print "Base", bestBaseF
    sys.exit()
    
    memPredictions = []
    bestEv = baseEv
    bestPair = [None, None, None]
    for p in predictions:
        memPredictions.append(p)
    for pair in pairs:
        modifier = pair[0] + 0.00000001
        changedClass = 0
        for pred in memPredictions:
            negPred = pred[1] - modifier  
            maxVal = negPred
            maxClass = 1
            for i in range(2, len(pred)):
                if pred[i] > maxVal:
                    maxVal = pred[i]
                    maxClass = i
            if pred[0] != maxClass:
                changedClass += 1
            pred[0] = maxClass
        ev = AveragingMultiClassEvaluator(None)
        ev._calculate(examples, memPredictions)
        print pair[0], pair[2], changedClass
        print ev.toStringConcise()
        if ev.compare(bestEv) == 1:
            print "Improved"
            bestPair = pair
            bestEv = ev
    
    print "---------------------------------------------"
    print baseEv.toStringConcise(title="baseline")
    print bestPair[0], bestPair[2] 
    print bestEv.toStringConcise(title="best") 
示例#9
0
def threshold(examples, predictionsDir=None, classSet=None):
    if type(classSet) == types.StringType:  # class names are in file
        classSet = IdSet(filename=classSet)
    classIds = set()
    if type(examples) == types.StringType:  # examples are in file
        examplesTemp = ExampleUtils.readExamples(examples, False)
        examples = []
        for example in examplesTemp:
            examples.append(example)
            classIds.add(example[1])
    classIds = list(classIds)
    classIds.sort()

    #multilabel = MultiLabelMultiClassEvaluator(None, None, classSet)
    #multilabel._calculate(examples, predictions)
    #print multilabel.toStringConcise(title="multilabel")

    bestThrF = [0]
    bestBaseF = [0]
    predFileNames = []
    for filename in os.listdir(predictionsDir):
        if "predictions" in filename:
            predFileNames.append((int(filename.rsplit("_")[-1]), filename))
    predFileNames.sort()
    for predFileName in predFileNames:
        predictionsTemp = ExampleUtils.loadPredictions(
            os.path.join(predictionsDir, predFileName[1]))
        predictions = []
        for prediction in predictionsTemp:
            predictions.append(prediction)

        baseEv = AveragingMultiClassEvaluator(None, None, classSet)
        baseEv._calculate(examples, predictions)
        print "============================"
        print predFileName[1]
        print "============================"
        #print baseEv.toStringConcise(title="baseline")

        baseLineF = baseEv.microF.fscore
        for step in [0]:
            for classId in [1]:  #classIds:
                cls = None
                if classSet != None:
                    cls = classSet.getName(classId)
                else:
                    cls = str(classId)
                bestF = thresholdClass(examples, predictions, classId,
                                       baseLineF)
                for prediction in predictions:
                    prediction[classId] -= bestF[2][0] + 0.00000001
                changed = 0
                for prediction in predictions:
                    maxVal = -999999
                    maxClass = None
                    for i in range(1, len(prediction)):
                        if prediction[i] > maxVal:
                            maxVal = prediction[i]
                            maxClass = i
                    if maxClass != prediction[0]:
                        prediction[0] = maxClass
                        changed += 1
                print step, cls, "changed", changed, bestF[0]
                baseLineF = bestF[0]

        if bestF[0] > bestThrF[0]:
            bestThrF = (bestF[0], predFileName[1], bestF[1], bestF[2],
                        bestF[3])
        if baseEv.microF.fscore > bestBaseF[0]:
            bestBaseF = (baseEv.microF.fscore, predFileName[1],
                         baseEv.microF.toStringConcise())

        print "-------- Baseline ------------"
        print baseEv.toStringConcise()
        print "-------- Best ------------"
        print bestF[0], bestF[1], bestF[2]
        print bestF[3]
        thEv = AveragingMultiClassEvaluator(None, None, classSet)
        thEv._calculate(examples, predictions)
        print thEv.toStringConcise()

    print "=============== All Best ==============="
    print "Threshold", bestThrF
    print "Base", bestBaseF
    sys.exit()

    memPredictions = []
    bestEv = baseEv
    bestPair = [None, None, None]
    for p in predictions:
        memPredictions.append(p)
    for pair in pairs:
        modifier = pair[0] + 0.00000001
        changedClass = 0
        for pred in memPredictions:
            negPred = pred[1] - modifier
            maxVal = negPred
            maxClass = 1
            for i in range(2, len(pred)):
                if pred[i] > maxVal:
                    maxVal = pred[i]
                    maxClass = i
            if pred[0] != maxClass:
                changedClass += 1
            pred[0] = maxClass
        ev = AveragingMultiClassEvaluator(None)
        ev._calculate(examples, memPredictions)
        print pair[0], pair[2], changedClass
        print ev.toStringConcise()
        if ev.compare(bestEv) == 1:
            print "Improved"
            bestPair = pair
            bestEv = ev

    print "---------------------------------------------"
    print baseEv.toStringConcise(title="baseline")
    print bestPair[0], bestPair[2]
    print bestEv.toStringConcise(title="best")
示例#10
0
    # Examples whose class and feature ids are to be renumbered
    variantExamplesPath = os.path.join(options.variant, "test-triggers.examples")
    variantExamples = ExampleUtils.readExamples(variantExamplesPath)

    # Target numbering: the id sets stored in the invariant directory
    invariantFeatureSet = IdSet()
    invariantFeatureSet.load(os.path.join(options.invariant, "feature_names.txt"))
    invariantClassSet = IdSet()
    invariantClassSet.load(os.path.join(options.invariant, "class_names.txt"))

    # Source numbering: the id sets written alongside the variant examples
    variantFeatureSet = IdSet()
    variantFeatureSet.load(os.path.join(options.variant, "test-triggers.examples.feature_names"))
    variantClassSet = IdSet()
    variantClassSet.load(os.path.join(options.variant, "test-triggers.examples.class_names"))

    counter = ProgressCounter(len(variantExamples))
    for example in variantExamples:
        counter.update()
        # id -> name through the variant set, name -> id through the invariant set
        className = variantClassSet.getName(example[1])
        example[1] = invariantClassSet.getId(className)
        example[2] = dict(
            (invariantFeatureSet.getId(variantFeatureSet.getName(featureId)), value)
            for featureId, value in example[2].iteritems())

    realignedPath = os.path.join(options.variant, "realignedExamples.txt")
    ExampleUtils.writeExamples(variantExamples, realignedPath)