示例#1
0
    def __init__(self,
                 style=["typed", "directed"],
                 length=None,
                 types=[],
                 featureSet=None,
                 classSet=None):
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        else:
            classSet = classSet
        assert (classSet.getId("neg") == 1)

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        if style.find(",") != -1:
            style = style.split(",")
        self.styles = style

        self.negFrac = None
        self.posPairGaz = POSPairGazetteer()
        for s in style:
            if s.find("negFrac") != -1:
                self.negFrac = float(s.split("_")[-1])
                print >> sys.stderr, "Downsampling negatives to", self.negFrac
                self.negRand = random.Random(15)
            elif s.find("posPairGaz") != -1:
                self.posPairGaz = POSPairGazetteer(
                    loadFrom=s.split("_", 1)[-1])

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        if "graph_kernel" in self.styles:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(
                self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        if "ontology" in self.styles:
            self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(
                self.featureSet)
        if "nodalida" in self.styles:
            self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(
                self.featureSet)
        #IF LOCAL
        if "bioinfer_limits" in self.styles:
            self.bioinferOntologies = OntologyUtils.getBioInferTempOntology()
            #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
        #ENDIF
        self.pathLengths = length
        assert (self.pathLengths == None)
        self.types = types
        if "random" in self.styles:
            from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
            self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
示例#2
0
    def __init__(self,
                 style=None,
                 classSet=None,
                 featureSet=None,
                 gazetteerFileName=None):
        global speculationWords

        if classSet == None:
            classSet = IdSet(1)
        assert (classSet.getId("neg") == 1)
        if featureSet == None:
            featureSet = IdSet()

        self.specWords, self.specWordStems = readWords(speculationWords)

        ExampleBuilder.__init__(self, classSet, featureSet)
        #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
        if gazetteerFileName != None:
            self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
            print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
        else:
            self.gazetteer = None
        self.styles = self.getParameters(style, {
            "classification": "multiclass",
            "speculation_words": True
        }, {"classification": ("multiclass", "speculation", "negation")})
    def __init__(self, style=None, length=None, types=[], featureSet=None, classSet=None):
        # reset style regardless of input
        #style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures"
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        else:
            classSet = classSet
        assert( classSet.getId("neg") == 1 )
        
        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        
        defaultNone = ["binary", "trigger_features","typed","directed","no_linear","entities","genia_limits",
            "noAnnType", "noMasking", "maxFeatures", "no_merge", "disable_entity_features", 
            "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features"]
        defaultParameters = {}
        for name in defaultNone:
            defaultParameters[name] = None
        defaultParameters["keep_intersentence"] = False
        defaultParameters["keep_intersentence_gold"] = True
        self.styles = self._setDefaultParameters(defaultParameters)
        self.styles = self.getParameters(style)
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"]
        self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles["noMasking"]
        self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"]
        #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        self.pathLengths = length
        assert(self.pathLengths == None)
        self.types = types

        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder.useNonNameEntities = True
示例#4
0
    def __init__(self,
                 style=["typed", "directed", "headsOnly"],
                 length=None,
                 types=[],
                 featureSet=None,
                 classSet=None):
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        else:
            classSet = classSet
        assert (classSet.getId("neg") == 1)

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        self.styles = style

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True
        #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        #if "ontology" in self.styles:
        #    self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
        self.pathLengths = length
        assert (self.pathLengths == None)
        self.types = types
    def __init__(self,
                 style=None,
                 classSet=None,
                 featureSet=None,
                 gazetteerFileName=None,
                 skiplist=None):
        if classSet == None:
            classSet = IdSet(1)
        if featureSet == None:
            featureSet = IdSet()

        ExampleBuilder.__init__(self, classSet, featureSet)
        assert (classSet.getId("neg") == 1)
        #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
        if gazetteerFileName != None:
            self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
            print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer = None
        self._setDefaultParameters([
            "rel_features", "wordnet", "bb_features", "giuliano",
            "epi_merge_negated", "limit_merged_types", "genia_task1",
            "build_for_nameless", "pos_only", "all_tokens", "names",
            "pos_pairs", "linear_ngrams", "phospho"
        ])
        self.styles = self.getParameters(style)
        #        if "selftrain_group" in self.styles:
        #            self.selfTrainGroups = set()
        #            if "selftrain_group-1" in self.styles:
        #                self.selfTrainGroups.add("-1")
        #            if "selftrain_group0" in self.styles:
        #                self.selfTrainGroups.add("0")
        #            if "selftrain_group1" in self.styles:
        #                self.selfTrainGroups.add("1")
        #            if "selftrain_group2" in self.styles:
        #                self.selfTrainGroups.add("2")
        #            if "selftrain_group3" in self.styles:
        #                self.selfTrainGroups.add("3")
        #            print >> sys.stderr, "Self-train-groups:", self.selfTrainGroups

        self.skiplist = set()
        if skiplist != None:
            f = open(skiplist, "rt")
            for line in f.readlines():
                self.skiplist.add(line.strip())
            f.close()

        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(featureSet)
        if self.styles["wordnet"]:
            self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
        if self.styles["bb_features"]:
            self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens(
            )
            #self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens(PhraseTriggerExampleBuilder.getBacteriaNames())
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
示例#6
0
def run(EvaluatorClass,
        inputCorpusFile,
        goldCorpusFile,
        parse,
        tokenization=None,
        target="both",
        entityMatchFunction=compareEntitiesSimple,
        removeIntersentenceInteractions=False,
        errorMatrix=False,
        verbose=False):
    print >> sys.stderr, "##### EvaluateInteractionXML #####"
    print >> sys.stderr, "Comparing input", inputCorpusFile, "to gold", goldCorpusFile
    # Class sets are used to convert the types to ids that the evaluator can use
    classSets = {}
    if EvaluatorClass.type == "binary":
        classSets["entity"] = IdSet(idDict={
            "True": 1,
            "False": -1
        },
                                    locked=True)
        classSets["interaction"] = IdSet(idDict={
            "True": 1,
            "False": -1
        },
                                         locked=True)
        negativeClassId = -1
    elif EvaluatorClass.type == "multiclass":
        classSets["entity"] = IdSet(idDict={"neg": 1}, locked=False)
        classSets["interaction"] = IdSet(idDict={"neg": 1}, locked=False)
        negativeClassId = 1
    else:
        sys.exit("Unknown evaluator type")

    # Load corpus and make sentence graphs
    goldCorpusElements = None
    if goldCorpusFile != None:
        goldCorpusElements = SentenceGraph.loadCorpus(
            goldCorpusFile, parse, tokenization, False,
            removeIntersentenceInteractions)
    predictedCorpusElements = SentenceGraph.loadCorpus(
        inputCorpusFile, parse, tokenization, False,
        removeIntersentenceInteractions)

    # Compare the corpora and print results on screen
    return processCorpora(EvaluatorClass,
                          predictedCorpusElements,
                          goldCorpusElements,
                          target,
                          classSets,
                          negativeClassId,
                          entityMatchFunction,
                          errorMatrix=errorMatrix,
                          verbose=verbose)
示例#7
0
 def __init__(self, style=None, classSet=None, featureSet=None):
     if classSet == None:
         classSet = IdSet(1)
     assert( classSet.getId("neg") == 1 )
     if featureSet == None:
         featureSet = IdSet()
     
     ExampleBuilder.__init__(self, classSet, featureSet)
     self.styles = style
     self.timerBuildExamples = Timer(False)
     self.timerCrawl = Timer(False)
     self.timerCrawlPrecalc = Timer(False)
     self.timerMatrix = Timer(False)
     self.timerMatrixPrecalc = Timer(False)
    def __init__(self,
                 style=None,
                 classSet=None,
                 featureSet=None,
                 gazetteerFileName=None):
        if classSet == None:
            classSet = IdSet(1)
        assert (classSet.getId("neg") == 1)
        if featureSet == None:
            featureSet = IdSet()
        ExampleBuilder.__init__(self, classSet, featureSet)

        self.styles = style
        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder.useNonNameEntities = False
示例#9
0
 def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
     if classSet == None:
         classSet = IdSet(1)
     assert( classSet.getId("neg") == 1 )
     if featureSet == None:
         featureSet = IdSet()
     
     ExampleBuilder.__init__(self, classSet, featureSet)
     if gazetteerFileName!=None:
         self.gazetteer=Gazetteer.loadGztr(gazetteerFileName)
         print >> sys.stderr, "Loaded gazetteer from",gazetteerFileName
     else:
         print >> sys.stderr, "No gazetteer loaded"
         self.gazetteer=None
     self.styles = style
示例#10
0
    def __init__(self, examples=None, predictions=None, classSet=None):
        if type(classSet) == types.StringType:  # class names are in file
            classSet = IdSet(filename=classSet)
        if type(predictions) == types.StringType:  # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType:  # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        self.keep = set(["CPR:3", "CPR:4", "CPR:5", "CPR:6", "CPR:9"])

        self.classSet = classSet
        self.results = None
        self.internal = None
        if predictions != None:
            for example in examples:
                if example[3] != None:
                    print >> sys.stderr, "ChemProt Evaluator:"
                    self._calculateExamples(examples, predictions)
                else:
                    print >> sys.stderr, "No example extra info, skipping ChemProt evaluation"
                break
            self.internal = AveragingMultiClassEvaluator(
                examples, predictions, classSet)
            print >> sys.stderr, "AveragingMultiClassEvaluator:"
            print >> sys.stderr, self.internal.toStringConcise()
示例#11
0
 def __init__(self, style):
     ExampleBuilder.__init__(self)
     self.featureBuilder = EdgeFeatureBuilder(self.featureSet)
     self.style = style
     if not "binary" in style:
         self.classSet = IdSet(1)
         assert (self.classSet.getId("neg") == 1)
示例#12
0
    def __init__(self, examples, predictions=None, classSet=None):
        if type(classSet) == types.StringType:  # class names are in file
            classSet = IdSet(filename=classSet)
        if type(predictions) == types.StringType:  # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType:  # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        self.classSet = classSet
        # define class ids in alphabetical order
        self.classSet = classSet
        if classSet != None:
            classNames = sorted(classSet.Ids.keys())
        else:
            classNames = []
        # make an ordered list of class ids
        self.classes = []
        for className in classNames:
            self.classes.append(classSet.getId(className))
        # create data structures for per-class evaluation
        self.dataByClass = {}
        for cls in self.classes:
            self.dataByClass[cls] = EvaluationData()
        # hack for unnamed classes
        if len(self.dataByClass) == 0:
            self.dataByClass[1] = EvaluationData()
            self.dataByClass[2] = EvaluationData()

        #self.untypedUndirected = None
        self.untypedCurrentMajorId = None
        self.untypedPredictionQueue = []
        self.untypedUndirected = EvaluationData()
        #self.AUC = None
        if predictions != None:
            self._calculate(examples, predictions)
示例#13
0
    def __init__(self, examples, predictions=None, classSet=None):
        if type(classSet) == types.StringType:  # class names are in file
            classSet = IdSet(filename=classSet)
        if type(predictions) == types.StringType:  # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType:  # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        SharedTaskEvaluator.corpusElements = Core.SentenceGraph.loadCorpus(
            SharedTaskEvaluator.corpusFilename, SharedTaskEvaluator.parse,
            SharedTaskEvaluator.tokenization)
        # Build interaction xml
        xml = BioTextExampleWriter.write(
            examples, predictions, SharedTaskEvaluator.corpusElements, None,
            SharedTaskEvaluator.ids + ".class_names",
            SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
        #xml = ExampleUtils.writeToInteractionXML(examples, predictions, SharedTaskEvaluator.corpusElements, None, "genia-direct-event-ids.class_names", SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
        # Convert to GENIA format
        gifxmlToGenia(xml,
                      SharedTaskEvaluator.geniaDir,
                      task=SharedTaskEvaluator.task,
                      verbose=False)
        # Use GENIA evaluation tool
        self.results = evaluateSharedTask(SharedTaskEvaluator.geniaDir,
                                          task=SharedTaskEvaluator.task,
                                          evaluations=["approximate"],
                                          verbose=False)
示例#14
0
    def __init__(self,
                 style=None,
                 classSet=None,
                 featureSet=None,
                 gazetteerFileName=None,
                 skiplist=None):
        if classSet == None:
            classSet = IdSet(1)
        assert (classSet.getId("neg") == 1)
        if featureSet == None:
            featureSet = IdSet()

        ExampleBuilder.__init__(self, classSet, featureSet)
        #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
        if gazetteerFileName != None:
            self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
            print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer = None
        self.styles = style

        self.skiplist = set()
        if skiplist != None:
            f = open(skiplist, "rt")
            for line in f.readlines():
                self.skiplist.add(line.strip())
            f.close()

        self.styles = [
            "trigger_features", "typed", "directed", "no_linear", "entities",
            "genia_limits", "noMasking", "maxFeatures"
        ]
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if "graph_kernel" in self.styles:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(
                self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True

        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
示例#15
0
def readARFF(filename):
    featureSet = IdSet(1)
    classSet = IdSet(0)
    f = open(filename,"rt")
    inData = False
    lines = f.readlines()
    counter = ProgressCounter(len(lines),"ARFFLine")
    examples = []
    for line in lines:
        counter.update(string="Processing line " + str(counter.current + 1) + ": ")
        line = line.strip()
        if len(line) == 0 or line[0] == "%":
            continue
        elif line[0] == "@":
            #print line
            category = line.split()[0].lower()
            if category == "@attribute":
                category, name, type = line.split()
                assert(not inData)
                if name.lower() == "class":
                    name = name.lower()
                    classNames = type[1:-1].split(",")
                    assert(len(classNames)==2)
                    classSet.defineId(classNames[0].strip(),1)
                    classSet.defineId(classNames[1].strip(),-1)
                featureSet.getId(name)
            elif category.lower() == "@relation":
                assert(not inData)
            elif category == "@data":
                inData = True
        else:
            assert(inData)
            count = 1
            features = {}
            for column in line.split(","):
                if featureSet.getName(count) != "class":
                    features[count] = float(column)
                else:
                    classId = classSet.getId(column, False)
                    assert(classId != None)
                count += 1
            exampleCount = str(len(examples))
            exampleId = "BreastCancer.d" + exampleCount + ".s0.x0"
            examples.append([exampleId,classId,features,{}])
                    
    return examples
    def __init__(self,
                 style=None,
                 classSet=None,
                 featureSet=None,
                 gazetteerFileName=None):
        if classSet == None:
            classSet = IdSet(1)
        assert (classSet.getId("neg") == 1)
        if featureSet == None:
            featureSet = IdSet()
        ExampleBuilder.__init__(self, classSet, featureSet)

        self._setDefaultParameters(["co_limits"])
        self.styles = self.getParameters(style)
        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder.useNonNameEntities = False
        self.phraseTypeCounts = {}
示例#17
0
 def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
     if classSet == None:
         classSet = IdSet(1)
     assert( classSet.getId("neg") == 1 )
     if featureSet == None:
         featureSet = IdSet()
     
     ExampleBuilder.__init__(self, classSet, featureSet)
     #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
     if gazetteerFileName!=None:
         self.gazetteer=Gazetteer.loadGztr(gazetteerFileName)
         print >> sys.stderr, "Loaded gazetteer from",gazetteerFileName
     else:
         print >> sys.stderr, "No gazetteer loaded"
         self.gazetteer=None
     self.styles = style
     
     self.excludedPOS = ["","(",")",",",".","CC","EX","FW","LS","MD","PDT","POS","PRP","PRP$","RBR","RBS","RP","WDT","WP","WP$","``"]
示例#18
0
 def getIdSets(self, classIds=None, featureIds=None, allowNewIds=True):
     # Class ids
     #print classIds
     #print featureIds
     if classIds != None and os.path.exists(classIds):
         print >> sys.stderr, "Using predefined class names from", classIds
         classSet = IdSet(allowNewIds=allowNewIds)
         classSet.load(classIds)
     else:
         print >> sys.stderr, "No predefined class names"
         classSet = None
     # Feature ids
     if featureIds != None and os.path.exists(featureIds):
         print >> sys.stderr, "Using predefined feature names from", featureIds
         featureSet = IdSet(allowNewIds=allowNewIds)
         featureSet.load(featureIds)
     else:
         print >> sys.stderr, "No predefined feature names"
         featureSet = None
     return classSet, featureSet
示例#19
0
    def __init__(self,
                 style=None,
                 classSet=None,
                 featureSet=None,
                 gazetteerFileName=None,
                 skiplist=None):
        if classSet == None:
            classSet = IdSet(0)
        if featureSet == None:
            featureSet = IdSet(0)

        ExampleBuilder.__init__(self, classSet, featureSet)

        self.featureIds = self.featureSet
        self.labelIds = self.classSet

        self._setDefaultParameters([
            "directed", "undirected", "cutoff", "annotated_only",
            "all_positive", "wv", "epochs", "html", "autoencode", "lr",
            "patience"
        ])
        self.styles = self.getParameters(style)
        if self.styles["cutoff"]:
            self.styles["cutoff"] = int(self.styles["cutoff"])

        self.wvIndices = None
        self.embeddingMatrices = None
        if self.styles.get("wv") != None:
            indexPath = self.styles.get("wv") + "-indices.json.gz"
            if not os.path.exists(indexPath):
                indexPath = os.path.join(Settings.DATAPATH, "wv", indexPath)
            print >> sys.stderr, "Loading word vector indices from", indexPath
            with gzip.open(indexPath, "rt") as f:
                self.wvIndices = json.load(f)["indices"]
            self.embeddingMatrices = []

        self.dimMatrix = 32
        self.rangeMatrix = range(self.dimMatrix)
        self.featureMatrices = []
        self.labelMatrices = []
        self.tokenLists = []
示例#20
0
def addExamples(exampleFile, predictionFile, classFile, matrix):
    classSet = IdSet(filename=classFile)
    f = open(predictionFile, "rt")
    for example in ExampleUtils.readExamples(exampleFile, False):
        pred = int(f.readline().split()[0])
        predClasses = classSet.getName(pred)
        goldClasses = classSet.getName(example[1])
        for predClass in predClasses.split("---"):
            for goldClass in goldClasses.split("---"):
                matrix[predClass][goldClass]
                matrix[goldClass][predClass] += 1
    f.close()
 def __init__(self, style=["typed","directed","headsOnly"], featureSet=None, classSet=None):
     if featureSet == None:
         featureSet = IdSet()
     if classSet == None:
         classSet = IdSet(1)
     else:
         classSet = classSet
     assert( classSet.getId("neg") == 1 )
     
     ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
     self.styles = style
     
     self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
     #if "noAnnType" in self.styles:
     self.multiEdgeFeatureBuilder.noAnnType = True
     #if "noMasking" in self.styles:
     self.multiEdgeFeatureBuilder.maskNamedEntities = False
     #if "maxFeatures" in self.styles:
     self.multiEdgeFeatureBuilder.maximum = True
     self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
     self.triggerFeatureBuilder.useNonNameEntities = False
示例#22
0
    def __init__(self,
                 style=["typed", "directed", "headsOnly"],
                 length=None,
                 types=[],
                 featureSet=None,
                 classSet=None):
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        else:
            classSet = classSet
        assert (classSet.getId("neg") == 1)

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        self.styles = style

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)

        self.counts = {}
        self.countsPerType = {}
        self.untypedCounts = {}
        self.tokenCounts = {}
示例#23
0
    def __init__(self, classSet=None, featureSet=None):
        if (type(classSet) == types.StringType):
            self.classSet = IdSet(filename=classSet)
        else:
            self.classSet = classSet

        if (type(featureSet) == types.StringType):
            self.featureSet = IdSet(filename=featureSet)
        else:
            self.featureSet = featureSet

        self.featureTag = ""
        self.exampleStats = ExampleStats()
        self.parse = None
        self.tokenization = None
        #self.idFileTag = None
        self.classIdFilename = None
        self.featureIdFilename = None

        self.styles = {}
        self._defaultParameters = None
        self._parameterValueLimits = None
        self._setDefaultParameters(["sentenceLimit"])
        self.debug = False
示例#24
0
    def __init__(self, examples, predictions=None, classSet=None):
        if type(classSet) == types.StringType:  # class names are in file
            classSet = IdSet(filename=classSet)
        if type(predictions) == types.StringType:  # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if type(examples) == types.StringType:  # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        self.classSet = classSet
        self.dataByClass = defaultdict(EvaluationData)

        #self.untypedUndirected = None
        self.untypedCurrentMajorId = None
        self.untypedPredictionQueue = []
        self.untypedUndirected = EvaluationData()
        #self.AUC = None
        if predictions != None:
            self._calculate(examples, predictions)
示例#25
0
 def __init__(self, examples=None, predictions=None, classSet=None):
     if type(classSet) == types.StringType:  # class names are in file
         classSet = IdSet(filename=classSet)
     if type(predictions) == types.StringType:  # predictions are in file
         predictions = ExampleUtils.loadPredictions(predictions)
     if type(examples) == types.StringType:  # examples are in file
         examples = ExampleUtils.readExamples(examples, False)
     #self.examples = examples
     #self.predictions = predictions
     self.truePositives = 0
     self.falsePositives = 0
     self.trueNegatives = 0
     self.falseNegatives = 0
     self.precision = None
     self.recall = None
     self.fScore = None
     self.AUC = None
     self.type = "binary"
     if predictions != None:
         self._calculate(examples, predictions)
示例#26
0
 def __init__(self, examples, predictions=None, classSet=None):
     if type(classSet) == types.StringType: # class names are in file
         classSet = IdSet(filename=classSet)
     if type(predictions) == types.StringType: # predictions are in file
         predictions = ExampleUtils.loadPredictions(predictions)
     if type(examples) == types.StringType: # examples are in file
         examples = ExampleUtils.readExamples(examples, False)
     
     corpusElements = Core.SentenceGraph.loadCorpus(BXEvaluator.corpusFilename, BXEvaluator.parse, BXEvaluator.tokenization)
     # Build interaction xml
     xml = BioTextExampleWriter.write(examples, predictions, corpusElements, None, BXEvaluator.ids+".class_names", BXEvaluator.parse, BXEvaluator.tokenization)
     xml = ix.splitMergedElements(xml, None)
     xml = ix.recalculateIds(xml, None, True)
     #xml = ExampleUtils.writeToInteractionXML(examples, predictions, SharedTaskEvaluator.corpusElements, None, "genia-direct-event-ids.class_names", SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
     # Convert to GENIA format
     STFormat.ConvertXML.toSTFormat(xml, BXEvaluator.geniaDir, outputTag="a2")
     #gifxmlToGenia(xml, BXEvaluator.geniaDir, task=SharedTaskEvaluator.task, verbose=False)
     # Use GENIA evaluation tool
     self.results = BioNLP11GeniaTools.evaluateBX(BXEvaluator.geniaDir, corpusName=BXEvaluator.corpusTag)
     corpusElements = None
示例#27
0
def getClassSet(rows, classSet=None):
    from Core.IdSet import IdSet
    classNames = set()
    for row in rows:
        classNames.add(row["class"])
        classNames.add(row["prediction"])
    
    # In the case of multiclass, give integer id:s for the classes
    if classSet == None:
        classSet = IdSet()
        assert(not ("1" in classNames and "neg" in classNames))
        assert("1" in classNames or "neg" in classNames)
        if "1" in classNames:
            classSet.defineId("1",1)
        else:
            classSet.defineId("neg",1)
    for i in sorted(list(classNames)):
        if i != "1" and i != "neg":
            classSet.getId(i)
    return classSet
示例#28
0
    def writeXML(self,
                 examples,
                 predictions,
                 corpus,
                 outputFile,
                 classSet=None,
                 parse=None,
                 tokenization=None,
                 goldCorpus=None,
                 exampleStyle=None,
                 structureAnalyzer=None):
        """
        Writes task 3 examples to interaction XML. Assumes task 3 classification
        is done with SVMMulticlass Classifier, used for two classes.
        """
        print >> sys.stderr, "Adding task 3 to Interaction XML"
        examples, predictions = self.loadExamples(examples, predictions)

        if type(classSet) == types.StringType:  # class names are in file
            classSet = IdSet(filename=classSet)
        classIds = None
        if classSet != None:
            classIds = classSet.getIds()

        corpusTree = ETUtils.ETFromObj(corpus)
        corpusRoot = corpusTree.getroot()

        # Determine subtask
        task3Type = None
        for example in examples:
            assert example[3].has_key("t3type")
            task3Type = example[3]["t3type"]
            break
        if task3Type == None:
            if outputFile != None:
                print >> sys.stderr, "Writing corpus to", outputFile
                ETUtils.write(corpusRoot, outputFile)
            return corpusTree
        assert task3Type in ["multiclass", "speculation", "negation"]

        # Remove the task 3 subtask information if it already exists
        for entity in corpusRoot.getiterator("entity"):
            if task3Type == "multiclass":
                entity.set("speculation", "False")
                entity.set("negation", "False")
            elif task3Type == "speculation":
                entity.set("speculation", "False")
            else:  # task3Type == "negation"
                entity.set("negation", "False")

        specMap = {}
        negMap = {}
        for example, prediction in itertools.izip(examples, predictions):
            assert example[3]["xtype"] == "task3"
            if example[3]["t3type"] == "multiclass":
                if isinstance(prediction, dict):
                    encoded = prediction["prediction"]
                    predictedModifiers = [
                        classSet.getName(i) for i in range(len(encoded))
                        if encoded[i] == 1
                    ]
                else:
                    predictedClassName = classSet.getName(prediction[0])
                    predictedModifiers = ""
                    if predictedClassName != "neg":
                        predictedModifiers = predictedClassName.split("---")
                if "negation" in predictedModifiers:
                    assert not negMap.has_key(example[3]["entity"])
                    negMap[example[3]["entity"]] = (True, prediction)
                if "speculation" in predictedModifiers:
                    assert not specMap.has_key(example[3]["entity"])
                    specMap[example[3]["entity"]] = (True, prediction)
            else:
                if example[3]["t3type"] == "speculation":
                    map = specMap
                else:
                    map = negMap
                if prediction[0] != 1:
                    assert not map.has_key(example[3]["entity"])
                    map[example[3]["entity"]] = (True, prediction)
                else:
                    assert not map.has_key(example[3]["entity"])
                    map[example[3]["entity"]] = (False, prediction)

        for entity in corpusRoot.getiterator("entity"):
            eId = entity.get("id")
            if task3Type == "multiclass":
                if specMap.has_key(eId):
                    entity.set("speculation", str(specMap[eId][0]))
                    entity.set(
                        "modConf",
                        self.getPredictionStrengthString(
                            specMap[eId][1], classSet, classIds))
                if negMap.has_key(eId):
                    entity.set("negation", str(negMap[eId][0]))
                    entity.set(
                        "modConf",
                        self.getPredictionStrengthString(
                            negMap[eId][1], classSet, classIds))
            else:
                if task3Type == "speculation":
                    if specMap.has_key(eId):
                        entity.set("speculation", str(specMap[eId][0]))
                        entity.set(
                            "specConf",
                            self.getPredictionStrengthString(
                                specMap[eId][1], classSet, classIds, [""]))
                elif task3Type == "negation":
                    if negMap.has_key(eId):
                        entity.set("negation", str(negMap[eId][0]))
                        entity.set(
                            "negConf",
                            self.getPredictionStrengthString(
                                negMap[eId][1], classSet, classIds,
                                ["", "speculation"]))

        # Write corpus
        if outputFile != None:
            print >> sys.stderr, "Writing corpus to", outputFile
            ETUtils.write(corpusRoot, outputFile)
        return corpusTree
示例#29
0
 def __init__(self, style=None, types=[], featureSet=None, classSet=None):
     if featureSet == None:
         featureSet = IdSet()
     if classSet == None:
         classSet = IdSet(1)
     else:
         classSet = classSet
     
     ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
     assert( classSet.getId("neg") == 1 or (len(classSet.Ids)== 2 and classSet.getId("neg") == -1) )
     
     # Basic style = trigger_features:typed:directed:no_linear:entities:auto_limits:noMasking:maxFeatures
     self._setDefaultParameters([
         "directed", "undirected", "headsOnly", "graph_kernel", "noAnnType", "mask_nodes", "limit_features",
         "no_auto_limits", "co_features", "genia_features", "bi_features", #"genia_limits", "epi_limits", "id_limits", "rel_limits", "bb_limits", "bi_limits", "co_limits",
         "genia_task1", "ontology", "nodalida", "bacteria_renaming", "no_trigger_features", "rel_features",
         "drugbank_features", "ddi_mtmx", "evex", "giuliano", "random", "themeOnly", "causeOnly", "no_path", "token_nodes", 
         "skip_extra_triggers", "headsOnly", "graph_kernel", "no_task", "no_dependency", 
         "disable_entity_features", "disable_terminus_features", "disable_single_element_features", 
         "disable_ngram_features", "disable_path_edge_features", "linear_features", "subset", "binary", "pos_only",
         "entity_type", "filter_shortest_path", "maskTypeAsProtein", "keep_neg", "metamap", 
         "sdb_merge", "sdb_features", "ontobiotope_features", "no_self_loops", "full_entities",
         "no_features", "wordnet", "wordvector", "se10t8_undirected", "filter_types", "doc_extra",
         "entity_extra"])
     self.styles = self.getParameters(style)
     #if style == None: # no parameters given
     #    style["typed"] = style["directed"] = style["headsOnly"] = True
     
     self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet, self.styles)
     # NOTE Temporarily re-enabling predicted range
     #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None)
     if self.styles["graph_kernel"]:
         from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
         self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
     if self.styles["noAnnType"]:
         self.multiEdgeFeatureBuilder.noAnnType = True
     if self.styles["mask_nodes"]:
         self.multiEdgeFeatureBuilder.maskNamedEntities = True
     else:
         self.multiEdgeFeatureBuilder.maskNamedEntities = False
     if not self.styles["limit_features"]:
         self.multiEdgeFeatureBuilder.maximum = True
     if self.styles["genia_task1"]:
         self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity")
     self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
     if self.styles["ontology"]:
         self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
     if self.styles["ontobiotope_features"]:
         self.ontobiotopeFeatureBuilder = OntoBiotopeFeatureBuilder(self.featureSet)
     if self.styles["nodalida"]:
         self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
     if self.styles["bacteria_renaming"]:
         self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(self.featureSet)
     if not self.styles["no_trigger_features"]:
         self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet, self.styles)
         self.triggerFeatureBuilder.useNonNameEntities = True
         if self.styles["noAnnType"]:
             self.triggerFeatureBuilder.noAnnType = True
         if self.styles["genia_task1"]:
             self.triggerFeatureBuilder.filterAnnTypes.add("Entity")
         #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
     if self.styles["rel_features"]:
         self.relFeatureBuilder = RELFeatureBuilder(featureSet)
     if self.styles["drugbank_features"]:
         self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
     if self.styles["evex"]:
         self.evexFeatureBuilder = EVEXFeatureBuilder(featureSet)
     if self.styles["wordnet"]:
         self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
     if self.styles["wordvector"]:
         self.wordVectorFeatureBuilder = WordVectorFeatureBuilder(featureSet, self.styles)
     if self.styles["giuliano"]:
         self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
     self.types = types
     if self.styles["random"]:
         from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
         self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
 def __init__(self):
     ExampleBuilder.__init__(self)
     self.classSet = IdSet(1)
     assert (self.classSet.getId("neg") == 1)