Exemplo n.º 1
0
    def __init__(self,
                 style=None,
                 classSet=None,
                 featureSet=None,
                 gazetteerFileName=None):
        """Set up id sets, speculation word lists, the gazetteer and styles.

        Parameters:
            style: style definition handed to self.getParameters().
            classSet: id set for class names; a new IdSet(1) is created when
                omitted. "neg" must resolve to id 1.
            featureSet: id set for feature names; a new IdSet() is created
                when omitted.
            gazetteerFileName: optional path passed to Gazetteer.loadGztr().

        Raises:
            AssertionError: if classSet.getId("neg") != 1.
        """
        # Identity checks against None: "== None" would be routed through any
        # user-defined __eq__ on the argument.
        if classSet is None:
            classSet = IdSet(1)
        assert (classSet.getId("neg") == 1)
        if featureSet is None:
            featureSet = IdSet()

        # speculationWords is a module-level name that is only read here, so
        # no "global" declaration is needed.
        self.specWords, self.specWordStems = readWords(speculationWords)

        ExampleBuilder.__init__(self, classSet, featureSet)
        if gazetteerFileName is not None:
            self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
            print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
        else:
            self.gazetteer = None

        # Defaults for each style, followed by the allowed values for
        # "classification".
        styleDefaults = {
            "classification": "multiclass",
            "speculation_words": True
        }
        styleChoices = {
            "classification": ("multiclass", "speculation", "negation")
        }
        self.styles = self.getParameters(style, styleDefaults, styleChoices)
Exemplo n.º 2
0
 def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None, gazetteer=None, pathGazetteer=None, negFrac=None):
     """Initialise id sets, gazetteers, feature builders and bookkeeping tables.

     Parameters:
         style: container of style names; defaults to
             ["typed", "directed", "headsOnly"].
         length: stored as self.pathLengths; must be None.
         types: stored as self.types; defaults to an empty list.
         featureSet: id set for feature names; a new IdSet() when omitted.
         classSet: id set for class names; a new IdSet(1) when omitted.
             "neg" must resolve to id 1.
         gazetteer: optional path passed to Gazetteer.loadGztr().
         pathGazetteer: optional path passed to PathGazetteer.load().
         negFrac: stored as self.negFrac and reported on stderr.

     Raises:
         AssertionError: if classSet.getId("neg") != 1 or length is not None.
     """
     # The defaults used to be list literals in the signature. Those objects
     # are created once and were stored on every instance, so a mutation
     # through one instance leaked into all others. Build fresh lists here.
     if style is None:
         style = ["typed", "directed", "headsOnly"]
     if types is None:
         types = []

     if featureSet is None:
         featureSet = IdSet()
     if classSet is None:
         classSet = IdSet(1)
     assert( classSet.getId("neg") == 1 )

     if gazetteer is not None:
         print >> sys.stderr, "Loading gazetteer from", gazetteer
         self.gazetteer=Gazetteer.loadGztr(gazetteer)
     else:
         print >> sys.stderr, "No gazetteer loaded"
         self.gazetteer=None

     self.pathGazetteer=None
     self.pathGazetteerDependencies = None
     self.pathGazetteerPairs = None
     if pathGazetteer is not None:
         print >> sys.stderr, "Loading path gazetteer from", pathGazetteer
         self.pathGazetteer=PathGazetteer.load(pathGazetteer)
         self.pathGazetteerDependencies = PathGazetteer.getDependencies(self.pathGazetteer)
         self.pathGazetteerPairs = PathGazetteer.getPairs(self.pathGazetteer)
     else:
         print >> sys.stderr, "No path gazetteer loaded"

     ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
     self.styles = style
     self.negFrac = negFrac
     print >> sys.stderr, "Downsampling negatives to", negFrac
     self.negRand = random.Random()

     self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
     # noAnnType is switched on unconditionally; the "noAnnType" style name
     # is not consulted.
     self.multiEdgeFeatureBuilder.noAnnType = True
     if "noMasking" in self.styles:
         self.multiEdgeFeatureBuilder.maskNamedEntities = False
     if "maxFeatures" in self.styles:
         self.multiEdgeFeatureBuilder.maximum = True

     self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
     self.pathLengths = length
     assert self.pathLengths is None
     self.types = types

     # Event bookkeeping tables, all starting empty.
     self.eventsByOrigId = {}
     self.headTokensByOrigId = {}
     self.interSentenceEvents = set()

     # Per-type counters/collections of built and skipped examples.
     self.examplesByEventOrigId = {}
     self.skippedByType = {}
     self.skippedByTypeAndReason = {}
     self.builtByType = {}

     self.gazMatchCache = {}
Exemplo n.º 3
0
    def __init__(self,
                 style=None,
                 classSet=None,
                 featureSet=None,
                 gazetteerFileName=None,
                 skiplist=None):
        """Set up id sets, gazetteer, styles, skiplist and optional feature builders.

        Parameters:
            style: style definition handed to self.getParameters() together
                with the list of style names below.
            classSet: id set for class names; a new IdSet(1) when omitted.
                "neg" must resolve to id 1.
            featureSet: id set for feature names; a new IdSet() when omitted.
            gazetteerFileName: optional path passed to Gazetteer.loadGztr().
            skiplist: optional path to a text file; every line is stripped
                and added to self.skiplist.

        Raises:
            AssertionError: if classSet.getId("neg") != 1.
        """
        if classSet is None:
            classSet = IdSet(1)
        assert (classSet.getId("neg") == 1)
        if featureSet is None:
            featureSet = IdSet()

        ExampleBuilder.__init__(self, classSet, featureSet)
        if gazetteerFileName is not None:
            self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
            print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer = None
        self.styles = self.getParameters(style, [
            "rel_features", "wordnet", "bb_features", "giuliano",
            "epi_merge_negated", "limit_merged_types", "genia_task1",
            "build_for_nameless", "pos_only", "all_tokens", "names",
            "pos_pairs", "linear_ngrams", "phospho"
        ])

        self.skiplist = set()
        if skiplist is not None:
            # The context manager closes the file even if reading fails.
            with open(skiplist, "rt") as skipFile:
                for line in skipFile:
                    self.skiplist.add(line.strip())

        # Optional feature builders are only created for enabled styles, so
        # the corresponding attributes do not exist otherwise.
        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(featureSet)
        if self.styles["wordnet"]:
            self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
        if self.styles["bb_features"]:
            self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens(
                PhraseTriggerExampleBuilder.getBacteriaNames())
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
Exemplo n.º 4
0
 def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
     """Set up the id sets, the optional gazetteer and the style setting.

     Parameters:
         style: stored unchanged as self.styles.
         classSet: id set for class names; a new IdSet(1) when omitted.
             "neg" must resolve to id 1.
         featureSet: id set for feature names; a new IdSet() when omitted.
         gazetteerFileName: optional path passed to Gazetteer.loadGztr().

     Raises:
         AssertionError: if classSet.getId("neg") != 1.
     """
     # Identity checks against None: "== None" would be routed through any
     # user-defined __eq__ on the argument.
     if classSet is None:
         classSet = IdSet(1)
     assert( classSet.getId("neg") == 1 )
     if featureSet is None:
         featureSet = IdSet()

     ExampleBuilder.__init__(self, classSet, featureSet)
     if gazetteerFileName is not None:
         self.gazetteer=Gazetteer.loadGztr(gazetteerFileName)
         print >> sys.stderr, "Loaded gazetteer from",gazetteerFileName
     else:
         print >> sys.stderr, "No gazetteer loaded"
         self.gazetteer=None
     self.styles = style
    def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
        """Set up id sets, gazetteer, styles, skiplist and optional feature builders.

        Parameters:
            style: style definition handed to self.getParameters() together
                with the list of style names below.
            classSet: id set for class names; a new IdSet(1) when omitted.
                "neg" must resolve to id 1.
            featureSet: id set for feature names; a new IdSet() when omitted.
            gazetteerFileName: optional path passed to Gazetteer.loadGztr().
            skiplist: optional path to a text file; every line is stripped
                and added to self.skiplist.

        Raises:
            AssertionError: if classSet.getId("neg") != 1.
        """
        if classSet is None:
            classSet = IdSet(1)
        assert( classSet.getId("neg") == 1 )
        if featureSet is None:
            featureSet = IdSet()

        ExampleBuilder.__init__(self, classSet, featureSet)
        if gazetteerFileName is not None:
            self.gazetteer=Gazetteer.loadGztr(gazetteerFileName)
            print >> sys.stderr, "Loaded gazetteer from",gazetteerFileName
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer=None
        self.styles = self.getParameters(style, ["rel_features", "wordnet", "bb_features", "giuliano",
                                          "epi_merge_negated", "limit_merged_types", "genia_task1",
                                          "build_for_nameless", "pos_only", "all_tokens",
                                          "names", "pos_pairs", "linear_ngrams", "phospho"])

        self.skiplist = set()
        if skiplist is not None:
            # The context manager closes the file even if reading fails.
            with open(skiplist, "rt") as skipFile:
                for line in skipFile:
                    self.skiplist.add(line.strip())

        # Optional feature builders are only created for enabled styles, so
        # the corresponding attributes do not exist otherwise.
        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(featureSet)
        if self.styles["wordnet"]:
            self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
        if self.styles["bb_features"]:
            self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens(PhraseTriggerExampleBuilder.getBacteriaNames())
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
Exemplo n.º 6
0
    def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
        """Set up id sets, gazetteer, skiplist, a fixed style list and feature builders.

        Parameters:
            style: accepted but not used; self.styles is always set to the
                fixed list below.
                NOTE(review): confirm that ignoring this argument is intended.
            classSet: id set for class names; a new IdSet(1) when omitted.
                "neg" must resolve to id 1.
            featureSet: id set for feature names; a new IdSet() when omitted.
            gazetteerFileName: optional path passed to Gazetteer.loadGztr().
            skiplist: optional path to a text file; every line is stripped
                and added to self.skiplist.

        Raises:
            AssertionError: if classSet.getId("neg") != 1.
        """
        if classSet is None:
            classSet = IdSet(1)
        assert classSet.getId("neg") == 1
        if featureSet is None:
            featureSet = IdSet()

        ExampleBuilder.__init__(self, classSet, featureSet)
        if gazetteerFileName is not None:
            self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
            print >>sys.stderr, "Loaded gazetteer from", gazetteerFileName
        else:
            print >>sys.stderr, "No gazetteer loaded"
            self.gazetteer = None

        self.skiplist = set()
        if skiplist is not None:
            # The context manager closes the file even if reading fails.
            with open(skiplist, "rt") as skipFile:
                for line in skipFile:
                    self.skiplist.add(line.strip())

        # The style list is hard-coded. An earlier "self.styles = style"
        # store was overwritten here before being read and has been dropped.
        self.styles = [
            "trigger_features",
            "typed",
            "directed",
            "no_linear",
            "entities",
            "genia_limits",
            "noMasking",
            "maxFeatures",
        ]
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        # With the fixed list above, only the "noMasking" and "maxFeatures"
        # checks below can match.
        if "graph_kernel" in self.styles:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder

            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True

        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
Exemplo n.º 7
0
    def __init__(self,
                 style=None,
                 classSet=None,
                 featureSet=None,
                 gazetteerFileName=None,
                 skiplist=None):
        """Set up id sets, gazetteer, skiplist, a fixed style list and feature builders.

        Parameters:
            style: accepted but not used; self.styles is always set to the
                fixed list below.
                NOTE(review): confirm that ignoring this argument is intended.
            classSet: id set for class names; a new IdSet(1) when omitted.
                "neg" must resolve to id 1.
            featureSet: id set for feature names; a new IdSet() when omitted.
            gazetteerFileName: optional path passed to Gazetteer.loadGztr().
            skiplist: optional path to a text file; every line is stripped
                and added to self.skiplist.

        Raises:
            AssertionError: if classSet.getId("neg") != 1.
        """
        if classSet is None:
            classSet = IdSet(1)
        assert (classSet.getId("neg") == 1)
        if featureSet is None:
            featureSet = IdSet()

        ExampleBuilder.__init__(self, classSet, featureSet)
        if gazetteerFileName is not None:
            self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
            print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer = None

        self.skiplist = set()
        if skiplist is not None:
            # The context manager closes the file even if reading fails.
            with open(skiplist, "rt") as skipFile:
                for line in skipFile:
                    self.skiplist.add(line.strip())

        # The style list is hard-coded. An earlier "self.styles = style"
        # store was overwritten here before being read and has been dropped.
        self.styles = [
            "trigger_features", "typed", "directed", "no_linear", "entities",
            "genia_limits", "noMasking", "maxFeatures"
        ]
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        # With the fixed list above, only the "noMasking" and "maxFeatures"
        # checks below can match.
        if "graph_kernel" in self.styles:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(
                self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True

        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
Exemplo n.º 8
0
 def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
     """Set up the id sets, the optional gazetteer, styles and excluded POS tags.

     Parameters:
         style: stored unchanged as self.styles.
         classSet: id set for class names; a new IdSet(1) when omitted.
             "neg" must resolve to id 1.
         featureSet: id set for feature names; a new IdSet() when omitted.
         gazetteerFileName: optional path passed to Gazetteer.loadGztr().

     Raises:
         AssertionError: if classSet.getId("neg") != 1.
     """
     # Identity checks against None: "== None" would be routed through any
     # user-defined __eq__ on the argument.
     if classSet is None:
         classSet = IdSet(1)
     assert( classSet.getId("neg") == 1 )
     if featureSet is None:
         featureSet = IdSet()

     ExampleBuilder.__init__(self, classSet, featureSet)
     if gazetteerFileName is not None:
         self.gazetteer=Gazetteer.loadGztr(gazetteerFileName)
         print >> sys.stderr, "Loaded gazetteer from",gazetteerFileName
     else:
         print >> sys.stderr, "No gazetteer loaded"
         self.gazetteer=None
     self.styles = style

     # Tag strings stored for later use; presumably part-of-speech tags that
     # the builder excludes -- confirm against the code that reads this list.
     self.excludedPOS = ["","(",")",",",".","CC","EX","FW","LS","MD","PDT","POS","PRP","PRP$","RBR","RBS","RP","WDT","WP","WP$","``"]
Exemplo n.º 9
0
 def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
     """Set up id sets, speculation word lists, the gazetteer and styles.

     Parameters:
         style: style definition handed to self.getParameters().
         classSet: id set for class names; a new IdSet(1) when omitted.
             "neg" must resolve to id 1.
         featureSet: id set for feature names; a new IdSet() when omitted.
         gazetteerFileName: optional path passed to Gazetteer.loadGztr().

     Raises:
         AssertionError: if classSet.getId("neg") != 1.
     """
     # Identity checks against None: "== None" would be routed through any
     # user-defined __eq__ on the argument.
     if classSet is None:
         classSet = IdSet(1)
     assert( classSet.getId("neg") == 1 )
     if featureSet is None:
         featureSet = IdSet()

     # speculationWords is a module-level name that is only read here, so no
     # "global" declaration is needed.
     self.specWords, self.specWordStems = readWords(speculationWords)

     ExampleBuilder.__init__(self, classSet, featureSet)
     if gazetteerFileName is not None:
         self.gazetteer=Gazetteer.loadGztr(gazetteerFileName)
         print >> sys.stderr, "Loaded gazetteer from",gazetteerFileName
     else:
         self.gazetteer=None

     # Defaults for each style, followed by the allowed values for
     # "classification".
     styleDefaults = {"classification":"multiclass", "speculation_words":True}
     styleChoices = {"classification":("multiclass", "speculation", "negation")}
     self.styles = self.getParameters(style, styleDefaults, styleChoices)
Exemplo n.º 10
0
    def __init__(self,
                 style=None,
                 length=None,
                 types=None,
                 featureSet=None,
                 classSet=None,
                 gazetteer=None,
                 pathGazetteer=None,
                 negFrac=None):
        """Initialise id sets, gazetteers, feature builders and bookkeeping tables.

        Parameters:
            style: container of style names; defaults to
                ["typed", "directed", "headsOnly"].
            length: stored as self.pathLengths; must be None.
            types: stored as self.types; defaults to an empty list.
            featureSet: id set for feature names; a new IdSet() when omitted.
            classSet: id set for class names; a new IdSet(1) when omitted.
                "neg" must resolve to id 1.
            gazetteer: optional path passed to Gazetteer.loadGztr().
            pathGazetteer: optional path passed to PathGazetteer.load().
            negFrac: stored as self.negFrac and reported on stderr.

        Raises:
            AssertionError: if classSet.getId("neg") != 1 or length is not
                None.
        """
        # The defaults used to be list literals in the signature. Those
        # objects are created once and were stored on every instance, so a
        # mutation through one instance leaked into all others. Build fresh
        # lists here.
        if style is None:
            style = ["typed", "directed", "headsOnly"]
        if types is None:
            types = []

        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        assert (classSet.getId("neg") == 1)

        if gazetteer is not None:
            print >> sys.stderr, "Loading gazetteer from", gazetteer
            self.gazetteer = Gazetteer.loadGztr(gazetteer)
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer = None

        self.pathGazetteer = None
        self.pathGazetteerDependencies = None
        self.pathGazetteerPairs = None
        if pathGazetteer is not None:
            print >> sys.stderr, "Loading path gazetteer from", pathGazetteer
            self.pathGazetteer = PathGazetteer.load(pathGazetteer)
            self.pathGazetteerDependencies = PathGazetteer.getDependencies(
                self.pathGazetteer)
            self.pathGazetteerPairs = PathGazetteer.getPairs(
                self.pathGazetteer)
        else:
            print >> sys.stderr, "No path gazetteer loaded"

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        self.styles = style
        self.negFrac = negFrac
        print >> sys.stderr, "Downsampling negatives to", negFrac
        self.negRand = random.Random()

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        # noAnnType is switched on unconditionally; the "noAnnType" style
        # name is not consulted.
        self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True

        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        self.pathLengths = length
        assert self.pathLengths is None
        self.types = types

        # Event bookkeeping tables, all starting empty.
        self.eventsByOrigId = {}
        self.headTokensByOrigId = {}
        self.interSentenceEvents = set()

        # Per-type counters/collections of built and skipped examples.
        self.examplesByEventOrigId = {}
        self.skippedByType = {}
        self.skippedByTypeAndReason = {}
        self.builtByType = {}

        self.gazMatchCache = {}