def __init__(self, style=None, length=None, types=[], featureSet=None, classSet=None):
    """
    Initialize the id sets, style parameters and feature builders.

    style -- style definition, parsed by self.getParameters()
    length -- stored as self.pathLengths; must be None (asserted)
    types -- stored as self.types (the default list object is shared
             between calls, so avoid modifying it in place)
    featureSet -- feature id set; IdSet() is created when None
    classSet -- class id set; IdSet(1) is created when None. Its id for
                "neg" must be 1 (asserted).
    """
    # NOTE: style is not overridden here; the caller's value is used.
    if featureSet is None:
        featureSet = IdSet()
    if classSet is None:
        classSet = IdSet(1)
    assert classSet.getId("neg") == 1

    ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)

    # Style options whose default value is None
    defaultNone = ["binary", "trigger_features","typed","directed","no_linear","entities","genia_limits",
        "noAnnType", "noMasking", "maxFeatures", "no_merge", "disable_entity_features",
        "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features"]
    defaultParameters = dict.fromkeys(defaultNone)
    defaultParameters["keep_intersentence"] = False
    defaultParameters["keep_intersentence_gold"] = True
    defaultParameters["no_arg_count_upper_limit"] = False
    self.styles = self._setDefaultParameters(defaultParameters)
    self.styles = self.getParameters(style)

    # Configure the edge feature builder from the parsed styles
    self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
    self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"]
    self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles["noMasking"]
    self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"]

    self.pathLengths = length
    assert self.pathLengths is None
    self.types = types

    self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    self.triggerFeatureBuilder.useNonNameEntities = True
示例#2
0
    def __init__(self,
                 style=["typed", "directed"],
                 length=None,
                 types=[],
                 featureSet=None,
                 classSet=None):
        """
        Initialize the id sets, styles and the feature builders they select.

        style -- list of style names, or a comma-separated string of them.
                 A string without a comma is stored as it is.
        length -- stored as self.pathLengths; must be None (asserted)
        types -- stored as self.types (the default list object is shared
                 between calls, so avoid modifying it in place)
        featureSet -- feature id set; IdSet() is created when None
        classSet -- class id set; IdSet(1) is created when None. Its id for
                    "neg" must be 1 (asserted).
        """
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        assert (classSet.getId("neg") == 1)

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        # Only a string can be comma-separated. The default style is a list,
        # which has no find()/split(), so check before splitting.
        if hasattr(style, "split") and "," in style:
            style = style.split(",")
        self.styles = style

        # Styles that carry a value after the name
        self.negFrac = None
        self.posPairGaz = POSPairGazetteer()
        for s in style:
            if "negFrac" in s:
                # the fraction is the part after the last underscore
                self.negFrac = float(s.split("_")[-1])
                print >> sys.stderr, "Downsampling negatives to", self.negFrac
                self.negRand = random.Random(15)
            elif "posPairGaz" in s:
                # everything after the first underscore is the file to load
                self.posPairGaz = POSPairGazetteer(
                    loadFrom=s.split("_", 1)[-1])

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        if "graph_kernel" in self.styles:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(
                self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        if "ontology" in self.styles:
            self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(
                self.featureSet)
        if "nodalida" in self.styles:
            self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(
                self.featureSet)
        #IF LOCAL
        if "bioinfer_limits" in self.styles:
            self.bioinferOntologies = OntologyUtils.getBioInferTempOntology()
        #ENDIF
        self.pathLengths = length
        assert (self.pathLengths is None)
        self.types = types
        if "random" in self.styles:
            from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
            self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
示例#3
0
    def __init__(self,
                 style=None,
                 classSet=None,
                 featureSet=None,
                 gazetteerFileName=None):
        """
        Initialize the id sets, speculation word lists, gazetteer and styles.

        style -- style definition, parsed by self.getParameters()
        classSet -- class id set; IdSet(1) is created when None. Its id for
                    "neg" must be 1 (asserted).
        featureSet -- feature id set; IdSet() is created when None
        gazetteerFileName -- file to load the gazetteer from; when None
                             self.gazetteer is set to None
        """
        # module-level value, only read in this method
        global speculationWords

        if classSet is None:
            classSet = IdSet(1)
        assert (classSet.getId("neg") == 1)
        if featureSet is None:
            featureSet = IdSet()

        self.specWords, self.specWordStems = readWords(speculationWords)

        ExampleBuilder.__init__(self, classSet, featureSet)
        if gazetteerFileName is not None:
            self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
            print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
        else:
            self.gazetteer = None
        # second argument: default values, third: allowed values
        # NOTE(review): meaning of the two dicts inferred from their
        # contents -- confirm against getParameters
        self.styles = self.getParameters(style, {
            "classification": "multiclass",
            "speculation_words": True
        }, {"classification": ("multiclass", "speculation", "negation")})
示例#4
0
    def __init__(self, examples, predictions=None, classSet=None):
        """
        Prepare per-class evaluation data and optionally run the evaluation.

        examples -- the examples, or the name of a file to read them from
        predictions -- the predictions, or the name of a file to load them
                       from; when None nothing is calculated here
        classSet -- class id set, or the name of a file to load it from;
                    may be None
        """
        if isinstance(classSet, types.StringType):  # class names are in file
            classSet = IdSet(filename=classSet)
        if isinstance(predictions, types.StringType):  # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if isinstance(examples, types.StringType):  # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        self.classSet = classSet
        # define class ids in alphabetical order of the class names
        if classSet is not None:
            classNames = sorted(classSet.Ids.keys())
        else:
            classNames = []
        # make an ordered list of class ids
        self.classes = [classSet.getId(className) for className in classNames]
        # create data structures for per-class evaluation
        self.dataByClass = {}
        for cls in self.classes:
            self.dataByClass[cls] = EvaluationData()
        # hack for unnamed classes
        if not self.dataByClass:
            self.dataByClass[1] = EvaluationData()
            self.dataByClass[2] = EvaluationData()

        self.untypedCurrentMajorId = None
        self.untypedPredictionQueue = []
        self.untypedUndirected = EvaluationData()
        if predictions is not None:
            self._calculate(examples, predictions)
    def __init__(self, style=None, length=None, types=[], featureSet=None, classSet=None):
        """
        Initialize the id sets, style parameters and feature builders.

        style -- style definition, parsed by self.getParameters()
        length -- stored as self.pathLengths; must be None (asserted)
        types -- stored as self.types (the default list object is shared
                 between calls, so avoid modifying it in place)
        featureSet -- feature id set; IdSet() is created when None
        classSet -- class id set; IdSet(1) is created when None. Its id for
                    "neg" must be 1 (asserted).
        """
        # NOTE: style is not overridden here; the caller's value is used.
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        assert classSet.getId("neg") == 1

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)

        # Style options whose default value is None
        defaultNone = ["binary", "trigger_features","typed","directed","no_linear","entities","genia_limits",
            "noAnnType", "noMasking", "maxFeatures", "no_merge", "disable_entity_features",
            "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features"]
        defaultParameters = dict.fromkeys(defaultNone)
        defaultParameters["keep_intersentence"] = False
        defaultParameters["keep_intersentence_gold"] = True
        self.styles = self._setDefaultParameters(defaultParameters)
        self.styles = self.getParameters(style)

        # Configure the edge feature builder from the parsed styles
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"]
        self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles["noMasking"]
        self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"]

        self.pathLengths = length
        assert self.pathLengths is None
        self.types = types

        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder.useNonNameEntities = True
    def __init__(self, examples, predictions=None, classSet=None):
        """
        Prepare per-class evaluation data and optionally run the evaluation.

        examples -- the examples, or the name of a file to read them from
        predictions -- the predictions, or the name of a file to load them
                       from; when None nothing is calculated here
        classSet -- class id set, or the name of a file to load it from;
                    may be None
        """
        if isinstance(classSet, types.StringType): # class names are in file
            classSet = IdSet(filename=classSet)
        if isinstance(predictions, types.StringType): # predictions are in file
            predictions = ExampleUtils.loadPredictions(predictions)
        if isinstance(examples, types.StringType): # examples are in file
            examples = ExampleUtils.readExamples(examples, False)

        self.classSet = classSet
        # define class ids in alphabetical order of the class names
        if classSet is not None:
            classNames = sorted(classSet.Ids.keys())
        else:
            classNames = []
        # make an ordered list of class ids
        self.classes = [classSet.getId(className) for className in classNames]
        # create data structures for per-class evaluation
        self.dataByClass = {}
        for cls in self.classes:
            self.dataByClass[cls] = EvaluationData()
        # hack for unnamed classes
        if not self.dataByClass:
            self.dataByClass[1] = EvaluationData()
            self.dataByClass[2] = EvaluationData()

        self.untypedCurrentMajorId = None
        self.untypedPredictionQueue = []
        self.untypedUndirected = EvaluationData()
        if predictions is not None:
            self._calculate(examples, predictions)
示例#7
0
    def __init__(self,
                 style=["typed", "directed", "headsOnly"],
                 length=None,
                 types=[],
                 featureSet=None,
                 classSet=None):
        """
        Initialize the id sets, styles and the edge feature builder.

        style -- collection of style names, stored as self.styles
        length -- stored as self.pathLengths; must be None (asserted)
        types -- stored as self.types (the default list object is shared
                 between calls, so avoid modifying it in place)
        featureSet -- feature id set; IdSet() is created when None
        classSet -- class id set; IdSet(1) is created when None. Its id for
                    "neg" must be 1 (asserted).
        """
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        assert (classSet.getId("neg") == 1)

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        self.styles = style

        # Switch edge feature builder options on or off by style
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True
        self.pathLengths = length
        assert (self.pathLengths is None)
        self.types = types
 def __init__(self, style=["typed","directed","headsOnly"], length=None, types=[], featureSet=None, classSet=None):
     """
     Initialize the id sets, styles and the feature builders.

     style -- collection of style names, stored as self.styles
     length -- stored as self.pathLengths; must be None (asserted)
     types -- stored as self.types (the default list object is shared
              between calls, so avoid modifying it in place)
     featureSet -- feature id set; IdSet() is created when None
     classSet -- class id set; IdSet(1) is created when None. Its id for
                 "neg" must be 1 (asserted).
     """
     if featureSet is None:
         featureSet = IdSet()
     if classSet is None:
         classSet = IdSet(1)
     assert classSet.getId("neg") == 1

     ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
     self.styles = style

     # Switch edge feature builder options on or off by style
     self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
     if "noAnnType" in self.styles:
         self.multiEdgeFeatureBuilder.noAnnType = True
     if "noMasking" in self.styles:
         self.multiEdgeFeatureBuilder.maskNamedEntities = False
     if "maxFeatures" in self.styles:
         self.multiEdgeFeatureBuilder.maximum = True
     self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
     self.pathLengths = length
     assert self.pathLengths is None
     self.types = types
     if "random" in self.styles:
         # imported only when the style asks for it
         from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
         self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
示例#9
0
    def __init__(self, style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures", length=None, types=[], featureSet=None, classSet=None):
        """
        Initialize the id sets, style parameters and feature builders.

        style -- ignored: it is replaced below by a fixed style string
        length -- stored as self.pathLengths; must be None (asserted)
        types -- stored as self.types (the default list object is shared
                 between calls, so avoid modifying it in place)
        featureSet -- feature id set; IdSet() is created when None
        classSet -- class id set; IdSet(1) is created when None. Its id for
                    "neg" must be 1 (asserted).
        """
        # reset style regardless of input
        style="trigger_features:typed:directed:no_linear:entities:genia_limits:noMasking:maxFeatures"
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        assert classSet.getId("neg") == 1

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)

        self.styles = self.getParameters(style, ["trigger_features","typed","directed","no_linear","entities","genia_limits",
            "noAnnType", "noMasking", "maxFeatures", "no_merge", "disable_entity_features",
            "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features"])
        # Configure the edge feature builder from the parsed styles
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        self.multiEdgeFeatureBuilder.noAnnType = self.styles["noAnnType"]
        self.multiEdgeFeatureBuilder.maskNamedEntities = not self.styles["noMasking"]
        self.multiEdgeFeatureBuilder.maximum = self.styles["maxFeatures"]
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        self.pathLengths = length
        assert self.pathLengths is None
        self.types = types

        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        self.triggerFeatureBuilder.useNonNameEntities = True
示例#10
0
    def __init__(self, style=["typed", "directed", "headsOnly"], length=None, types=[], featureSet=None, classSet=None):
        """
        Initialize the id sets, styles and the edge feature builder.

        style -- collection of style names, stored as self.styles
        length -- stored as self.pathLengths; must be None (asserted)
        types -- stored as self.types (the default list object is shared
                 between calls, so avoid modifying it in place)
        featureSet -- feature id set; IdSet() is created when None
        classSet -- class id set; IdSet(1) is created when None. Its id for
                    "neg" must be 1 (asserted).
        """
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        assert classSet.getId("neg") == 1

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        self.styles = style

        # Switch edge feature builder options on or off by style
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True
        self.pathLengths = length
        assert self.pathLengths is None
        self.types = types
示例#11
0
    def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
        """
        Initialize the id sets, gazetteer, styles, skiplist and the
        feature builders selected by the styles.

        style -- style definition, parsed by self.getParameters()
        classSet -- class id set; IdSet(1) is created when None. Its id for
                    "neg" must be 1 (asserted).
        featureSet -- feature id set; IdSet() is created when None
        gazetteerFileName -- file to load the gazetteer from; when None
                             self.gazetteer is set to None
        skiplist -- name of a text file whose stripped lines are collected
                    into the set self.skiplist; when None the set is empty
        """
        if classSet is None:
            classSet = IdSet(1)
        if featureSet is None:
            featureSet = IdSet()

        ExampleBuilder.__init__(self, classSet, featureSet)
        assert classSet.getId("neg") == 1
        if gazetteerFileName is not None:
            self.gazetteer=Gazetteer.loadGztr(gazetteerFileName)
            print >> sys.stderr, "Loaded gazetteer from",gazetteerFileName
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer=None
        self._setDefaultParameters(["rel_features", "wordnet", "bb_features", "giuliano",
                                  "epi_merge_negated", "limit_merged_types", "genia_task1",
                                  "names", "build_for_nameless", "skip_for_nameless",
                                  "pos_only", "all_tokens", "pos_pairs", "linear_ngrams",
                                  "phospho", "drugbank_features", "ddi13_features", "metamap",
                                  "only_types", "ontobiotope_features", "bb_spans", "w2v",
                                  "no_context"])
        self.styles = self.getParameters(style)

        self.skiplist = set()
        if skiplist is not None:
            # the with-block closes the file even if reading fails
            with open(skiplist, "rt") as f:
                for line in f:
                    self.skiplist.add(line.strip())

        # Optional feature builders, created only for the styles in use
        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(featureSet)
        if self.styles["wordnet"]:
            self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
        if self.styles["bb_features"]:
            self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens()
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
        if self.styles["drugbank_features"]:
            self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
        if self.styles["ontobiotope_features"]:
            self.ontobiotopeFeatureBuilder = OntoBiotopeFeatureBuilder(self.featureSet)
        if self.styles["w2v"]:
            self.wordVectorFeatureBuilder = WordVectorFeatureBuilder(featureSet)
示例#12
0
 def __init__(self, style=["typed","directed","headsOnly"], length=None, types=[], featureSet=None, classSet=None, gazetteer=None, pathGazetteer=None, negFrac=None):
     """
     Initialize the id sets, gazetteers, styles, feature builders and the
     bookkeeping containers.

     style -- collection of style names, stored as self.styles
     length -- stored as self.pathLengths; must be None (asserted)
     types -- stored as self.types (the default list object is shared
              between calls, so avoid modifying it in place)
     featureSet -- feature id set; IdSet() is created when None
     classSet -- class id set; IdSet(1) is created when None. Its id for
                 "neg" must be 1 (asserted).
     gazetteer -- file to load the gazetteer from, or None
     pathGazetteer -- file to load the path gazetteer from, or None
     negFrac -- stored as self.negFrac
     """
     if featureSet is None:
         featureSet = IdSet()
     if classSet is None:
         classSet = IdSet(1)
     assert classSet.getId("neg") == 1

     if gazetteer is not None:
         print >> sys.stderr, "Loading gazetteer from", gazetteer
         self.gazetteer=Gazetteer.loadGztr(gazetteer)
     else:
         print >> sys.stderr, "No gazetteer loaded"
         self.gazetteer=None

     # The path gazetteer and the two structures derived from it stay
     # None unless a file is given
     self.pathGazetteer=None
     self.pathGazetteerDependencies = None
     self.pathGazetteerPairs = None
     if pathGazetteer is not None:
         print >> sys.stderr, "Loading path gazetteer from", pathGazetteer
         self.pathGazetteer=PathGazetteer.load(pathGazetteer)
         self.pathGazetteerDependencies = PathGazetteer.getDependencies(self.pathGazetteer)
         self.pathGazetteerPairs = PathGazetteer.getPairs(self.pathGazetteer)
     else:
         print >> sys.stderr, "No path gazetteer loaded"

     ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
     self.styles = style
     self.negFrac = negFrac
     print >> sys.stderr, "Downsampling negatives to", negFrac
     self.negRand = random.Random()

     self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
     # noAnnType is always switched on here, whatever the styles say
     self.multiEdgeFeatureBuilder.noAnnType = True
     if "noMasking" in self.styles:
         self.multiEdgeFeatureBuilder.maskNamedEntities = False
     if "maxFeatures" in self.styles:
         self.multiEdgeFeatureBuilder.maximum = True

     self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
     self.pathLengths = length
     assert self.pathLengths is None
     self.types = types

     self.eventsByOrigId = {}
     self.headTokensByOrigId = {}
     self.interSentenceEvents = set()

     self.examplesByEventOrigId = {}
     self.skippedByType = {}
     self.skippedByTypeAndReason = {}
     self.builtByType = {}

     self.gazMatchCache = {}
示例#13
0
def readARFF(filename):
    """
    Read a two-class ARFF file into a list of examples.

    Every data row becomes [exampleId, classId, features, {}], where
    features maps the 1-based column number to the column's float value.
    The column of the attribute named "class" is not stored as a feature;
    its value is looked up in the class set, in which the two class names
    of the attribute declaration are defined with the ids 1 and -1.

    filename -- path of the ARFF file
    Returns the list of examples.
    """
    featureSet = IdSet(1)
    classSet = IdSet(0)
    # Read all lines up front; the with-block closes the file right away
    with open(filename, "rt") as f:
        lines = f.readlines()
    inData = False
    counter = ProgressCounter(len(lines),"ARFFLine")
    examples = []
    for line in lines:
        counter.update(string="Processing line " + str(counter.current + 1) + ": ")
        line = line.strip()
        # skip empty lines and comment lines
        if len(line) == 0 or line[0] == "%":
            continue
        elif line[0] == "@":
            category = line.split()[0].lower()
            if category == "@attribute":
                # a declaration must have exactly three whitespace-separated parts
                keyword, name, attrType = line.split()
                assert(not inData)
                if name.lower() == "class":
                    name = name.lower()
                    # strip the surrounding braces of the class name list
                    classNames = attrType[1:-1].split(",")
                    assert(len(classNames)==2)
                    classSet.defineId(classNames[0].strip(),1)
                    classSet.defineId(classNames[1].strip(),-1)
                featureSet.getId(name)
            elif category == "@relation":
                assert(not inData)
            elif category == "@data":
                inData = True
        else:
            assert(inData)
            count = 1
            features = {}
            for column in line.split(","):
                if featureSet.getName(count) != "class":
                    features[count] = float(column)
                else:
                    classId = classSet.getId(column, False)
                    assert(classId != None)
                count += 1
            exampleCount = str(len(examples))
            exampleId = "BreastCancer.d" + exampleCount + ".s0.x0"
            examples.append([exampleId,classId,features,{}])

    return examples
    def __init__(self,
                 style=None,
                 classSet=None,
                 featureSet=None,
                 gazetteerFileName=None,
                 skiplist=None):
        """
        Initialize the id sets, gazetteer, styles, skiplist and the
        feature builders selected by the styles.

        style -- style definition, parsed by self.getParameters()
        classSet -- class id set; IdSet(1) is created when None. Its id for
                    "neg" must be 1 (asserted).
        featureSet -- feature id set; IdSet() is created when None
        gazetteerFileName -- file to load the gazetteer from; when None
                             self.gazetteer is set to None
        skiplist -- name of a text file whose stripped lines are collected
                    into the set self.skiplist; when None the set is empty
        """
        if classSet is None:
            classSet = IdSet(1)
        if featureSet is None:
            featureSet = IdSet()

        ExampleBuilder.__init__(self, classSet, featureSet)
        assert (classSet.getId("neg") == 1)
        if gazetteerFileName is not None:
            self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
            print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer = None
        self._setDefaultParameters([
            "rel_features", "wordnet", "bb_features", "giuliano",
            "epi_merge_negated", "limit_merged_types", "genia_task1",
            "build_for_nameless", "pos_only", "all_tokens", "names",
            "pos_pairs", "linear_ngrams", "phospho"
        ])
        self.styles = self.getParameters(style)

        self.skiplist = set()
        if skiplist is not None:
            # the with-block closes the file even if reading fails
            with open(skiplist, "rt") as f:
                for line in f:
                    self.skiplist.add(line.strip())

        # Optional feature builders, created only for the styles in use
        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(featureSet)
        if self.styles["wordnet"]:
            self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
        if self.styles["bb_features"]:
            self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens(
            )
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
示例#15
0
def getClassSet(rows, classSet=None):
    """
    Return a class set that has an id for every class name in rows.

    rows -- iterable of mappings with the keys "class" and "prediction"
    classSet -- existing class set to extend. When None, a new IdSet is
                created; exactly one of the names "1" and "neg" must then
                occur in rows (asserted), and it is defined with the id 1.

    All other names are registered with classSet.getId() in sorted order.
    """
    classNames = set()
    for row in rows:
        classNames.add(row["class"])
        classNames.add(row["prediction"])

    # In the case of multiclass, give integer id:s for the classes
    if classSet is None:
        # Imported only here, so a caller-supplied class set works
        # without the Core.IdSet module
        from Core.IdSet import IdSet
        classSet = IdSet()
        assert not ("1" in classNames and "neg" in classNames)
        assert "1" in classNames or "neg" in classNames
        if "1" in classNames:
            classSet.defineId("1",1)
        else:
            classSet.defineId("neg",1)
    for name in sorted(classNames):
        if name != "1" and name != "neg":
            classSet.getId(name)
    return classSet
 def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
     """
     Initialize the id sets, styles and the trigger feature builder.

     style -- stored unchanged as self.styles
     classSet -- class id set; IdSet(1) is created when None. Its id for
                 "neg" must be 1 (asserted).
     featureSet -- feature id set; IdSet() is created when None
     gazetteerFileName -- accepted but not used in this method
     """
     if classSet is None:
         classSet = IdSet(1)
     assert classSet.getId("neg") == 1
     if featureSet is None:
         featureSet = IdSet()
     ExampleBuilder.__init__(self, classSet, featureSet)

     self.styles = style
     self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
     self.triggerFeatureBuilder.useNonNameEntities = False
示例#17
0
def getClassSet(rows, classSet=None):
    """
    Build (or extend) the IdSet of class names used by the given rows.

    Both the "class" and the "prediction" value of every row are collected.
    Without a classSet argument a fresh IdSet is made whose id 1 belongs to
    "1" or to "neg", whichever of the two occurs in the rows (exactly one
    of them has to). Every other name is registered, in sorted order, by
    calling classSet.getId(). Returns the classSet.
    """
    from Core.IdSet import IdSet
    namesInRows = set()
    for row in rows:
        for key in ("class", "prediction"):
            namesInRows.add(row[key])

    # In the multiclass case the classes are given integer ids
    if classSet == None:
        classSet = IdSet()
        oneSeen = "1" in namesInRows
        negSeen = "neg" in namesInRows
        assert not (oneSeen and negSeen)
        assert oneSeen or negSeen
        baseName = "neg"
        if oneSeen:
            baseName = "1"
        classSet.defineId(baseName, 1)
    for className in sorted(namesInRows):
        if className == "1" or className == "neg":
            continue
        classSet.getId(className)
    return classSet
    def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
        """
        Set up id sets, style parameters and the trigger feature builder.

        A missing classSet is replaced by IdSet(1) and a missing featureSet
        by IdSet(); the class set must map "neg" to id 1. "co_limits" is
        registered as a default parameter before the style is parsed with
        getParameters(). gazetteerFileName is accepted but not used here.
        """
        if classSet == None:
            classSet = IdSet(1)
        assert classSet.getId("neg") == 1
        if featureSet == None:
            featureSet = IdSet()
        ExampleBuilder.__init__(self, classSet, featureSet)

        # Register the known parameter names, then parse the given style
        knownParameters = ["co_limits"]
        self._setDefaultParameters(knownParameters)
        self.styles = self.getParameters(style)

        triggerBuilder = TriggerFeatureBuilder(self.featureSet)
        triggerBuilder.useNonNameEntities = False
        self.triggerFeatureBuilder = triggerBuilder
        self.phraseTypeCounts = {}
 def __init__(self, style=["typed","directed"], length=None, types=[], featureSet=None, classSet=None):
     """
     Set up id sets, the style list and the feature builders it selects.

     style may be a comma-separated string or a sequence of style names; it
     is always stored as a list in self.styles. A style containing
     "negFrac" enables downsampling of negatives (fraction taken from the
     part after the last "_"), and one containing "posPairGaz" loads a
     POSPairGazetteer from the part after the first "_".
     length must be None (asserted); types is stored as given.
     A missing featureSet/classSet is replaced by IdSet()/IdSet(1), and
     the class set must map "neg" to id 1.
     """
     if featureSet == None:
         featureSet = IdSet()
     if classSet == None:
         classSet = IdSet(1)
     assert( classSet.getId("neg") == 1 )

     ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
     # Normalize the style to a list. The previous code called style.find()
     # unconditionally, which failed for list input (including the default)
     # and left comma-free strings as strings, so the loop below walked them
     # character by character. "basestring" is used because the "types"
     # parameter shadows the types module in this scope.
     if style is None:
         style = []
     elif isinstance(style, basestring):
         style = style.split(",")
     else:
         # Copy so that the shared default list is never aliased
         style = list(style)
     self.styles = style

     self.negFrac = None
     self.posPairGaz = POSPairGazetteer()
     for s in style:
         if "negFrac" in s:
             self.negFrac = float(s.split("_")[-1])
             print >> sys.stderr, "Downsampling negatives to", self.negFrac
             # Fixed seed keeps the downsampling reproducible
             self.negRand = random.Random(15)
         elif "posPairGaz" in s:
             self.posPairGaz = POSPairGazetteer(loadFrom=s.split("_", 1)[-1])

     self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
     self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
     if "graph_kernel" in self.styles:
         from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
         self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
     if "noAnnType" in self.styles:
         self.multiEdgeFeatureBuilder.noAnnType = True
     if "noMasking" in self.styles:
         self.multiEdgeFeatureBuilder.maskNamedEntities = False
     if "maxFeatures" in self.styles:
         self.multiEdgeFeatureBuilder.maximum = True
     self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
     if "ontology" in self.styles:
         self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
     if "nodalida" in self.styles:
         self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
     #IF LOCAL
     if "bioinfer_limits" in self.styles:
         self.bioinferOntologies = OntologyUtils.getBioInferTempOntology()
     #ENDIF
     self.pathLengths = length
     assert(self.pathLengths == None)
     self.types = types
     if "random" in self.styles:
         from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
         self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
示例#20
0
 def __init__(self, style=None, classSet=None, featureSet=None):
     """
     Set up the id sets, store the style and create the timers.

     A missing classSet is replaced by IdSet(1) and a missing featureSet by
     IdSet(); the class set must map "neg" to id 1. Five timers are
     created with Timer(False), one per attribute named below.
     """
     if classSet == None:
         classSet = IdSet(1)
     assert classSet.getId("neg") == 1
     if featureSet == None:
         featureSet = IdSet()

     ExampleBuilder.__init__(self, classSet, featureSet)
     self.styles = style
     # One separate Timer instance per measured stage
     timerNames = ("timerBuildExamples", "timerCrawl", "timerCrawlPrecalc",
                   "timerMatrix", "timerMatrixPrecalc")
     for timerName in timerNames:
         setattr(self, timerName, Timer(False))
示例#21
0
 def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
     """
     Set up the id sets, optionally load a gazetteer and store the style.

     A missing classSet is replaced by IdSet(1) and a missing featureSet by
     IdSet(); the class set must map "neg" to id 1. When
     gazetteerFileName is given the gazetteer is loaded with
     Gazetteer.loadGztr(), otherwise self.gazetteer is None. Either outcome
     is reported on stderr.
     """
     if classSet == None:
         classSet = IdSet(1)
     assert classSet.getId("neg") == 1
     if featureSet == None:
         featureSet = IdSet()

     ExampleBuilder.__init__(self, classSet, featureSet)
     if gazetteerFileName == None:
         print >> sys.stderr, "No gazetteer loaded"
         self.gazetteer = None
     else:
         self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
         print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
     self.styles = style
    def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
        """
        Set up the id sets and the trigger feature builder.

        A missing classSet is replaced by IdSet(1) and a missing featureSet
        by IdSet(); the class set must map "neg" to id 1. The style is
        stored as given. gazetteerFileName is accepted but not used here.
        """
        if classSet == None:
            classSet = IdSet(1)
        assert classSet.getId("neg") == 1
        if featureSet == None:
            featureSet = IdSet()
        ExampleBuilder.__init__(self, classSet, featureSet)

        self.styles = style
        # Configure the builder before publishing it on the instance
        triggerBuilder = TriggerFeatureBuilder(self.featureSet)
        triggerBuilder.useNonNameEntities = False
        self.triggerFeatureBuilder = triggerBuilder
    def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
        """
        Set up id sets, gazetteer, skiplist and the feature builders.

        A missing classSet is replaced by IdSet(1) and a missing featureSet
        by IdSet(); the class set must map "neg" to id 1.
        gazetteerFileName, when given, is loaded with Gazetteer.loadGztr().
        skiplist, when given, is the name of a text file; every line of it
        (stripped of surrounding whitespace) is added to self.skiplist.
        The style argument is not used: self.styles is always set to the
        fixed list below.
        """
        if classSet == None:
            classSet = IdSet(1)
        assert classSet.getId("neg") == 1
        if featureSet == None:
            featureSet = IdSet()

        ExampleBuilder.__init__(self, classSet, featureSet)
        if gazetteerFileName != None:
            self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
            print >>sys.stderr, "Loaded gazetteer from", gazetteerFileName
        else:
            print >>sys.stderr, "No gazetteer loaded"
            self.gazetteer = None

        self.skiplist = set()
        if skiplist != None:
            f = open(skiplist, "rt")
            # Close the file even if reading fails part-way through
            try:
                for line in f:
                    self.skiplist.add(line.strip())
            finally:
                f.close()

        # The style is fixed regardless of the "style" argument
        self.styles = [
            "trigger_features",
            "typed",
            "directed",
            "no_linear",
            "entities",
            "genia_limits",
            "noMasking",
            "maxFeatures",
        ]
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if "graph_kernel" in self.styles:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder

            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True

        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
    def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
        """
        Set up id sets, style parameters and the trigger feature builder.

        A missing classSet is replaced by IdSet(1) and a missing featureSet
        by IdSet(); the class set must map "neg" to id 1. "co_limits" is
        registered as a default parameter before the style is parsed with
        getParameters(). gazetteerFileName is accepted but not used here.
        """
        if classSet == None:
            classSet = IdSet(1)
        assert classSet.getId("neg") == 1
        if featureSet == None:
            featureSet = IdSet()
        ExampleBuilder.__init__(self, classSet, featureSet)

        # Register the known parameter names, then parse the given style
        knownParameters = ["co_limits"]
        self._setDefaultParameters(knownParameters)
        self.styles = self.getParameters(style)

        triggerBuilder = TriggerFeatureBuilder(self.featureSet)
        triggerBuilder.useNonNameEntities = False
        self.triggerFeatureBuilder = triggerBuilder
        self.phraseTypeCounts = {}
示例#25
0
    def __init__(self,
                 style=None,
                 classSet=None,
                 featureSet=None,
                 gazetteerFileName=None,
                 skiplist=None):
        """
        Set up id sets, gazetteer, skiplist and the feature builders.

        A missing classSet is replaced by IdSet(1) and a missing featureSet
        by IdSet(); the class set must map "neg" to id 1.
        gazetteerFileName, when given, is loaded with Gazetteer.loadGztr().
        skiplist, when given, is the name of a text file; every line of it
        (stripped of surrounding whitespace) is added to self.skiplist.
        The style argument is not used: self.styles is always set to the
        fixed list below.
        """
        if classSet == None:
            classSet = IdSet(1)
        assert (classSet.getId("neg") == 1)
        if featureSet == None:
            featureSet = IdSet()

        ExampleBuilder.__init__(self, classSet, featureSet)
        if gazetteerFileName != None:
            self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
            print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer = None

        self.skiplist = set()
        if skiplist != None:
            f = open(skiplist, "rt")
            # Close the file even if reading fails part-way through
            try:
                for line in f:
                    self.skiplist.add(line.strip())
            finally:
                f.close()

        # The style is fixed regardless of the "style" argument
        self.styles = [
            "trigger_features", "typed", "directed", "no_linear", "entities",
            "genia_limits", "noMasking", "maxFeatures"
        ]
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        if "graph_kernel" in self.styles:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(
                self.featureSet)
        if "noAnnType" in self.styles:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True

        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
示例#26
0
 def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
     """
     Set up the id sets, optionally load a gazetteer and store the style.

     A missing classSet is replaced by IdSet(1) and a missing featureSet by
     IdSet(); the class set must map "neg" to id 1. When
     gazetteerFileName is given the gazetteer is loaded with
     Gazetteer.loadGztr(), otherwise self.gazetteer is None. Either outcome
     is reported on stderr. self.excludedPOS is set to a fixed list of
     part-of-speech tags.
     """
     if classSet == None:
         classSet = IdSet(1)
     assert classSet.getId("neg") == 1
     if featureSet == None:
         featureSet = IdSet()

     ExampleBuilder.__init__(self, classSet, featureSet)
     if gazetteerFileName == None:
         print >> sys.stderr, "No gazetteer loaded"
         self.gazetteer = None
     else:
         self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
         print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
     self.styles = style

     self.excludedPOS = [
         "", "(", ")", ",", ".",
         "CC", "EX", "FW", "LS", "MD", "PDT", "POS", "PRP", "PRP$",
         "RBR", "RBS", "RP", "WDT", "WP", "WP$", "``",
     ]
示例#27
0
 def __init__(self, style=["typed","directed","headsOnly"], length=None, types=[], featureSet=None, classSet=None):
     """
     Set up the id sets, the multi-edge feature builder and count tables.

     A missing featureSet is replaced by IdSet() and a missing classSet by
     IdSet(1); the class set must map "neg" to id 1. The style is stored
     as given. length and types are accepted but not used here.
     """
     if featureSet == None:
         featureSet = IdSet()
     if classSet == None:
         classSet = IdSet(1)
     assert classSet.getId("neg") == 1

     ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
     self.styles = style

     self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)

     # Four independent, initially empty count tables
     for tableName in ("counts", "countsPerType", "untypedCounts", "tokenCounts"):
         setattr(self, tableName, {})
示例#28
0
 def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None):
     """
     Set up id sets, speculation words, gazetteer and style parameters.

     A missing classSet is replaced by IdSet(1) and a missing featureSet by
     IdSet(); the class set must map "neg" to id 1. The module-level
     speculationWords value is passed to readWords(), whose two results are
     stored as self.specWords and self.specWordStems. When
     gazetteerFileName is given the gazetteer is loaded with
     Gazetteer.loadGztr(), otherwise self.gazetteer is None. The style is
     parsed with getParameters() using the defaults and value limits below.
     """
     global speculationWords

     if classSet == None:
         classSet = IdSet(1)
     assert classSet.getId("neg") == 1
     if featureSet == None:
         featureSet = IdSet()

     wordData = readWords(speculationWords)
     self.specWords, self.specWordStems = wordData

     ExampleBuilder.__init__(self, classSet, featureSet)
     if gazetteerFileName == None:
         self.gazetteer = None
     else:
         self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
         print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName

     styleDefaults = {"classification":"multiclass", "speculation_words":True}
     styleLimits = {"classification":("multiclass", "speculation", "negation")}
     self.styles = self.getParameters(style, styleDefaults, styleLimits)
 def __init__(self, style=["typed","directed","headsOnly"], featureSet=None, classSet=None):
     """
     Set up the id sets and the multi-edge and trigger feature builders.

     A missing featureSet is replaced by IdSet() and a missing classSet by
     IdSet(1); the class set must map "neg" to id 1. The style is stored
     as given, but the three multi-edge options below are set
     unconditionally, whatever the style contains.
     """
     if featureSet == None:
         featureSet = IdSet()
     if classSet == None:
         classSet = IdSet(1)
     assert classSet.getId("neg") == 1

     ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
     self.styles = style

     # Options that used to depend on the "noAnnType", "noMasking" and
     # "maxFeatures" styles are always applied
     edgeBuilder = MultiEdgeFeatureBuilder(self.featureSet)
     self.multiEdgeFeatureBuilder = edgeBuilder
     edgeBuilder.noAnnType = True
     edgeBuilder.maskNamedEntities = False
     edgeBuilder.maximum = True

     triggerBuilder = TriggerFeatureBuilder(self.featureSet)
     self.triggerFeatureBuilder = triggerBuilder
     triggerBuilder.useNonNameEntities = False
示例#30
0
    def __init__(self,
                 style=["typed", "directed", "headsOnly"],
                 length=None,
                 types=[],
                 featureSet=None,
                 classSet=None):
        """
        Set up the id sets, the multi-edge feature builder and count tables.

        A missing featureSet is replaced by IdSet() and a missing classSet
        by IdSet(1); the class set must map "neg" to id 1. The style is
        stored as given. length and types are accepted but not used here.
        """
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        assert classSet.getId("neg") == 1

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        self.styles = style

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)

        # Separate, initially empty count tables
        self.counts, self.countsPerType = {}, {}
        self.untypedCounts, self.tokenCounts = {}, {}
示例#31
0
class ExampleBuilder:
    """
    ExampleBuilder is the abstract base class for specialized example builders.
    Example builders take some data and convert it to examples usable by e.g. SVMs.
    An example builder writes three files, an example-file (in extended Joachim's
    SVM format) and .class_names and .feature_names files, which contain the names
    for the class and feature id-numbers. An example builder can also be given
    pre-existing sets of class and feature ids (optionally in files) so that the
    generated examples are consistent with other, previously generated examples.
    """
    # Class-level default; nothing in this class assigns it, the structure
    # analyzer is passed explicitly to the processing methods instead.
    structureAnalyzer = None

    def __init__(self, classSet=None, featureSet=None):
        """
        Store the class and feature id sets and reset the builder state.

        classSet and featureSet may each be an id set object or a string;
        a string is treated as a file name and loaded with
        IdSet(filename=...). "sentenceLimit" is registered as a default
        parameter.
        """
        # isinstance with StringTypes also accepts unicode file names, the
        # same test that run() applies to its style argument
        if isinstance(classSet, types.StringTypes):
            self.classSet = IdSet(filename=classSet)
        else:
            self.classSet = classSet

        if isinstance(featureSet, types.StringTypes):
            self.featureSet = IdSet(filename=featureSet)
        else:
            self.featureSet = featureSet

        self.featureTag = ""
        self.exampleStats = ExampleStats()
        self.parse = None
        self.tokenization = None
        self.classIdFilename = None
        self.featureIdFilename = None

        self.styles = {}
        self._defaultParameters = None
        self._parameterValueLimits = None
        self._setDefaultParameters(["sentenceLimit"])
        self.debug = False

    def hasStyle(self, style):
        """
        Return True when the named style is switched on.

        For a dictionary of styles the style must be present with a true
        value; for any other container plain membership is used. An empty
        or missing style collection has no styles.
        """
        # The earlier implementation negated the value and therefore
        # reported only the disabled styles as present.
        styles = self.styles
        if not styles:
            return False
        if isinstance(styles, dict):
            return bool(styles.get(style))
        return style in styles

    def _setDefaultParameters(self, defaults=None, valueLimits=None):
        """
        Add to the default parameters (and value limits) of this builder.

        The defaults are expanded with Utils.Parameters.get() and merged
        into self._defaultParameters; valueLimits, when given, are merged
        into self._parameterValueLimits. Returns None.
        """
        # Initialize
        if self._defaultParameters is None:
            self._defaultParameters = {}
        if self._parameterValueLimits is None:
            self._parameterValueLimits = {}
        newParameters = Utils.Parameters.get({},
                                             defaults,
                                             valueLimits=valueLimits)
        self._defaultParameters.update(newParameters)
        if valueLimits is not None:
            self._parameterValueLimits.update(valueLimits)

    def getParameters(self, parameters):
        """
        Parse the parameters with Utils.Parameters.get(), using the
        defaults and value limits registered for this builder.
        """
        return Utils.Parameters.get(parameters,
                                    defaults=self._defaultParameters,
                                    valueLimits=self._parameterValueLimits)

    def setFeature(self, name, value):
        """
        Set the value of a feature in self.features, keyed by the id that
        the feature set gives to self.featureTag + name.

        self.features is not created by this class and must exist already.
        """
        self.features[self.featureSet.getId(self.featureTag + name)] = value

    def getElementCounts(self, filename):
        """
        Count the lines of a (possibly gzipped) file that contain
        "<document" or "<sentence".

        A line containing "<document" is counted as a document only.
        Returns a dictionary with the keys "documents" and "sentences".
        """
        print >> sys.stderr, "Counting elements:",
        if filename.endswith(".gz"):
            f = gzip.open(filename, "rt")
        else:
            f = open(filename, "rt")
        counts = {"documents": 0, "sentences": 0}
        # Close the file even if reading fails
        try:
            for line in f:
                if "<document" in line:
                    counts["documents"] += 1
                elif "<sentence" in line:
                    counts["sentences"] += 1
        finally:
            f.close()
        print >> sys.stderr, counts
        return counts

    def saveIds(self):
        """
        Write the class and feature id sets to self.classIdFilename and
        self.featureIdFilename; a set whose file name is None is skipped.
        """
        if self.classIdFilename is not None:
            print >> sys.stderr, "Saving class names to", self.classIdFilename
            self.classSet.write(self.classIdFilename)
        else:
            print >> sys.stderr, "Class names not saved"
        if self.featureIdFilename is not None:
            print >> sys.stderr, "Saving feature names to", self.featureIdFilename
            self.featureSet.write(self.featureIdFilename)
        else:
            print >> sys.stderr, "Feature names not saved"

    def processCorpus(self,
                      input,
                      output,
                      gold=None,
                      append=False,
                      allowNewIds=True,
                      structureAnalyzer=None):
        """
        Build examples for a whole corpus and write them to the output file.

        input: the corpus; when it is a string it is also passed to
               getElementCounts() so that progress can be shown
        output: name of the example file, gzipped if it ends with ".gz";
                missing directories are created
        gold: optional gold corpus, iterated in lockstep with the input
        append: append to the output file instead of overwriting it
        allowNewIds: when true the id sets are saved with saveIds()
        structureAnalyzer: passed through to processDocument()
        """
        # Create intermediate paths if needed
        outputDir = os.path.dirname(output)
        if outputDir != "" and not os.path.exists(outputDir):
            os.makedirs(outputDir)
        # Open output file
        openStyle = "wt"
        if append:
            openStyle = "at"
        if output.endswith(".gz"):
            outfile = gzip.open(output, openStyle)
        else:
            outfile = open(output, openStyle)

        # The output file is closed even if example building fails
        try:
            # Build examples
            self.exampleCount = 0
            if isinstance(input, types.StringTypes):
                self.elementCounts = self.getElementCounts(input)
                if self.elementCounts["sentences"] > 0:
                    self.progress = ProgressCounter(
                        self.elementCounts["sentences"], "Build examples")
                else:
                    self.elementCounts = None
                    self.progress = ProgressCounter(None, "Build examples")
            else:
                self.elementCounts = None
                self.progress = ProgressCounter(None, "Build examples")

            # The sentences are loaded once here to define the predicted
            # value range; the corpus is iterated again below.
            self.calculatePredictedRange(
                self.getSentences(input, self.parse, self.tokenization))

            removeIntersentenceInteractions = True
            if "keep_intersentence" in self.styles and self.styles[
                    "keep_intersentence"]:
                print >> sys.stderr, "Keeping intersentence interactions for input corpus"
                removeIntersentenceInteractions = False
            inputIterator = getCorpusIterator(
                input,
                None,
                self.parse,
                self.tokenization,
                removeIntersentenceInteractions=removeIntersentenceInteractions)

            if gold is not None:
                removeGoldIntersentenceInteractions = True
                if "keep_intersentence_gold" in self.styles and self.styles[
                        "keep_intersentence_gold"]:
                    print >> sys.stderr, "Keeping intersentence interactions for gold corpus"
                    removeGoldIntersentenceInteractions = False
                goldIterator = getCorpusIterator(
                    gold,
                    None,
                    self.parse,
                    self.tokenization,
                    removeIntersentenceInteractions=
                    removeGoldIntersentenceInteractions)
                # izip_longest pads the shorter corpus with None, so the
                # asserts catch input and gold corpora of different length
                for inputSentences, goldSentences in itertools.izip_longest(
                        inputIterator, goldIterator, fillvalue=None):
                    assert inputSentences != None
                    assert goldSentences != None
                    self.processDocument(inputSentences,
                                         goldSentences,
                                         outfile,
                                         structureAnalyzer=structureAnalyzer)
            else:
                for inputSentences in inputIterator:
                    self.processDocument(inputSentences,
                                         None,
                                         outfile,
                                         structureAnalyzer=structureAnalyzer)
        finally:
            outfile.close()
        self.progress.endUpdate()

        # Show statistics
        print >> sys.stderr, "Examples built:", self.exampleCount
        print >> sys.stderr, "Features:", len(self.featureSet.getNames())
        print >> sys.stderr, "Classes:", len(self.classSet.getNames())
        print >> sys.stderr, "Style:", Utils.Parameters.toString(
            self.getParameters(self.styles))
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()

        # Save Ids
        if allowNewIds:
            self.saveIds()

    def processDocument(self,
                        sentences,
                        goldSentences,
                        outfile,
                        structureAnalyzer=None):
        """
        Call processSentence() for every sentence of one document.

        goldSentences may be None; otherwise the gold sentence at the same
        index is passed along with each sentence.
        """
        for i, sentence in enumerate(sentences):
            goldSentence = None
            if goldSentences is not None:
                goldSentence = goldSentences[i]
            self.progress.update(
                1, "Building examples (" + sentence.sentence.get("id") + "): ")
            self.processSentence(sentence,
                                 outfile,
                                 goldSentence,
                                 structureAnalyzer=structureAnalyzer)

    def processSentence(self,
                        sentence,
                        outfile,
                        goldSentence=None,
                        structureAnalyzer=None):
        """
        Build the examples for one sentence and add their number to
        self.exampleCount.

        sentence: Utils.InteractionXML.SentenceElements.SentenceElements instance
        sentence.sentence: Element 'sentence' in the xml file

        When the "sentenceLimit" style is set, a sentence is skipped if any
        rule names one of its attributes but the rule's value is not a
        substring of that attribute. Sentences without a sentence graph
        produce no examples.
        """
        # Process filtering rules
        if "sentenceLimit" in self.styles and self.styles[
                "sentenceLimit"]:  # Rules for limiting which sentences to process
            # Get the rule list
            limitRules = self.styles["sentenceLimit"]
            if isinstance(limitRules, types.StringTypes):
                limitRules = [limitRules]
            # Get the list of sentence element attribute names
            sentenceElement = sentence.sentence
            sentenceAttributes = sorted(sentenceElement.attrib.keys())
            # Filter sentences based on matching rules to their attribute values
            for rule in limitRules:
                for sentAttr in sentenceAttributes:
                    # Rules are of the form "attr.value" where "attr" is the
                    # name of the attribute to match, and "value" a substring
                    # within that attribute
                    if rule.startswith(sentAttr + "."):  # rule matches the attribute
                        value = rule.split(".", 1)[-1]  # the value part of the rule
                        # rule value must be a substring of the attribute value
                        if value not in sentenceElement.get(sentAttr):
                            return  # discard sentences that do not match all rules
        # Process the sentence
        if sentence.sentenceGraph is not None:
            goldGraph = None
            if goldSentence is not None:
                goldGraph = goldSentence.sentenceGraph
            self.exampleCount += self.buildExamplesFromGraph(
                sentence.sentenceGraph,
                outfile,
                goldGraph,
                structureAnalyzer=structureAnalyzer)

    @classmethod
    def run(cls,
            input,
            output,
            parse,
            tokenization,
            style,
            classIds=None,
            featureIds=None,
            gold=None,
            append=False,
            allowNewIds=True,
            structureAnalyzer=None,
            debug=False):
        """
        Create a builder of this class, process the corpus and return the
        builder.

        The settings are reported on stderr. A style that is not a string
        is converted with Utils.Parameters.toString(). classIds and
        featureIds are file names used both for loading existing ids (see
        getIdSets) and for saving them afterwards.
        """
        print >> sys.stderr, "Running", cls.__name__
        print >> sys.stderr, "  input:", input
        if gold is not None:
            print >> sys.stderr, "  gold:", gold
        print >> sys.stderr, "  output:", output, "(append:", str(append) + ")"
        print >> sys.stderr, "  add new class/feature ids:", allowNewIds
        if not isinstance(style, types.StringTypes):
            style = Utils.Parameters.toString(style)
        print >> sys.stderr, "  style:", style
        if tokenization is None:
            print >> sys.stderr, "  parse:", parse
        else:
            print >> sys.stderr, "  parse:", parse + ", tokenization:", tokenization
        classSet, featureSet = cls.getIdSets(classIds, featureIds, allowNewIds)
        builder = cls(style=style, classSet=classSet, featureSet=featureSet)
        builder.debug = debug
        builder.classIdFilename = classIds
        builder.featureIdFilename = featureIds
        builder.parse = parse
        builder.tokenization = tokenization
        builder.processCorpus(input,
                              output,
                              gold,
                              append=append,
                              allowNewIds=allowNewIds,
                              structureAnalyzer=structureAnalyzer)
        return builder

    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None,
                               structureAnalyzer=None):
        """
        Build the examples for one sentence graph; to be implemented by
        subclasses.

        processSentence() adds the return value to self.exampleCount and
        always passes the structureAnalyzer keyword, which is why it is
        part of this signature.
        """
        raise NotImplementedError

    def definePredictedValueRange(self, sentences, elementName):
        """Hook called by calculatePredictedRange(); does nothing here."""
        pass

    def getPredictedValueRange(self):
        """Return the predicted value range; always None in this class."""
        return None

    @classmethod
    def getIdSets(cls, classIds=None, featureIds=None, allowNewIds=True):
        """
        Load the class and feature id sets from the given files.

        Returns the tuple (classSet, featureSet); an entry is None when
        its file name is None or the file does not exist.
        """
        # Class ids
        if classIds is not None and os.path.exists(classIds):
            print >> sys.stderr, "Using predefined class names from", classIds
            classSet = IdSet(allowNewIds=allowNewIds)
            classSet.load(classIds)
        else:
            print >> sys.stderr, "No predefined class names"
            classSet = None
        # Feature ids
        if featureIds is not None and os.path.exists(featureIds):
            print >> sys.stderr, "Using predefined feature names from", featureIds
            featureSet = IdSet(allowNewIds=allowNewIds)
            featureSet.load(featureIds)
        else:
            print >> sys.stderr, "No predefined feature names"
            featureSet = None
        return classSet, featureSet

    def getSentences(self, input, parse, tokenization, removeNameInfo=False):
        """
        Return the sentences of the input as [sentenceGraph, None] pairs.

        An input that is not a list is loaded with
        Core.SentenceGraph.loadCorpus(); sentences without a sentence graph
        are left out. A list is assumed to hold such pairs already and is
        returned unchanged (removeNameInfo must then be False).
        """
        if not isinstance(input, list):
            # Load corpus and make sentence graphs
            corpusElements = Core.SentenceGraph.loadCorpus(
                input, parse, tokenization, removeNameInfo=removeNameInfo)
            sentences = []
            for sentence in corpusElements.sentences:
                if sentence.sentenceGraph is not None:  # required for event detection
                    sentences.append([sentence.sentenceGraph, None])
            return sentences
        else:  # assume input is already a list of sentences
            assert (removeNameInfo == False)
            return input

    def calculatePredictedRange(self, sentences):
        """
        Pass the sentence elements of the given [sentenceGraph, ...] pairs
        to definePredictedValueRange() and report the resulting range on
        stderr.
        """
        print >> sys.stderr, "Defining predicted value range:",
        sentenceElements = []
        for sentence in sentences:
            sentenceElements.append(sentence[0].sentenceElement)
        self.definePredictedValueRange(sentenceElements, "entity")
        print >> sys.stderr, self.getPredictedValueRange()
示例#32
0
 def test(cls, examples, modelPath, output=None, parameters=None, forceInternal=False, classIds=None): # , timeout=None):
     """
     Classify examples with a pre-trained model.

     One external classifier process is run for every class name except
     "neg" and names containing "---", and the per-class results are merged
     with cls.addPredictions(). The merged predictions are returned and,
     when output is given, also written to that file.

     @type examples: string (filename) or list (or iterator) of examples
     @param examples: a list or file containing examples in SVM-format
     @type modelPath: string
     @param modelPath: filename of the pre-trained model file
     @type parameters: a dictionary or string
     @param parameters: parameters for the classifier
     @type output: string
     @param output: the name of the predictions file to be written
     @type forceInternal: Boolean
     @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py
     @type classIds: string (filename) or an id set
     @param classIds: the class ids; required, although the default is None
     """
     if type(parameters) == types.StringType:
         parameters = splitParameters(parameters)
     timer = Timer()
     if type(examples) == types.ListType:
         print >> sys.stderr, "Classifying", len(examples), "with SVM-MultiClass model", modelPath
         # Only "cls" is bound in this method; the former references to
         # "self" raised a NameError.
         # NOTE(review): assumes filterClassificationSet and tempDir are
         # reachable through the class - confirm
         examples, predictions = cls.filterClassificationSet(examples, False)
         testPath = cls.tempDir+"/test.dat"
         Example.writeExamples(examples, testPath)
     else:
         print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath
         testPath = examples
         examples = Example.readExamples(examples,False)
     if parameters != None:
         # Work on a copy so that the caller's dictionary is not modified
         parameters = copy.copy(parameters)
         if "c" in parameters:
             del parameters["c"]
         if "predefined" in parameters:
             modelPath = os.path.join(parameters["predefined"][0],"classifier/model")
             del parameters["predefined"]
     # Read model
     if modelPath == None:
         modelPath = "model-multilabel"
     classModels = {}
     thresholds = {}
     if modelPath.endswith(".gz"):
         f = gzip.open(modelPath, "rt")
     else:
         f = open(modelPath, "rt")
     try:
         # Each line holds a class name, its model file and a threshold
         for line in f:
             key, value, threshold = line.split()
             classModels[key] = value
             if threshold != "None":
                 thresholds[key] = float(threshold)
             else:
                 thresholds[key] = 0.0
     finally:
         f.close()
     mergedPredictions = []
     if type(classIds) == types.StringType:
         classIds = IdSet(filename=classIds)
     print >> sys.stdout, "Thresholds", thresholds
     classifierBin = Settings.SVMMultiClassDir+"/svm_multiclass_classify"
     print >> sys.stdout, parameters
     # parameters may still be None here, so test for that first
     if parameters != None and "classifier" in parameters and "svmperf" in parameters["classifier"]:
         classifierBin = Settings.SVMPerfDir+"/svm_perf_classify"
         parameters = copy.copy(parameters)
         del parameters["classifier"]
     for className in classIds.getNames():
         if className != "neg" and not "---" in className:
             classId = classIds.getId(className)
             classThreshold = thresholds[str(className)]
             if classThreshold != 0.0:
                 print >> sys.stderr, "Classifying", className, "with threshold", classThreshold
             else:
                 print >> sys.stderr, "Classifying", className
             classOutput = "predictions" + ".cls-" + className
             args = [classifierBin, testPath, classModels[str(className)], classOutput]
             logFile = open("svmmulticlass" + ".cls-" + className + ".log","at")
             # Close the per-class log once the classifier has finished
             try:
                 print >> sys.stdout, args
                 subprocess.call(args, stdout = logFile, stderr = logFile)
             finally:
                 logFile.close()
             cls.addPredictions(classOutput, mergedPredictions, classId, len(classIds.Ids), threshold=classThreshold)
     print >> sys.stderr, timer.toString()

     # Normalize the merged predictions and write them out; the file is
     # only written when an output name was given
     f = None
     if output != None:
         f = open(output, "wt")
     try:
         for mergedPred in mergedPredictions:
             # Drop "1" whenever it occurs together with other class names
             if len(mergedPred[0]) > 1 and "1" in mergedPred[0]:
                 mergedPred[0].remove("1")
             mergedPred[1] = str(mergedPred[1])
             mergedPred[0] = ",".join(sorted(list(mergedPred[0])))
             if f != None:
                 f.write(" ".join(mergedPred) + "\n")
     finally:
         if f != None:
             f.close()

     return mergedPredictions
示例#33
0
def compareExamples(examples1, examples2, features1, features2=None):
    ExampleUtils.readExamples(examples1)
    exampleIter1 = ExampleUtils.readExamples(examples1)
    exampleIter2 = ExampleUtils.readExamples(examples2)
    features1 = IdSet(filename=features1)
    if features2 != None:
        features2 = IdSet(filename=features2)
    else:
        features2 = features1
    # Compare feature sets
    if set(features1.Ids.keys()) != set(features2.Ids.keys()):
        print "Feature sets differ"
    # Compare examples
    counter = ProgressCounter(step=1)
    for e1, e2 in itertools.izip(exampleIter1, exampleIter2):
        counter.update()
        assert e1[0] == e2[0], (removeFeatures(e1), removeFeatures(e2))
        if e1[1] != e2[1]:
            print "Class differs"
            print "  E1", removeFeatures(e1)
            print "  E2", removeFeatures(e2)
        f1 = getFeatureNames(e1, features1)
        f2 = getFeatureNames(e2, features2)
        f1Set = set(f1)
        f2Set = set(f2)
        f1Only = f1Set.difference(f2Set)
        f2Only = f2Set.difference(f1Set)
        if len(f1Only) > 0 or len(f2Only) > 0:
            print "Features differ"
            print "  E1", removeFeatures(e1)
            print "  E2", removeFeatures(e2)
            if len(f1Only) > 0:
                print "  E1-only features:", f1Only
            if len(f2Only) > 0:
                print "  E2-only features:", f2Only
        else:
            assert len(f1) == len(f2)
            fCount = 0
            differ = False
            for feature1, feature2 in zip(f1, f2):
                #f1Id = features1.getId(feature1, createIfNotExist=False)
                #if f1Id == 454 or feature1 == "e1_strength_Positive_regulation":
                #    print "!!!!!!!!!!!", 454, feature1, e1[2][f1Id]
                if feature1 != feature2:
                    if not differ:
                        print "Feature order differs for example", e1[0]
                        differ = True
                    print "[" + feature1 + "/" + feature2 + "](" + str(fCount) + ") ",
                else:
                    f1Id = features1.getId(feature1, createIfNotExist=False)
                    f2Id = features2.getId(feature2, createIfNotExist=False)
                    f1Value = e1[2][f1Id]
                    f2Value = e2[2][f2Id]
                    if f1Value != f2Value:
                        if not differ:
                            print "Feature values differ", e1[0]
                            differ = True
                        print "[" + feature1 + "/" + str(f1Id) + "]" + "[" + str(f1Value) + "/" + str(f2Value) + "]" + "(" + str(fCount) + ") ",
                fCount += 1              
            if differ:
                print
    counter.endUpdate()
示例#34
0
    def __init__(self, style=None, length=None, types=[], featureSet=None, classSet=None):
        """
        Set up the example builder and the feature builders selected by style.

        @param style: style parameters, parsed with self.getParameters. If None,
            the "typed", "directed" and "headsOnly" styles are switched on.
        @param length: stored as self.pathLengths, must be None (asserted)
        @param types: stored as self.types
        @param featureSet: feature id set, a new IdSet is created if None
        @param classSet: class id set, a new IdSet(1) is created if None. The
            id of "neg" must be 1, or -1 for a set of exactly two classes.
        """
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        assert( classSet.getId("neg") == 1 or (len(classSet.Ids)== 2 and classSet.getId("neg") == -1) )
        
        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        
        self.styles = self.getParameters(style, [
            "typed", "directed", "headsOnly", "graph_kernel", "noAnnType", "noMasking", "maxFeatures",
            "genia_limits", "epi_limits", "id_limits", "rel_limits", "bb_limits", "bi_limits", "co_limits",
            "genia_task1", "ontology", "nodalida", "bacteria_renaming", "trigger_features", "rel_features",
            "ddi_features", "evex", "giuliano", "random", "themeOnly", "causeOnly", "no_path", "entities", 
            "skip_extra_triggers", "no_task", "no_dependency", 
            "disable_entity_features", "disable_terminus_features", "disable_single_element_features", 
            "disable_ngram_features", "disable_path_edge_features", "no_linear", "subset", "binary", "pos_only",
            "entity_type"
        ])
        if style is None: # no parameters given
            # The defaults go into the parsed parameters; the "style" argument
            # itself is None here and cannot be indexed
            self.styles["typed"] = self.styles["directed"] = self.styles["headsOnly"] = True
        
        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        # NOTE Temporarily re-enabling predicted range
        #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None)
        if self.styles["graph_kernel"]:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
        if self.styles["noAnnType"]:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if self.styles["noMasking"]:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if self.styles["maxFeatures"]:
            self.multiEdgeFeatureBuilder.maximum = True
        if self.styles["genia_task1"]:
            self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity")
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        if self.styles["ontology"]:
            self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
        if self.styles["nodalida"]:
            self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
        if self.styles["bacteria_renaming"]:
            self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(self.featureSet)
        if self.styles["trigger_features"]:
            self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
            self.triggerFeatureBuilder.useNonNameEntities = True
            if self.styles["genia_task1"]:
                self.triggerFeatureBuilder.filterAnnTypes.add("Entity")
        # All feature builders share the id set owned by this example builder
        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(self.featureSet)
        if self.styles["ddi_features"]:
            self.drugFeatureBuilder = DrugFeatureBuilder(self.featureSet)
        if self.styles["evex"]:
            self.evexFeatureBuilder = EVEXFeatureBuilder(self.featureSet)
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(self.featureSet)
        self.pathLengths = length
        assert(self.pathLengths == None)
        self.types = types
        if self.styles["random"]:
            from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
            self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
示例#35
0
    def test(cls,
             examples,
             modelPath,
             output=None,
             parameters=None,
             forceInternal=False,
             classIds=None):  # , timeout=None):
        """
        Classify examples with a pre-trained model.
        
        @type examples: string (filename) or list (or iterator) of examples
        @param examples: a list or file containing examples in SVM-format
        @type modelPath: string
        @param modelPath: filename of the pre-trained model file
        @type parameters: a dictionary or string
        @param parameters: parameters for the classifier
        @type output: string
        @param output: the name of the predictions file to be written
        @type forceInternal: Boolean
        @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py
        """
        if type(parameters) == types.StringType:
            parameters = splitParameters(parameters)
        timer = Timer()
        if type(examples) == types.ListType:
            print >> sys.stderr, "Classifying", len(
                examples), "with SVM-MultiClass model", modelPath
            examples, predictions = self.filterClassificationSet(
                examples, False)
            testPath = self.tempDir + "/test.dat"
            Example.writeExamples(examples, testPath)
        else:
            print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath
            testPath = examples
            examples = Example.readExamples(examples, False)
        if parameters != None:
            parameters = copy.copy(parameters)
            if parameters.has_key("c"):
                del parameters["c"]
            if parameters.has_key("predefined"):
                parameters = copy.copy(parameters)
                modelPath = os.path.join(parameters["predefined"][0],
                                         "classifier/model")
                del parameters["predefined"]
        # Read model
        if modelPath == None:
            modelPath = "model-multilabel"
        classModels = {}
        if modelPath.endswith(".gz"):
            f = gzip.open(modelPath, "rt")
        else:
            f = open(modelPath, "rt")
        thresholds = {}
        for line in f:
            key, value, threshold = line.split()
            classModels[key] = value
            if threshold != "None":
                thresholds[key] = float(threshold)
            else:
                thresholds[key] = 0.0
        f.close()
        mergedPredictions = []
        if type(classIds) == types.StringType:
            classIds = IdSet(filename=classIds)
        #print classModels
        print "Thresholds", thresholds
        classifierBin = Settings.SVMMultiClassDir + "/svm_multiclass_classify"
        print parameters
        if "classifier" in parameters and "svmperf" in parameters["classifier"]:
            classifierBin = Settings.SVMPerfDir + "/svm_perf_classify"
            parameters = copy.copy(parameters)
            del parameters["classifier"]
        for className in classIds.getNames():
            if className != "neg" and not "---" in className:
                classId = classIds.getId(className)
                if thresholds[str(className)] != 0.0:
                    print >> sys.stderr, "Classifying", className, "with threshold", thresholds[
                        str(className)]
                else:
                    print >> sys.stderr, "Classifying", className
                args = [classifierBin]
                #self.__addParametersToSubprocessCall(args, parameters)
                classOutput = "predictions" + ".cls-" + className
                logFile = open("svmmulticlass" + ".cls-" + className + ".log",
                               "at")
                args += [testPath, classModels[str(className)], classOutput]
                print args
                subprocess.call(args, stdout=logFile, stderr=logFile)
                cls.addPredictions(classOutput,
                                   mergedPredictions,
                                   classId,
                                   len(classIds.Ids),
                                   threshold=thresholds[str(className)])
        print >> sys.stderr, timer.toString()

        predFileName = output
        f = open(predFileName, "wt")
        for mergedPred in mergedPredictions:
            if len(mergedPred[0]) > 1 and "1" in mergedPred[0]:
                mergedPred[0].remove("1")
            mergedPred[1] = str(mergedPred[1])
            mergedPred[0] = ",".join(sorted(list(mergedPred[0])))
            f.write(" ".join(mergedPred) + "\n")
        f.close()

        return mergedPredictions
def compareExamples(examples1, examples2, features1, features2=None):
    ExampleUtils.readExamples(examples1)
    exampleIter1 = ExampleUtils.readExamples(examples1)
    exampleIter2 = ExampleUtils.readExamples(examples2)
    features1 = IdSet(filename=features1)
    if features2 != None:
        features2 = IdSet(filename=features2)
    else:
        features2 = features1
    # Compare feature sets
    if set(features1.Ids.keys()) != set(features2.Ids.keys()):
        print "Feature sets differ"
    # Compare examples
    counter = ProgressCounter(step=1)
    for e1, e2 in itertools.izip(exampleIter1, exampleIter2):
        counter.update()
        assert e1[0] == e2[0], (removeFeatures(e1), removeFeatures(e2))
        if e1[1] != e2[1]:
            print "Class differs"
            print "  E1", removeFeatures(e1)
            print "  E2", removeFeatures(e2)
        f1 = getFeatureNames(e1, features1)
        f2 = getFeatureNames(e2, features2)
        f1Set = set(f1)
        f2Set = set(f2)
        f1Only = f1Set.difference(f2Set)
        f2Only = f2Set.difference(f1Set)
        if len(f1Only) > 0 or len(f2Only) > 0:
            print "Features differ"
            print "  E1", removeFeatures(e1)
            print "  E2", removeFeatures(e2)
            if len(f1Only) > 0:
                print "  E1-only features:", f1Only
            if len(f2Only) > 0:
                print "  E2-only features:", f2Only
        else:
            assert len(f1) == len(f2)
            fCount = 0
            differ = False
            for feature1, feature2 in zip(f1, f2):
                #f1Id = features1.getId(feature1, createIfNotExist=False)
                #if f1Id == 454 or feature1 == "e1_strength_Positive_regulation":
                #    print "!!!!!!!!!!!", 454, feature1, e1[2][f1Id]
                if feature1 != feature2:
                    if not differ:
                        print "Feature order differs for example", e1[0]
                        differ = True
                    print "[" + feature1 + "/" + feature2 + "](" + str(
                        fCount) + ") ",
                else:
                    f1Id = features1.getId(feature1, createIfNotExist=False)
                    f2Id = features2.getId(feature2, createIfNotExist=False)
                    f1Value = e1[2][f1Id]
                    f2Value = e2[2][f2Id]
                    if f1Value != f2Value:
                        if not differ:
                            print "Feature values differ", e1[0]
                            differ = True
                        print "[" + feature1 + "/" + str(
                            f1Id) + "]" + "[" + str(f1Value) + "/" + str(
                                f2Value) + "]" + "(" + str(fCount) + ") ",
                fCount += 1
            if differ:
                print
    counter.endUpdate()
示例#37
0
class SingleEdgeExampleBuilder(ExampleBuilder):
    """
    Example builder working on single parse dependencies. Every dependency of
    a sentence yields one or more examples: positive ones for the annotated
    interaction edges between the two tokens of the dependency, negative ones
    otherwise. With the "headsOnly" style only dependencies between entity
    head tokens are used.
    """
    def __init__(self, style):
        """
        @param style: container of style names. Unless it contains "binary",
            a multiclass id set is created in which "neg" has the id 1.
        """
        ExampleBuilder.__init__(self)
        self.featureBuilder = EdgeFeatureBuilder(self.featureSet)
        self.style = style
        if "binary" not in style:
            self.classSet = IdSet(1)
            assert (self.classSet.getId("neg") == 1)

    def buildExamples(self, sentenceGraph):
        """
        Build the examples for all dependency edges of one sentence.

        @param sentenceGraph: the sentence, its dependencyGraph, interactionGraph
            and tokenIsEntityHead attributes are used
        @return: list of examples as returned by buildExample
        """
        examples = []
        nextIndex = 0
        for depEdge in sentenceGraph.dependencyGraph.edges():
            if "headsOnly" in self.style:
                headFlags = sentenceGraph.tokenIsEntityHead
                if headFlags[depEdge[0]] == None or headFlags[depEdge[1]] == None:
                    continue

            edgeFound = False
            # The dependency direction is handled first, then the reverse one
            for source, target, isReverse in ((depEdge[0], depEdge[1], False),
                                              (depEdge[1], depEdge[0], True)):
                if sentenceGraph.interactionGraph.has_edge(source, target):
                    for intEdge in sentenceGraph.interactionGraph.get_edge(source, target):
                        examples.append(
                            self.buildExample(depEdge, intEdge, isReverse,
                                              nextIndex, sentenceGraph))
                        nextIndex += 1
                        edgeFound = True
                elif "directed" in self.style:
                    # Directed mode: one negative per direction without an interaction
                    examples.append(
                        self.buildExample(depEdge, None, None, nextIndex,
                                          sentenceGraph))
                    nextIndex += 1

            # Undirected mode: a single negative if neither direction matched
            if not (edgeFound or "directed" in self.style):
                examples.append(
                    self.buildExample(depEdge, None, None, nextIndex,
                                      sentenceGraph))
                nextIndex += 1

        return examples

    def buildExample(self, depEdge, intEdge, isReverse, exampleIndex,
                     sentenceGraph):
        """
        Build a single example for a dependency edge.

        @param depEdge: the dependency edge, a pair of token elements
        @param intEdge: the interaction edge element, or None for a negative
        @param isReverse: True if intEdge runs against the dependency direction
        @param exampleIndex: running number used in the example id
        @param sentenceGraph: the sentence the dependency belongs to
        @return: tuple (id, category, features, extra)
        """
        if "binary" in self.style:
            categoryName = "i"
            category = 1 if intEdge != None else -1
        elif intEdge != None:
            categoryName = intEdge.attrib["type"]
            if isReverse and "directed" in self.style:
                categoryName += "_rev"
            category = self.classSet.getId(categoryName)
        else:
            categoryName = "neg"
            category = 1

        features = self.buildFeatures(depEdge, sentenceGraph)

        # Define extra attributes f.e. for the visualizer. The token with the
        # smaller number at the end of its id is always stored as "t1".
        firstNumber = int(depEdge[0].attrib["id"].split("_")[-1])
        secondNumber = int(depEdge[1].attrib["id"].split("_")[-1])
        if firstNumber < secondNumber:
            t1, t2, depReversed = depEdge[0], depEdge[1], False
        else:
            t1, t2, depReversed = depEdge[1], depEdge[0], True
        extra = {"xtype": "edge", "type": categoryName, "t1": t1, "t2": t2}
        extra["deprev"] = depReversed

        exampleId = sentenceGraph.getSentenceId() + ".x" + str(exampleIndex)
        return (exampleId, category, features, extra)

    def buildFeatures(self, depEdge, sentenceGraph):
        """
        Build the feature vector for a dependency edge.

        @return: the feature dictionary filled by self.featureBuilder
        """
        features = {}
        builder = self.featureBuilder
        builder.setFeatureVector(features)
        builder.buildEdgeFeatures(depEdge, sentenceGraph, "dep_",
                                  text=True, POS=True, annType=True,
                                  maskNames=True)
        builder.buildAttachedEdgeFeatures(depEdge, sentenceGraph, "",
                                          text=False, POS=True, annType=False,
                                          maskNames=True)
        builder.buildLinearOrderFeatures(depEdge)
        # Detach the vector from the builder again
        builder.setFeatureVector(None)
        return features
# Trigger class names for which a separate one-class example set is made
triggerClasses = ["Binding", 
       "Gene_expression", 
       "Localization", 
       "Negative_regulation", 
       "Phosphorylation", 
       "Positive_regulation", 
       "Protein_catabolism", 
       "Regulation", 
       "Transcription"]

# Class ids of the full (multiclass) trigger example set
classSet = IdSet(filename=TRIGGER_IDS+".class_names")
for triggerClass in triggerClasses:
    # Make one-class versions of the train and test example files
    makeOneClassExamples(TRIGGER_TRAIN_EXAMPLE_FILE, TRIGGER_TRAIN_EXAMPLE_FILE + "-" + triggerClass, triggerClass, classSet)
    makeOneClassExamples(TRIGGER_TEST_EXAMPLE_FILE, TRIGGER_TEST_EXAMPLE_FILE + "-" + triggerClass, triggerClass, classSet)
    # Write a two-class id file: "neg" plus the current trigger class, which
    # keeps the id it has in the full class set
    d = {"neg":1, triggerClass:classSet.getId(triggerClass, False)}
    triggerClassIds = IdSet(idDict = d)
    TRIGGER_CLASS_IDS = "trigger-ids-"+triggerClass+".class_names"
    triggerClassIds.write(TRIGGER_CLASS_IDS)

    print >> sys.stderr, "Trigger models for parse", PARSE_TAG, "for class", triggerClass
    TRIGGER_CLASSIFIER_PARAMS="c:" + options.triggerParams
    # A connection is made unless options.csc contains "local"
    if "local" not in options.csc:
        clear = False
        if "clear" in options.csc: clear = True
        # NOTE(review): both branches below are identical in this view, the
        # account strings look masked - confirm the intended accounts
        if "louhi" in options.csc:
            c = CSCConnection(CSC_WORKDIR+"/trigger-models-"+triggerClass, "*****@*****.**", clear)
        else:
            c = CSCConnection(CSC_WORKDIR+"/trigger-models-"+triggerClass, "*****@*****.**", clear)
    else:
        c = None
示例#39
0
class ExampleBuilder:
    structureAnalyzer = None
    """ 
    ExampleBuilder is the abstract base class for specialized example builders.
    Example builders take some data and convert it to examples usable by e.g. SVMs.
    An example builder writes three files, an example-file (in extended Joachim's
    SVM format) and .class_names and .feature_names files, which contain the names
    for the class and feature id-numbers. An example builder can also be given
    pre-existing sets of class and feature ids (optionally in files) so that the
    generated examples are consistent with other, previously generated examples.
    """    
    def __init__(self, classSet=None, featureSet=None):
        """
        Initialize the id sets, the statistics and the style parameters.

        @param classSet: class id set, a string is taken as the name of a file
            that is loaded with IdSet(filename=...)
        @param featureSet: feature id set, a string is taken as the name of a
            file that is loaded with IdSet(filename=...)
        """
        # Filenames are replaced by the id sets loaded from them
        if type(classSet) == types.StringType:
            classSet = IdSet(filename=classSet)
        self.classSet = classSet
        if type(featureSet) == types.StringType:
            featureSet = IdSet(filename=featureSet)
        self.featureSet = featureSet

        self.featureTag = ""
        self.exampleStats = ExampleStats()
        # Parse and tokenization are left unset here
        self.parse = self.tokenization = None
        # No id files are written by saveIds unless these are set
        self.classIdFilename = None
        self.featureIdFilename = None

        # Style parameters: the registries must be reset before the first
        # default ("sentenceLimit") is registered
        self.styles = None
        self._defaultParameters = None
        self._parameterValueLimits = None
        self._setDefaultParameters(["sentenceLimit"])
        self.debug = False
    
    def _setDefaultParameters(self, defaults=None, valueLimits=None):
        """
        Add default parameters and value limits to the ones already known.

        @param defaults: the new defaults, processed with Utils.Parameters.get
        @param valueLimits: value limits for the parameters, merged into the
            known limits unless None
        """
        # Create the registries on first use
        if self._defaultParameters is None:
            self._defaultParameters = {}
        if self._parameterValueLimits is None:
            self._parameterValueLimits = {}
        parsedDefaults = Utils.Parameters.get({}, defaults, valueLimits=valueLimits)
        self._defaultParameters.update(parsedDefaults)
        if valueLimits is not None:
            self._parameterValueLimits.update(valueLimits)
    
    def getParameters(self, parameters):
        """
        Process parameters with Utils.Parameters.get, using the defaults and
        value limits registered with _setDefaultParameters.

        @return: the result of Utils.Parameters.get
        """
        parsed = Utils.Parameters.get(parameters,
                                      defaults=self._defaultParameters,
                                      valueLimits=self._parameterValueLimits)
        return parsed
    
    def setFeature(self, name, value):
        """
        Store value in self.features under the id of the feature called
        self.featureTag + name.

        @param name: feature name without the tag
        @param value: the value of the feature
        """
        featureVector = self.features
        featureId = self.featureSet.getId(self.featureTag + name)
        featureVector[featureId] = value
    
    def getElementCounts(self, filename):
        print >> sys.stderr, "Counting elements:",
        if filename.endswith(".gz"):
            f = gzip.open(filename, "rt")
        else:
            f = open(filename, "rt")
        counts = {"documents":0, "sentences":0}
        for line in f:
            if "<document" in line:
                counts["documents"] += 1
            elif "<sentence" in line:
                counts["sentences"] += 1
        f.close()
        print >> sys.stderr, counts
        return counts

    def saveIds(self):
        """
        Write the class and feature id sets to self.classIdFilename and
        self.featureIdFilename. A set whose filename is None is not saved.
        """
        if self.classIdFilename == None:
            print >> sys.stderr, "Class names not saved"
        else:
            print >> sys.stderr, "Saving class names to", self.classIdFilename
            self.classSet.write(self.classIdFilename)

        if self.featureIdFilename == None:
            print >> sys.stderr, "Feature names not saved"
        else:
            print >> sys.stderr, "Saving feature names to", self.featureIdFilename
            self.featureSet.write(self.featureIdFilename)

    def processCorpus(self, input, output, gold=None, append=False, allowNewIds=True, structureAnalyzer=None):
        """
        Build examples for a whole corpus and write them to the output file.

        @param input: the corpus, a string is also passed to getElementCounts
            to set up the progress counter
        @type output: string
        @param output: name of the example file, opened with gzip if it ends
            with ".gz". Missing directories of the path are created.
        @param gold: optional gold corpus, iterated in parallel with the input
        @param append: if True, examples are appended to the output file
        @param allowNewIds: if True, the id sets are saved with saveIds
        @param structureAnalyzer: passed on to processDocument
        """
        # Create intermediate paths if needed
        outputDir = os.path.dirname(output)
        if outputDir != "" and not os.path.exists(outputDir):
            os.makedirs(outputDir)
        # Open output file
        openStyle = "wt"
        if append:
            openStyle = "at"
        if output.endswith(".gz"):
            outfile = gzip.open(output, openStyle)
        else:
            outfile = open(output, openStyle)
        
        try:
            # Build examples
            self.exampleCount = 0
            if type(input) in types.StringTypes:
                self.elementCounts = self.getElementCounts(input)
                if self.elementCounts["sentences"] > 0:
                    self.progress = ProgressCounter(self.elementCounts["sentences"], "Build examples")
                else:
                    self.elementCounts = None
                    self.progress = ProgressCounter(None, "Build examples")
            else:
                self.elementCounts = None
                self.progress = ProgressCounter(None, "Build examples")
            
            self.calculatePredictedRange(self.getSentences(input, self.parse, self.tokenization))
            
            removeIntersentenceInteractions = True
            if "keep_intersentence" in self.styles and self.styles["keep_intersentence"]:
                print >> sys.stderr, "Keeping intersentence interactions for input corpus"
                removeIntersentenceInteractions = False
            inputIterator = getCorpusIterator(input, None, self.parse, self.tokenization, removeIntersentenceInteractions=removeIntersentenceInteractions)            
            
            if gold != None:
                removeGoldIntersentenceInteractions = True
                if "keep_intersentence_gold" in self.styles and self.styles["keep_intersentence_gold"]:
                    print >> sys.stderr, "Keeping intersentence interactions for gold corpus"
                    removeGoldIntersentenceInteractions = False
                goldIterator = getCorpusIterator(gold, None, self.parse, self.tokenization, removeIntersentenceInteractions=removeGoldIntersentenceInteractions)
                # izip_longest with the asserts makes a differing number of
                # documents in the input and gold corpora an error
                for inputSentences, goldSentences in itertools.izip_longest(inputIterator, goldIterator, fillvalue=None):
                    assert inputSentences != None
                    assert goldSentences != None
                    self.processDocument(inputSentences, goldSentences, outfile, structureAnalyzer=structureAnalyzer)
            else:
                for inputSentences in inputIterator:
                    self.processDocument(inputSentences, None, outfile, structureAnalyzer=structureAnalyzer)
        finally:
            # Close the example file also if building the examples fails
            outfile.close()
        self.progress.endUpdate()
        
        # Show statistics
        print >> sys.stderr, "Examples built:", self.exampleCount
        print >> sys.stderr, "Features:", len(self.featureSet.getNames())
        print >> sys.stderr, "Style:", Utils.Parameters.toString(self.getParameters(self.styles))
        if self.exampleStats.getExampleCount() > 0:
            self.exampleStats.printStats()
    
        # Save Ids
        if allowNewIds:
            self.saveIds()
    
    def processDocument(self, sentences, goldSentences, outfile, structureAnalyzer=None):
        """
        Process all sentences of one document with processSentence.

        @param sentences: the sentences of the document
        @param goldSentences: the gold sentences at the same positions, or None
        @param outfile: passed on to processSentence
        @param structureAnalyzer: passed on to processSentence
        """
        for position, sentence in enumerate(sentences):
            # Gold sentences are matched to the input sentences by position
            if goldSentences != None:
                goldSentence = goldSentences[position]
            else:
                goldSentence = None
            progressText = "Building examples ("+sentence.sentence.get("id")+"): "
            self.progress.update(1, progressText)
            self.processSentence(sentence, outfile, goldSentence, structureAnalyzer=structureAnalyzer)
    
    def processSentence(self, sentence, outfile, goldSentence=None, structureAnalyzer=None):
        """
        Build the examples of one sentence, unless it is filtered out by the
        "sentenceLimit" style.

        A limit rule has the form "attr.value". If the sentence element has the
        attribute "attr" and "value" is not a substring of its value, the
        sentence is skipped.

        @param sentence: the sentence, its "sentence" element and sentenceGraph
            are used
        @param outfile: passed on to buildExamplesFromGraph
        @param goldSentence: optional gold sentence, its sentenceGraph is
            passed on to buildExamplesFromGraph
        @param structureAnalyzer: passed on to buildExamplesFromGraph
        """
        # Process filtering rules
        limitRules = self.styles["sentenceLimit"]
        if limitRules:
            # A single rule may be given as a plain string
            if type(limitRules) in types.StringTypes:
                limitRules = [limitRules]
            element = sentence.sentence
            attributeNames = sorted(element.attrib.keys())
            for rule in limitRules:
                for attributeName in attributeNames:
                    if not rule.startswith(attributeName + "."):
                        continue # the rule is for another attribute
                    requiredValue = rule.split(".", 1)[-1]
                    if requiredValue not in element.get(attributeName):
                        return # the sentence does not match this rule
        # Process the sentence
        if sentence.sentenceGraph != None:
            goldGraph = goldSentence.sentenceGraph if goldSentence != None else None
            self.exampleCount += self.buildExamplesFromGraph(sentence.sentenceGraph, outfile, goldGraph, structureAnalyzer=structureAnalyzer)

    @classmethod
    def run(cls, input, output, parse, tokenization, style, classIds=None, featureIds=None, gold=None, append=False, allowNewIds=True, structureAnalyzer=None, debug=False):
        """
        Create a builder of this class and build the examples for a corpus.

        @param input: the input corpus, passed to processCorpus
        @param output: the example file, passed to processCorpus
        @param parse: stored in the builder's "parse" attribute
        @param tokenization: stored in the builder's "tokenization" attribute
        @param style: the style parameters, converted to a string with
            Utils.Parameters.toString unless already one
        @param classIds: name of the class id file, see getIdSets
        @param featureIds: name of the feature id file, see getIdSets
        @param gold: optional gold corpus, passed to processCorpus
        @param append: passed to processCorpus
        @param allowNewIds: passed to getIdSets and processCorpus
        @param structureAnalyzer: passed to processCorpus
        @param debug: stored in the builder's "debug" attribute
        @return: the builder
        """
        # Report the settings
        print >> sys.stderr, "Running", cls.__name__
        print >> sys.stderr, "  input:", input
        if gold != None:
            print >> sys.stderr, "  gold:", gold
        print >> sys.stderr, "  output:", output, "(append:", str(append) + ")"
        print >> sys.stderr, "  add new class/feature ids:", allowNewIds
        if not isinstance(style, types.StringTypes):
            style = Utils.Parameters.toString(style)
        print >> sys.stderr, "  style:", style
        if tokenization != None:
            print >> sys.stderr, "  parse:", parse + ", tokenization:", tokenization
        else:
            print >> sys.stderr, "  parse:", parse
        # Set up the builder
        idSets = cls.getIdSets(classIds, featureIds, allowNewIds)
        builder = cls(style=style, classSet=idSets[0], featureSet=idSets[1])
        builder.debug = debug
        builder.classIdFilename = classIds
        builder.featureIdFilename = featureIds
        builder.parse = parse
        builder.tokenization = tokenization
        # Build the examples
        builder.processCorpus(input, output, gold, append=append, allowNewIds=allowNewIds, structureAnalyzer=structureAnalyzer)
        return builder

    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
        """
        Build the examples for one sentence graph. Must be implemented by the
        subclasses.

        The structureAnalyzer keyword is part of the signature because
        processSentence passes it. The value returned by an implementation is
        added to self.exampleCount by processSentence.

        @raise NotImplementedError: always, in this base class
        """
        raise NotImplementedError
    
    def definePredictedValueRange(self, sentences, elementName):
        """
        Hook for determining the predicted value range. This default
        implementation ignores its arguments, does nothing and returns None.
        """
        return None
    
    def getPredictedValueRange(self):
        """
        Return the predicted value range. This default implementation has no
        range to report and always returns None.
        """
        noRange = None
        return noRange
    
    @classmethod
    def getIdSets(self, classIds=None, featureIds=None, allowNewIds=True):
        """
        Load the predefined class and feature id sets, where available.

        An id set is loaded only when its file name is given and the file
        exists; otherwise None is used in its place. Which case applied is
        reported on stderr, first for the class ids and then for the feature ids.

        classIds, featureIds -- id file names (or None)
        allowNewIds -- passed to the constructor of each loaded IdSet

        Returns the tuple (classSet, featureSet).
        """
        def loadIfPresent(filename, foundMessage, missingMessage):
            # No file name, or a name for which no file exists yet: nothing to load
            if filename == None or not os.path.exists(filename):
                print >> sys.stderr, missingMessage
                return None
            print >> sys.stderr, foundMessage, filename
            idSet = IdSet(allowNewIds=allowNewIds)
            idSet.load(filename)
            return idSet

        # Class ids
        classSet = loadIfPresent(classIds, "Using predefined class names from", "No predefined class names")
        # Feature ids
        featureSet = loadIfPresent(featureIds, "Using predefined feature names from", "No predefined feature names")
        return classSet, featureSet
        
#        if idFileTag != None and os.path.exists(idFileTag + ".feature_names.gz") and os.path.exists(idFileTag + ".class_names"):
#            print >> sys.stderr, "Using predefined class and feature names"
#            featureSet = IdSet()
#            featureSet.load(idFileTag + ".feature_names.gz")
#            classSet = IdSet()
#            classSet.load(idFileTag + ".class_names")
#            return classSet, featureSet
#        else:
#            print >> sys.stderr, "No predefined class or feature-names"
#            if idFileTag != None:
#                assert(not os.path.exists(idFileTag + ".feature_names.gz")), idFileTag
#                assert(not os.path.exists(idFileTag + ".class_names")), idFileTag
#            return None, None


    def getSentences(self, input, parse, tokenization, removeNameInfo=False):
        """
        Return the sentences of the input as a list.

        input -- either a list, which is assumed to already be a list of
                 sentences and is returned unchanged, or anything else, which is
                 passed to Core.SentenceGraph.loadCorpus as the corpus to load
        parse, tokenization -- passed to loadCorpus
        removeNameInfo -- passed to loadCorpus; must be False for list input

        For a loaded corpus the result holds one [sentenceGraph, None] pair per
        sentence that has a sentence graph.

        Raises AssertionError if removeNameInfo is set for list input.
        """
        # isinstance also accepts list subclasses and does not depend on types.ListType
        if isinstance(input, list):
            # assume input is already a list of sentences
            assert(removeNameInfo == False)
            return input
        # Load corpus and make sentence graphs
        corpusElements = Core.SentenceGraph.loadCorpus(input, parse, tokenization, removeNameInfo=removeNameInfo)
        sentences = []
        for sentence in corpusElements.sentences:
            if sentence.sentenceGraph is not None: # required for event detection
                sentences.append( [sentence.sentenceGraph, None] )
        return sentences

    def calculatePredictedRange(self, sentences):
        """
        Define the predicted value range from a list of sentences and report it.

        The sentenceElement of the first item of every entry in sentences is
        collected and given to definePredictedValueRange with the element name
        "entity". The resulting range is then printed to stderr on the same
        line as the progress message.
        """
        print >> sys.stderr, "Defining predicted value range:",
        sentenceElements = [entry[0].sentenceElement for entry in sentences]
        self.definePredictedValueRange(sentenceElements, "entity")
        print >> sys.stderr, self.getPredictedValueRange()
示例#40
0
    
    # Script fragment: rewrites the class and feature ids of the "variant"
    # examples so that they use the ids of the "invariant" id sets, matching
    # classes and features by name.
    # NOTE(review): defaultAnalysisFilename is not referenced again in this fragment
    defaultAnalysisFilename = "/usr/share/biotext/ComplexPPI/BioInferForComplexPPIVisible.xml"
    optparser = OptionParser(usage="%prog [options]\nCreate an html visualization for a corpus.")
    # Both options are used below as directories (they are joined with file
    # names), although their help texts describe them as corpus files
    optparser.add_option("-i", "--invariant", default=None, dest="invariant", help="Corpus in analysis format", metavar="FILE")
    optparser.add_option("-v", "--variant", default=None, dest="variant", help="Corpus in analysis format", metavar="FILE")
    (options, args) = optparser.parse_args()
    
    #invariantExamples = ExampleUtils.readExamples(os.path.join(options.invariant, "examples.txt"))
    variantExamples = ExampleUtils.readExamples(os.path.join(options.variant, "test-triggers.examples"))
    
    # Target id sets: the ids the examples are converted to
    invariantFeatureSet = IdSet()
    invariantFeatureSet.load(os.path.join(options.invariant, "feature_names.txt"))
    invariantClassSet = IdSet()
    invariantClassSet.load(os.path.join(options.invariant, "class_names.txt"))

    # Source id sets: the ids the examples currently use
    variantFeatureSet = IdSet()
    variantFeatureSet.load(os.path.join(options.variant, "test-triggers.examples.feature_names"))
    variantClassSet = IdSet()
    variantClassSet.load(os.path.join(options.variant, "test-triggers.examples.class_names"))
    
    counter = ProgressCounter(len(variantExamples))
    for example in variantExamples:
        counter.update()
        # example[1] is the class id: variant id -> name -> invariant id
        example[1] = invariantClassSet.getId(variantClassSet.getName(example[1]))
        # example[2] maps feature ids to values; the values are kept and only
        # the keys are translated
        # NOTE(review): what getId returns for a name missing from the
        # invariant set is not visible here - confirm
        newFeatures = {}
        for k,v in example[2].iteritems():
            newFeatures[ invariantFeatureSet.getId(variantFeatureSet.getName(k)) ] = v
        example[2] = newFeatures
        
    # The converted examples are written next to the variant input files
    ExampleUtils.writeExamples(variantExamples, os.path.join(options.variant, "realignedExamples.txt"))
# Script fragment: for each trigger class, makes one-class versions of the
# trigger example files, writes a class id file for that class and starts
# setting up the classifier training for it.
triggerClasses = [
    "Binding", "Gene_expression", "Localization", "Negative_regulation",
    "Phosphorylation", "Positive_regulation", "Protein_catabolism",
    "Regulation", "Transcription"
]

# NOTE(review): presumably loads the ids from the named file - confirm against IdSet
classSet = IdSet(filename=TRIGGER_IDS + ".class_names")
for triggerClass in triggerClasses:
    # One-class example files; the output name is the input name followed by
    # "-" and the class name
    makeOneClassExamples(TRIGGER_TRAIN_EXAMPLE_FILE,
                         TRIGGER_TRAIN_EXAMPLE_FILE + "-" + triggerClass,
                         triggerClass, classSet)
    makeOneClassExamples(TRIGGER_TEST_EXAMPLE_FILE,
                         TRIGGER_TEST_EXAMPLE_FILE + "-" + triggerClass,
                         triggerClass, classSet)
    # Two-entry id set: "neg" with id 1 and this class with its id from classSet
    # NOTE(review): the meaning of the second getId argument (False) is not
    # visible here - confirm against IdSet.getId
    d = {"neg": 1, triggerClass: classSet.getId(triggerClass, False)}
    triggerClassIds = IdSet(idDict=d)
    TRIGGER_CLASS_IDS = "trigger-ids-" + triggerClass + ".class_names"
    triggerClassIds.write(TRIGGER_CLASS_IDS)

    print >> sys.stderr, "Trigger models for parse", PARSE_TAG, "for class", triggerClass
    TRIGGER_CLASSIFIER_PARAMS = "c:" + options.triggerParams
    # A CSC connection is only created when options.csc does not contain "local"
    if "local" not in options.csc:
        clear = False
        if "clear" in options.csc: clear = True
        # NOTE(review): both branches pass identical visible arguments; the
        # account strings are masked in this copy of the code
        if "louhi" in options.csc:
            c = CSCConnection(CSC_WORKDIR + "/trigger-models-" + triggerClass,
                              "*****@*****.**", clear)
        else:
            c = CSCConnection(CSC_WORKDIR + "/trigger-models-" + triggerClass,
                              "*****@*****.**", clear)
示例#42
0
    # Script fragment: rewrites the class and feature ids of the "variant"
    # examples so that they use the ids of the "invariant" id sets, matching
    # classes and features by name. options.variant and options.invariant are
    # used as directories.
    variantExamples = ExampleUtils.readExamples(
        os.path.join(options.variant, "test-triggers.examples"))

    # Target id sets: the ids the examples are converted to
    invariantFeatureSet = IdSet()
    invariantFeatureSet.load(
        os.path.join(options.invariant, "feature_names.txt"))
    invariantClassSet = IdSet()
    invariantClassSet.load(os.path.join(options.invariant, "class_names.txt"))

    # Source id sets: the ids the examples currently use
    variantFeatureSet = IdSet()
    variantFeatureSet.load(
        os.path.join(options.variant, "test-triggers.examples.feature_names"))
    variantClassSet = IdSet()
    variantClassSet.load(
        os.path.join(options.variant, "test-triggers.examples.class_names"))

    counter = ProgressCounter(len(variantExamples))
    for example in variantExamples:
        counter.update()
        # example[1] is the class id: variant id -> name -> invariant id
        example[1] = invariantClassSet.getId(
            variantClassSet.getName(example[1]))
        # example[2] maps feature ids to values; only the keys are translated
        # NOTE(review): what getId returns for a name missing from the
        # invariant set is not visible here - confirm
        newFeatures = {}
        for k, v in example[2].iteritems():
            newFeatures[invariantFeatureSet.getId(
                variantFeatureSet.getName(k))] = v
        example[2] = newFeatures

    # The converted examples are written next to the variant input files
    ExampleUtils.writeExamples(
        variantExamples, os.path.join(options.variant,
                                      "realignedExamples.txt"))
class TokenRoleMultiEdgeTypeExampleBuilder(ExampleBuilder):
    """
    Builds edge examples from the dependency paths between entity head tokens.

    For every pair of tokens that are both entity heads, the shortest path
    between them in the undirected dependency graph (searched with cutoff 4)
    yields two examples, one for each direction of the path. The class of an
    example is the type of the interaction edge from the first to the last
    token of the directed path, or "neg" if there is no such edge.
    """

    # Role names used in the path role features, keyed by the direction flags
    # (previous edge, current edge) of two consecutive path edges. The resulting
    # feature prefixes are dep<Role>1_, dep<Role>2_, tok<Role>POS_ and tok<Role>Txt_.
    _PATH_ROLES = {
        (True, True): "Right",
        (False, False): "Left",
        (False, True): "Top",
        (True, False): "Bottom",
    }

    def __init__(self):
        """Initialize the builder with a class set in which "neg" has id 1."""
        ExampleBuilder.__init__(self)
        self.classSet = IdSet(1)
        assert (self.classSet.getId("neg") == 1)

    def getEdges(self, graph, path):
        """
        Return the graph edges along a token path.

        The result has one (edge, isForward) tuple per consecutive token pair of
        the path. isForward is True when the edge runs in the direction of the
        path and False when it runs against it.

        Results are slightly nondeterministic: when there are multiple edges
        between two tokens only the first one found is returned, and the order
        of the edges is not defined.

        Raises AssertionError if two consecutive path tokens share no edge.
        """
        pathEdges = []
        allEdges = graph.edges()
        for source, target in zip(path[:-1], path[1:]):
            match = None
            for edge in allEdges:
                if edge[0] == source and edge[1] == target:
                    match = (edge, True)
                    break
                if edge[1] == source and edge[0] == target:
                    match = (edge, False)
                    break
            assert (match is not None)
            pathEdges.append(match)
        return pathEdges

    def addType(self, token, features, sentenceGraph, prefix="annType_"):
        """
        Add a feature for the entity type of a token that is an entity head.

        The feature name is the prefix followed by the "type" attribute of the
        entity. Nothing is added for a token that is not an entity head.
        """
        entity = sentenceGraph.tokenIsEntityHead[token]
        if entity is not None:
            # The prefix argument is honoured; every caller in this class
            # passes "annType_", so their feature names are unchanged
            features[self.featureSet.getId(prefix +
                                           entity.attrib["type"])] = 1

    def buildExamples(self, sentenceGraph):
        """
        Build the examples of one sentence.

        Returns a list of (id, category, features, extra) tuples, two for each
        pair of entity head tokens that are connected by a path of more than
        one token.
        """
        examples = []
        exampleIndex = 0

        undirected = sentenceGraph.dependencyGraph.to_undirected()
        #undirected = self.makeUndirected(sentenceGraph.dependencyGraph)
        paths = NX.all_pairs_shortest_path(undirected, cutoff=4)
        tokens = sentenceGraph.tokens
        for i in range(len(tokens) - 1):
            tI = tokens[i]
            # only consider paths between entities (NOTE! entities, not only named entities)
            if sentenceGraph.tokenIsEntityHead[tI] is None:
                continue
            for j in range(i + 1, len(tokens)):
                tJ = tokens[j]
                if sentenceGraph.tokenIsEntityHead[tJ] is None:
                    continue
                # find the path, which may be stored under either end token
                if tI in paths and tJ in paths[tI]:
                    path = paths[tI][tJ]
                elif tJ in paths and tI in paths[tJ]:
                    path = paths[tJ][tI]
                else:
                    continue
                if len(path) <= 1:  #> 2:
                    continue
                # One example for the path as found, then one for the reversed path
                for directedPath in (path, path[::-1]):
                    source = directedPath[0]
                    target = directedPath[-1]
                    # define class
                    if sentenceGraph.interactionGraph.has_edge(source, target):
                        categoryName = sentenceGraph.interactionGraph.get_edge(
                            source, target).attrib["type"]
                    else:
                        categoryName = "neg"
                    self.buildExample(directedPath, sentenceGraph,
                                      categoryName, examples, exampleIndex)
                    exampleIndex += 1
        return examples

    def buildExample(self, path, sentenceGraph, categoryName, examples,
                     exampleIndex):
        """
        Build one example for a directed token path and append it to examples.

        path -- list of tokens; its edges are looked up in the dependency graph
        categoryName -- class name, converted to an id with the class set
        examples -- list the new (id, category, features, extra) tuple is appended to
        exampleIndex -- number used in the example id after the sentence id
        """
        # define features
        features = {}
        edges = self.getEdges(sentenceGraph.dependencyGraph, path)
        features[self.featureSet.getId("len_edges_" + str(len(edges)))] = 1
        features[self.featureSet.getId("len")] = len(edges)
        self.buildPathRoleFeatures(path, edges, sentenceGraph, features)
        self.buildEdgeCombinations(edges, sentenceGraph, features)
        # Terminus features are currently disabled:
        #self.buildTerminusFeatures(path[0], "t1", sentenceGraph, features)
        #self.buildTerminusFeatures(path[-1], "t2", sentenceGraph, features)
        for edge in edges:
            self.buildPathEdgeFeatures(edge[0], sentenceGraph, features)

        # define extra attributes: t1 is the path end whose id has the smaller
        # final number, and deprev tells whether that is the end of the path
        firstNumber = int(path[0].attrib["id"].split("_")[-1])
        lastNumber = int(path[-1].attrib["id"].split("_")[-1])
        if firstNumber < lastNumber:
            t1, t2, deprev = path[0], path[-1], False
        else:
            t1, t2, deprev = path[-1], path[0], True
        extra = {
            "xtype": "edge",
            "type": "i",
            "t1": t1,
            "t2": t2,
            "deprev": deprev
        }
        # make example
        category = self.classSet.getId(categoryName)
        examples.append(
            (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex),
             category, features, extra))

    def buildPathRoleFeatures(self, pathTokens, pathEdges, sentenceGraph,
                              features):
        """
        Add features for the tokens of a path according to their role on it.

        The first and last token get "tokTerm1..." and "tokTerm2..." features.
        Each token between two consecutive edges gets features named after the
        directions of those two edges (see _PATH_ROLES), together with the
        types of the two edges.
        """
        getId = self.featureSet.getId
        # Terminal tokens
        for termPrefix, token in (("tokTerm1", pathTokens[0]),
                                  ("tokTerm2", pathTokens[-1])):
            features[getId(termPrefix + "POS_" + token.attrib["POS"])] = 1
            features[getId(termPrefix + "txt_" +
                           sentenceGraph.getTokenText(token))] = 1
        # Tokens between two edges
        for i in range(1, len(pathEdges)):
            previousEdge, previousForward = pathEdges[i - 1]
            currentEdge, currentForward = pathEdges[i]
            role = self._PATH_ROLES[(bool(previousForward),
                                     bool(currentForward))]
            token = pathTokens[i]
            features[getId("dep" + role + "1_" +
                           previousEdge[2].attrib["type"])] = 1
            features[getId("dep" + role + "2_" +
                           currentEdge[2].attrib["type"])] = 1
            features[getId("tok" + role + "POS_" + token.attrib["POS"])] = 1
            features[getId("tok" + role + "Txt_" +
                           sentenceGraph.getTokenText(token))] = 1

    def buildPathEdgeFeatures(self, depEdge, sentenceGraph, features):
        """
        Add the features of a single dependency edge: its type and the text,
        POS and entity type of both of its tokens.
        """
        depType = depEdge[2].attrib["type"]
        features[self.featureSet.getId("dep_" + depType)] = 1
        # Token 1, then token 2
        for token in (depEdge[0], depEdge[1]):
            features[self.featureSet.getId(
                "txt_" + sentenceGraph.getTokenText(token))] = 1
            features[self.featureSet.getId("POS_" + token.attrib["POS"])] = 1
            self.addType(token, features, sentenceGraph, prefix="annType_")

    def _addInternalTokenFeatures(self, token, sentenceGraph, features):
        """Add the "internalPOS_" and "internalTxt_" features of one token."""
        features[self.featureSet.getId("internalPOS_" +
                                       token.attrib["POS"])] = 1
        features[self.featureSet.getId(
            "internalTxt_" + sentenceGraph.getTokenText(token))] = 1

    def buildEdgeCombinations(self, edges, sentenceGraph, features):
        """
        Add features for the directed edges of a path and their combinations.

        edges -- list of (edge, isForward) tuples as returned by getEdges

        Adds a directed type feature for each edge, token features for the
        first and last edge and for all edges between them, and a feature for
        each pair of consecutive edges. Nothing is added for an empty list.
        """
        if len(edges) == 0:
            return
        getId = self.featureSet.getId
        # Edges directed relative to the path
        for edge, isForward in edges:
            depType = edge[2].attrib["type"]
            if isForward:
                features[getId("dep_" + depType + ">")] = 1
            else:
                features[getId("dep_<" + depType)] = 1
        # One token each of the first and the last edge, selected by the
        # direction flag; with the flags made by getEdges these are the tokens
        # at the two ends of the path
        firstEdge, firstForward = edges[0]
        if firstForward:
            self._addInternalTokenFeatures(firstEdge[0], sentenceGraph, features)
        else:
            self._addInternalTokenFeatures(firstEdge[1], sentenceGraph, features)
        lastEdge, lastForward = edges[-1]
        if lastForward:
            self._addInternalTokenFeatures(lastEdge[1], sentenceGraph, features)
        else:
            self._addInternalTokenFeatures(lastEdge[0], sentenceGraph, features)
        # Both tokens and the type of every edge between the first and the last
        for edge, isForward in edges[1:-1]:
            self._addInternalTokenFeatures(edge[0], sentenceGraph, features)
            self._addInternalTokenFeatures(edge[1], sentenceGraph, features)
            features[getId("internalDep_" + edge[2].attrib["type"])] = 1
        # Edge bigrams. The direction of each edge is read from its flag
        # (index 1). Testing the edge tuple itself (index 0) is always true,
        # which labelled a reverse edge followed by a forward edge as "<...<"
        # instead of "<...>".
        for i in range(1, len(edges)):
            type1 = edges[i - 1][0][2].attrib["type"]
            type2 = edges[i][0][2].attrib["type"]
            if edges[i - 1][1]:
                direction1 = ">"
            else:
                direction1 = "<"
            if edges[i][1]:
                direction2 = ">"
            else:
                direction2 = "<"
            features[getId("dep_" + type1 + direction1 + type2 +
                           direction2)] = 1

    def buildTerminusFeatures(self, token, prefix, sentenceGraph, features):
        """
        Add features for the dependency edges attached to a token.

        For each incoming and outgoing edge of the token, the edge type and the
        POS and text of the token at the other end are added, all under names
        starting with the given prefix.
        """
        getId = self.featureSet.getId
        # Attached edges; the neighbouring token is the source of an incoming
        # edge and the target of an outgoing edge
        for edge in sentenceGraph.dependencyGraph.in_edges(token):
            features[getId(prefix + "HangingIn_" + edge[2].attrib["type"])] = 1
            features[getId(prefix + "HangingIn_" + edge[0].attrib["POS"])] = 1
            # Uses the prefix like the two features above (was fixed to "t1")
            features[getId(prefix + "HangingIn_" +
                           sentenceGraph.getTokenText(edge[0]))] = 1
        for edge in sentenceGraph.dependencyGraph.out_edges(token):
            features[getId(prefix + "HangingOut_" + edge[2].attrib["type"])] = 1
            features[getId(prefix + "HangingOut_" + edge[1].attrib["POS"])] = 1
            features[getId(prefix + "HangingOut_" +
                           sentenceGraph.getTokenText(edge[1]))] = 1
示例#44
0
class SingleEdgeExampleBuilder(ExampleBuilder):
    """
    Builds examples from the dependencies of the parse, one dependency at a time.
    A dependency whose tokens are connected by an annotated interaction edge
    gives positive examples, otherwise it gives a negative one. With the
    "headsOnly" style only dependencies between entity head tokens are used,
    with the "directed" style the two directions of a dependency are handled
    separately, and with the "binary" style the classes are 1 and -1.
    """
    def __init__(self, style):
        """Initialize the builder; style is a collection of style names."""
        ExampleBuilder.__init__(self)
        self.featureBuilder = EdgeFeatureBuilder(self.featureSet)
        self.style = style
        # Outside binary mode the class set must give "neg" the id 1
        if "binary" not in style:
            self.classSet = IdSet(1)
            assert( self.classSet.getId("neg") == 1 )
        
    def buildExamples(self, sentenceGraph):
        """
        Build the examples of one sentence and return them as a list of
        (id, category, features, extra) tuples.
        """
        examples = []
        for depEdge in sentenceGraph.dependencyGraph.edges():
            if "headsOnly" in self.style:
                if (sentenceGraph.tokenIsEntityHead[depEdge[0]] == None) or (sentenceGraph.tokenIsEntityHead[depEdge[1]] == None):
                    continue
            
            edgeFound = False
            # First the interactions in the direction of the dependency, then
            # those against it. The example index is the number of examples so far.
            for source, target, isReverse in ((depEdge[0], depEdge[1], False), (depEdge[1], depEdge[0], True)):
                if sentenceGraph.interactionGraph.has_edge(source, target):
                    for intEdge in sentenceGraph.interactionGraph.get_edge(source, target):
                        examples.append( self.buildExample(depEdge, intEdge, isReverse, len(examples), sentenceGraph) )
                        edgeFound = True
                elif "directed" in self.style:
                    # In directed mode each direction without an interaction gets its own negative
                    examples.append( self.buildExample(depEdge, None, None, len(examples), sentenceGraph) )
            
            # In undirected mode a single negative is made when neither direction had an interaction
            if not (edgeFound or "directed" in self.style):
                examples.append( self.buildExample(depEdge, None, None, len(examples), sentenceGraph) )

        return examples
    
    def buildExample(self, depEdge, intEdge, isReverse, exampleIndex, sentenceGraph):
        """
        Build the example of one dependency edge.

        intEdge is the interaction edge of a positive example, or None for a
        negative one. isReverse marks an interaction that runs against the
        dependency. Returns an (id, category, features, extra) tuple.
        """
        if "binary" in self.style:
            categoryName = "i"
            if intEdge == None:
                category = -1
            else:
                category = 1
        elif intEdge == None:
            categoryName = "neg"
            category = 1
        else:
            categoryName = intEdge.attrib["type"]
            if isReverse and "directed" in self.style:
                categoryName += "_rev"
            category = self.classSet.getId(categoryName)
        
        features = self.buildFeatures(depEdge,sentenceGraph)

        # Define extra attributes f.e. for the visualizer: t1 is the token
        # whose id has the smaller final number
        firstNumber = int(depEdge[0].attrib["id"].split("_")[-1])
        secondNumber = int(depEdge[1].attrib["id"].split("_")[-1])
        if firstNumber < secondNumber:
            t1, t2, deprev = depEdge[0], depEdge[1], False
        else:
            t1, t2, deprev = depEdge[1], depEdge[0], True
        extra = {"xtype":"edge","type":categoryName,"t1":t1,"t2":t2,"deprev":deprev}
        exampleId = sentenceGraph.getSentenceId()+".x"+str(exampleIndex)
        return (exampleId,category,features,extra)

    def buildFeatures(self, depEdge, sentenceGraph):
        """
        Return the feature dictionary of one dependency edge, filled by the
        edge feature builder.
        """
        features = {}
        builder = self.featureBuilder
        # The builder writes into the vector it is given and is detached again afterwards
        builder.setFeatureVector(features)
        builder.buildEdgeFeatures(depEdge, sentenceGraph, "dep_", text=True, POS=True, annType=True, maskNames=True)
        builder.buildAttachedEdgeFeatures(depEdge, sentenceGraph, "", text=False, POS=True, annType=False, maskNames=True)
        builder.buildLinearOrderFeatures(depEdge)
        builder.setFeatureVector(None)
        return features
    def __init__(self,
                 style=None,
                 length=None,
                 types=None,
                 featureSet=None,
                 classSet=None,
                 gazetteer=None,
                 pathGazetteer=None,
                 negFrac=None):
        """
        Initialize the example builder.

        style -- collection of style names; defaults to
                 ["typed", "directed", "headsOnly"]
        length -- must be None
        types -- stored as self.types; defaults to an empty list
        featureSet -- feature id set; a new IdSet is made if None
        classSet -- class id set in which "neg" must have id 1; IdSet(1) is
                    made if None
        gazetteer -- if given, loaded with Gazetteer.loadGztr
        pathGazetteer -- if given, loaded with PathGazetteer.load
        negFrac -- stored as self.negFrac and reported on stderr

        Raises AssertionError if "neg" does not have id 1 in the class set or
        if length is not None.
        """
        # The list defaults are created here for each instance. As default
        # argument values they would be single objects shared by all instances
        # that store them.
        if style is None:
            style = ["typed", "directed", "headsOnly"]
        if types is None:
            types = []
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        assert (classSet.getId("neg") == 1)

        if gazetteer is not None:
            print >> sys.stderr, "Loading gazetteer from", gazetteer
            self.gazetteer = Gazetteer.loadGztr(gazetteer)
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer = None

        self.pathGazetteer = None
        self.pathGazetteerDependencies = None
        self.pathGazetteerPairs = None
        if pathGazetteer is not None:
            print >> sys.stderr, "Loading path gazetteer from", pathGazetteer
            self.pathGazetteer = PathGazetteer.load(pathGazetteer)
            self.pathGazetteerDependencies = PathGazetteer.getDependencies(
                self.pathGazetteer)
            self.pathGazetteerPairs = PathGazetteer.getPairs(
                self.pathGazetteer)
        else:
            print >> sys.stderr, "No path gazetteer loaded"

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
        self.styles = style
        self.negFrac = negFrac
        print >> sys.stderr, "Downsampling negatives to", negFrac
        self.negRand = random.Random()

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        # noAnnType is always switched on; the "noAnnType" style is not consulted
        self.multiEdgeFeatureBuilder.noAnnType = True
        if "noMasking" in self.styles:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if "maxFeatures" in self.styles:
            self.multiEdgeFeatureBuilder.maximum = True

        self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
        #self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        #if "ontology" in self.styles:
        #    self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
        self.pathLengths = length
        assert (self.pathLengths == None)
        self.types = types

        # Per-instance bookkeeping containers, all initially empty
        self.eventsByOrigId = {}
        self.headTokensByOrigId = {}
        self.interSentenceEvents = set()

        self.examplesByEventOrigId = {}
        self.skippedByType = {}
        self.skippedByTypeAndReason = {}
        self.builtByType = {}

        self.gazMatchCache = {}
class SingleDependencyTypeExampleBuilder(ExampleBuilder):
    """
    Example builder that emits examples for single dependency edges.

    Only dependency edges whose two endpoint tokens are both entity heads are
    used. The class of an example is the type of the interaction edge that
    connects the same two tokens ("_rev" is appended when the interaction runs
    against the direction of the dependency), or "neg" when the tokens are not
    connected by an interaction.
    """
    def __init__(self):
        ExampleBuilder.__init__(self)
        self.classSet = IdSet(1)
        # The negative class must be the one that gets id 1
        assert self.classSet.getId("neg") == 1
        self.featureBuilder = EdgeFeatureBuilder(self.featureSet)

    def buildExamples(self, sentenceGraph):
        """
        Return the list of examples built from the dependency edges of
        sentenceGraph. A dependency edge paired with several interaction
        edges produces one example per interaction edge.
        """
        built = []
        for depEdge in sentenceGraph.dependencyGraph.edges():
            source, target = depEdge[0], depEdge[1]
            # Both endpoints must be entity heads
            if sentenceGraph.tokenIsEntityHead[source] is None:
                continue
            if sentenceGraph.tokenIsEntityHead[target] is None:
                continue

            interactions = sentenceGraph.interactionGraph
            if interactions.has_edge(source, target):
                # Interaction runs in the same direction as the dependency
                pairs = [(edge, False) for edge in interactions.get_edge(source, target)]
            elif interactions.has_edge(target, source):
                # Interaction runs against the dependency
                pairs = [(edge, True) for edge in interactions.get_edge(target, source)]
            else:
                # No interaction: a single negative example
                pairs = [(None, None)]

            for intEdge, isReverse in pairs:
                # The running example index equals the number of examples so far
                built.append(self.buildExample(depEdge, intEdge, isReverse, len(built), sentenceGraph))
        return built

    def buildExample(self, depEdge, intEdge, isReverse, exampleIndex, sentenceGraph):
        """
        Build one example tuple (id, class id, features, extra) for depEdge.

        intEdge is the matching interaction edge or None for a negative
        example; isReverse tells whether the interaction runs against the
        dependency edge.
        """
        if intEdge is None:
            categoryName = "neg"
            category = 1
        else:
            categoryName = intEdge.attrib["type"]
            if isReverse:
                categoryName += "_rev"
            category = self.classSet.getId(categoryName)

        features = self.buildFeatures(depEdge, sentenceGraph)

        # Extra attributes (f.e. for the visualizer): t1 is always the token
        # with the smaller numeric id suffix, "deprev" tells if that required
        # swapping the endpoints of the dependency edge.
        firstIndex = int(depEdge[0].attrib["id"].rsplit("_", 1)[-1])
        secondIndex = int(depEdge[1].attrib["id"].rsplit("_", 1)[-1])
        if firstIndex < secondIndex:
            t1, t2, depReversed = depEdge[0], depEdge[1], False
        else:
            t1, t2, depReversed = depEdge[1], depEdge[0], True
        extra = {"xtype":"edge","type":categoryName,"t1":t1,"t2":t2}
        extra["deprev"] = depReversed

        exampleId = sentenceGraph.getSentenceId()+".x"+str(exampleIndex)
        return (exampleId,category,features,extra)

    def buildFeatures(self, depEdge, sentenceGraph):
        """
        Return the feature dictionary for depEdge. All features are produced
        by the edge feature builder, which writes into the dictionary that is
        attached to it for the duration of this call.
        """
        vector = {}
        builder = self.featureBuilder
        builder.setFeatureVector(vector)
        # The edge itself
        builder.buildEdgeFeatures(depEdge, sentenceGraph, "dep_", text=True, POS=True, annType=True, maskNames=True)
        # Edges attached to the endpoints of the edge
        builder.buildAttachedEdgeFeatures(depEdge, sentenceGraph, "", text=False, POS=True, annType=False, maskNames=True)
        # Linear order
        builder.buildLinearOrderFeatures(depEdge)
        # Detach the vector so that later calls cannot write into it
        builder.setFeatureVector(None)
        return vector
示例#47
0
    def __init__(self,
                 style=None,
                 length=None,
                 types=[],
                 featureSet=None,
                 classSet=None):
        """
        Initialize the example builder and the feature builders selected by
        the style parameters.

        style      -- style parameters, passed to self.getParameters() together
                      with the list of known style names. When None, the
                      "typed", "directed" and "headsOnly" styles are enabled.
        length     -- must be None (asserted); stored as self.pathLengths
        types      -- stored as-is in self.types. NOTE: the default list object
                      is shared between all calls, so it must not be mutated.
        featureSet -- feature id set, a new IdSet() is created when None
        classSet   -- class id set, a new IdSet(1) is created when None. "neg"
                      must have id 1, or id -1 in a set of exactly two ids.
        """
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        assert (classSet.getId("neg") == 1
                or (len(classSet.Ids) == 2 and classSet.getId("neg") == -1))

        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)

        # NOTE(review): a few names occur twice in this list; it is kept as it
        # was, presumably the duplicates are harmless -- confirm against
        # getParameters().
        self.styles = self.getParameters(style, [
            "typed", "directed", "headsOnly", "graph_kernel", "noAnnType",
            "noMasking", "maxFeatures", "genia_limits", "epi_limits",
            "id_limits", "rel_limits", "bb_limits", "bi_limits", "co_limits",
            "genia_task1", "ontology", "nodalida", "bacteria_renaming",
            "trigger_features", "rel_features", "ddi_features", "evex",
            "giuliano", "random", "themeOnly", "causeOnly", "no_path",
            "entities", "skip_extra_triggers", "headsOnly", "graph_kernel",
            "trigger_features", "no_task", "no_dependency",
            "disable_entity_features", "disable_terminus_features",
            "disable_single_element_features", "disable_ngram_features",
            "disable_path_edge_features", "no_linear", "subset", "binary",
            "pos_only", "entity_type"
        ])
        if style is None:  # no parameters given
            # The defaults must be written to the parsed parameters: "style"
            # itself is None here and does not support item assignment.
            self.styles["typed"] = True
            self.styles["directed"] = True
            self.styles["headsOnly"] = True

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        # NOTE Temporarily re-enabling predicted range
        #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None)
        if self.styles["graph_kernel"]:
            # Imported only when the style is in use
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(
                self.featureSet)
        if self.styles["noAnnType"]:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if self.styles["noMasking"]:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if self.styles["maxFeatures"]:
            self.multiEdgeFeatureBuilder.maximum = True
        if self.styles["genia_task1"]:
            self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity")
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        if self.styles["ontology"]:
            self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(
                self.featureSet)
        if self.styles["nodalida"]:
            self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(
                self.featureSet)
        if self.styles["bacteria_renaming"]:
            self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(
                self.featureSet)
        if self.styles["trigger_features"]:
            self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
            self.triggerFeatureBuilder.useNonNameEntities = True
            if self.styles["genia_task1"]:
                self.triggerFeatureBuilder.filterAnnTypes.add("Entity")
        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(featureSet)
        if self.styles["ddi_features"]:
            self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
        if self.styles["evex"]:
            self.evexFeatureBuilder = EVEXFeatureBuilder(featureSet)
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
        self.pathLengths = length
        assert (self.pathLengths == None)
        self.types = types
        if self.styles["random"]:
            # Imported only when the style is in use
            from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
            self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
示例#48
0
 def __init__(self, style=None, types=[], featureSet=None, classSet=None):
     """
     Initialize the example builder and create the feature builders that the
     style parameters ask for.

     style      -- style parameters, passed to self.getParameters()
     types      -- stored as-is in self.types
     featureSet -- feature id set, a new IdSet() is created when None
     classSet   -- class id set, a new IdSet(1) is created when None. "neg"
                   must have id 1, or id -1 in a set of exactly two ids.
     """
     if featureSet is None:
         featureSet = IdSet()
     if classSet is None:
         classSet = IdSet(1)

     ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)
     assert( classSet.getId("neg") == 1 or (len(classSet.Ids)== 2 and classSet.getId("neg") == -1) )

     # Basic style = trigger_features:typed:directed:no_linear:entities:auto_limits:noMasking:maxFeatures
     knownStyles = [
         "directed", "undirected", "headsOnly", "graph_kernel",
         "noAnnType", "mask_nodes", "limit_features",
         "no_auto_limits", "co_features", "genia_features", "bi_features",
         #"genia_limits", "epi_limits", "id_limits", "rel_limits", "bb_limits", "bi_limits", "co_limits",
         "genia_task1", "ontology", "nodalida", "bacteria_renaming",
         "no_trigger_features", "rel_features",
         "drugbank_features", "ddi_mtmx", "evex", "giuliano", "random",
         "themeOnly", "causeOnly", "no_path", "token_nodes",
         "skip_extra_triggers", "headsOnly", "graph_kernel", "no_task",
         "no_dependency",
         "disable_entity_features", "disable_terminus_features",
         "disable_single_element_features",
         "disable_ngram_features", "disable_path_edge_features",
         "linear_features", "subset", "binary", "pos_only",
         "entity_type", "filter_shortest_path", "maskTypeAsProtein",
         "keep_neg", "metamap",
         "sdb_merge", "sdb_features", "ontobiotope_features",
         "no_self_loops", "full_entities",
         "no_features", "wordnet", "wordvector", "se10t8_undirected",
         "filter_types", "doc_extra",
         "entity_extra",
     ]
     self._setDefaultParameters(knownStyles)
     self.styles = self.getParameters(style)
     styles = self.styles

     # The multi-edge feature builder is always present, the styles only
     # adjust its settings.
     self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet, styles)
     # NOTE Temporarily re-enabling predicted range
     #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None)
     if styles["graph_kernel"]:
         from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
         self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
     if styles["noAnnType"]:
         self.multiEdgeFeatureBuilder.noAnnType = True
     # Masking is explicitly switched both on and off
     self.multiEdgeFeatureBuilder.maskNamedEntities = bool(styles["mask_nodes"])
     if not styles["limit_features"]:
         self.multiEdgeFeatureBuilder.maximum = True
     if styles["genia_task1"]:
         self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity")

     self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)

     # Optional feature builders, created only for the styles that use them
     if styles["ontology"]:
         self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
     if styles["ontobiotope_features"]:
         self.ontobiotopeFeatureBuilder = OntoBiotopeFeatureBuilder(self.featureSet)
     if styles["nodalida"]:
         self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
     if styles["bacteria_renaming"]:
         self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(self.featureSet)
     if not styles["no_trigger_features"]:
         # Trigger features are on unless explicitly disabled
         triggerBuilder = TriggerFeatureBuilder(self.featureSet, styles)
         self.triggerFeatureBuilder = triggerBuilder
         triggerBuilder.useNonNameEntities = True
         if styles["noAnnType"]:
             triggerBuilder.noAnnType = True
         if styles["genia_task1"]:
             triggerBuilder.filterAnnTypes.add("Entity")
     if styles["rel_features"]:
         self.relFeatureBuilder = RELFeatureBuilder(featureSet)
     if styles["drugbank_features"]:
         self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
     if styles["evex"]:
         self.evexFeatureBuilder = EVEXFeatureBuilder(featureSet)
     if styles["wordnet"]:
         self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
     if styles["wordvector"]:
         self.wordVectorFeatureBuilder = WordVectorFeatureBuilder(featureSet, styles)
     if styles["giuliano"]:
         self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)

     self.types = types
     if styles["random"]:
         from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
         self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)
class TokenRoleMultiEdgeTypeExampleBuilder(ExampleBuilder):
    def __init__(self):
        """Initialize the base builder and install a fresh class id set."""
        ExampleBuilder.__init__(self)
        classIds = IdSet(1)
        self.classSet = classIds
        # The negative class must be the one that gets id 1
        assert classIds.getId("neg") == 1
    
    # Results slightly nondeterministic because when there are multiple edges between two
    # tokens, this currently returns only one, and their order is not defined.
    def getEdges(self, graph, path):
        """
        Return the graph edges that connect the consecutive tokens of path.

        The result has one (edge, forward) pair per step of the path, where
        forward is True when the edge points along the path and False when it
        points against it. Only the first matching edge is used for each step.
        """
        allEdges = graph.edges()
        oriented = []
        for previous, current in zip(path[:-1], path[1:]):
            match = None
            for candidate in allEdges:
                if candidate[0] == previous and candidate[1] == current:
                    match = (candidate, True)
                    break
                if candidate[1] == previous and candidate[0] == current:
                    match = (candidate, False)
                    break
            # Every step of the path must be backed by an edge
            assert(match is not None)
            if match is not None:
                oriented.append(match)
        return oriented
    
    def addType(self, token, features, sentenceGraph, prefix="annType_"):
        if sentenceGraph.tokenIsEntityHead[token] != None:
            features[self.featureSet.getId("annType_"+sentenceGraph.tokenIsEntityHead[token].attrib["type"])] = 1
     
    def buildExamples(self, sentenceGraph):
        """
        Build the examples of one sentence and return them as a list.

        Every pair of entity head tokens that is connected by a shortest
        path in the undirected dependency graph produces two examples, one
        for each direction of the path. The class of an example is the type
        of the interaction edge from the first to the last token of the
        directed path, or "neg" when there is no such interaction.
        """
        examples = []
        exampleIndex = 0

        undirected = sentenceGraph.dependencyGraph.to_undirected()
        # The path search is limited with cutoff=4
        paths = NX.all_pairs_shortest_path(undirected, cutoff=4)
        tokens = sentenceGraph.tokens
        entityHeads = sentenceGraph.tokenIsEntityHead
        interactions = sentenceGraph.interactionGraph
        for i in range(len(tokens)-1):
            tI = tokens[i]
            for j in range(i+1,len(tokens)):
                tJ = tokens[j]
                # only consider paths between entities (NOTE! entities, not only named entities)
                if entityHeads[tI] is None or entityHeads[tJ] is None:
                    continue
                # find the path ("in" instead of dict.has_key, which does not
                # exist in Python 3)
                if tI in paths and tJ in paths[tI]:
                    path = paths[tI][tJ]
                elif tJ in paths and tI in paths[tJ]:
                    path = paths[tJ][tI]
                else:
                    continue
                if len(path) <= 1:
                    continue
                # One example for the path as found and one for its reverse
                for directedPath in (path, path[::-1]):
                    first, last = directedPath[0], directedPath[-1]
                    if interactions.has_edge(first, last):
                        categoryName = interactions.get_edge(first, last).attrib["type"]
                    else:
                        categoryName = "neg"
                    self.buildExample(directedPath, sentenceGraph, categoryName, examples, exampleIndex)
                    exampleIndex += 1
        return examples
    
    def buildExample(self, path, sentenceGraph, categoryName, examples, exampleIndex):
        # define features
        features = {}
        edges = self.getEdges(sentenceGraph.dependencyGraph, path)
        features[self.featureSet.getId("len_edges_"+str(len(edges)))] = 1
        features[self.featureSet.getId("len")] = len(edges)
        self.buildPathRoleFeatures(path, edges, sentenceGraph, features)
        self.buildEdgeCombinations(edges, sentenceGraph, features)
        #self.buildTerminusFeatures(path[0], "t1", sentenceGraph, features)
        #self.buildTerminusFeatures(path[-1], "t2", sentenceGraph, features)
        for edge in edges:
            self.buildPathEdgeFeatures(edge[0], sentenceGraph, features)
#        if edges[0][0][0] == path[0]:
#            t1 = edges[0][0][0]
#        else:
#            t1 = edges[0][0][1]
#            assert(edges[0][0][1] == path[0])
#        if edges[-1][0][0] == path[-1]:
#            t2 = edges[-1][0][0]
#        else:
#            t2 = edges[-1][0][1]
#            assert(edges[-1][0][1] == path[-1])
#        self.buildEdgeCombinations(edges, sentenceGraph, features)
#        self.buildTerminusFeatures(t1, t2, sentenceGraph, features)
        # define extra attributes              
        if int(path[0].attrib["id"].split("_")[-1]) < int(path[-1].attrib["id"].split("_")[-1]):
            extra = {"xtype":"edge","type":"i","t1":path[0],"t2":path[-1]}
            extra["deprev"] = False
        else:
            extra = {"xtype":"edge","type":"i","t1":path[-1],"t2":path[0]}
            extra["deprev"] = True
        # make example
        category = self.classSet.getId(categoryName)
        examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
    
    def buildPathRoleFeatures(self, pathTokens, pathEdges, sentenceGraph, features):
        #print len(pathTokens), len(pathEdges)
        features[self.featureSet.getId("tokTerm1POS_"+pathTokens[0].attrib["POS"])] = 1
        features[self.featureSet.getId("tokTerm1txt_"+sentenceGraph.getTokenText(pathTokens[0]))] = 1
        features[self.featureSet.getId("tokTerm2POS_"+pathTokens[-1].attrib["POS"])] = 1
        features[self.featureSet.getId("tokTerm2txt_"+sentenceGraph.getTokenText(pathTokens[-1]))] = 1
#        for i in range(0,len(pathEdges)):
#            if pathEdges[i][1]:
#                features[self.featureSet.getId("depRight_"+pathEdges[i][0][2].attrib["type"])] = 1
#            else:
#                features[self.featureSet.getId("depLeft_"+pathEdges[i][0][2].attrib["type"])] = 1
        for i in range(1,len(pathEdges)):
            if pathEdges[i-1][1] and pathEdges[i][1]:
                features[self.featureSet.getId("depRight1_"+pathEdges[i-1][0][2].attrib["type"])] = 1
                features[self.featureSet.getId("depRight2_"+pathEdges[i][0][2].attrib["type"])] = 1
                features[self.featureSet.getId("tokRightPOS_"+pathTokens[i].attrib["POS"])] = 1
                features[self.featureSet.getId("tokRightTxt_"+sentenceGraph.getTokenText(pathTokens[i]))] = 1
            elif (not pathEdges[i-1][1]) and (not pathEdges[i][1]):
                features[self.featureSet.getId("depLeft1_"+pathEdges[i-1][0][2].attrib["type"])] = 1
                features[self.featureSet.getId("depLeft2_"+pathEdges[i][0][2].attrib["type"])] = 1
                features[self.featureSet.getId("tokLeftPOS_"+pathTokens[i].attrib["POS"])] = 1
                features[self.featureSet.getId("tokLeftTxt_"+sentenceGraph.getTokenText(pathTokens[i]))] = 1
            elif (not pathEdges[i-1][1]) and pathEdges[i][1]:
                features[self.featureSet.getId("depTop1_"+pathEdges[i-1][0][2].attrib["type"])] = 1
                features[self.featureSet.getId("depTop2_"+pathEdges[i][0][2].attrib["type"])] = 1
                features[self.featureSet.getId("tokTopPOS_"+pathTokens[i].attrib["POS"])] = 1
                features[self.featureSet.getId("tokTopTxt_"+sentenceGraph.getTokenText(pathTokens[i]))] = 1
            elif pathEdges[i-1][1] and (not pathEdges[i][1]):
                features[self.featureSet.getId("depBottom1_"+pathEdges[i-1][0][2].attrib["type"])] = 1
                features[self.featureSet.getId("depBottom2_"+pathEdges[i][0][2].attrib["type"])] = 1
                features[self.featureSet.getId("tokBottomPOS_"+pathTokens[i].attrib["POS"])] = 1
                features[self.featureSet.getId("tokBottomTxt_"+sentenceGraph.getTokenText(pathTokens[i]))] = 1
    
    def buildPathEdgeFeatures(self, depEdge, sentenceGraph, features):
        depType = depEdge[2].attrib["type"]
        features[self.featureSet.getId("dep_"+depType)] = 1
        # Token 1
        features[self.featureSet.getId("txt_"+sentenceGraph.getTokenText(depEdge[0]))] = 1
        features[self.featureSet.getId("POS_"+depEdge[0].attrib["POS"])] = 1
        self.addType(depEdge[0], features, sentenceGraph, prefix="annType_")
        # Token 2
        features[self.featureSet.getId("txt_"+sentenceGraph.getTokenText(depEdge[1]))] = 1
        features[self.featureSet.getId("POS_"+depEdge[1].attrib["POS"])] = 1
        self.addType(depEdge[1], features, sentenceGraph, prefix="annType_")
    
    def buildEdgeCombinations(self, edges, sentenceGraph, features):
        # Edges directed relative to the path
        for i in range(len(edges)):
            depType = edges[i][0][2].attrib["type"]
            if edges[i][1]:
                features[self.featureSet.getId("dep_"+depType+">")] = 1
            else:
                features[self.featureSet.getId("dep_<"+depType)] = 1
        # Edge bigrams
        if edges[0][1]:
            features[self.featureSet.getId("internalPOS_"+edges[0][0][0].attrib["POS"])]=1
            features[self.featureSet.getId("internalTxt_"+sentenceGraph.getTokenText(edges[0][0][0]))]=1
        else:
            features[self.featureSet.getId("internalPOS_"+edges[0][0][1].attrib["POS"])]=1
            features[self.featureSet.getId("internalTxt_"+sentenceGraph.getTokenText(edges[0][0][1]))]=1
        if edges[-1][1]:
            features[self.featureSet.getId("internalPOS_"+edges[-1][0][1].attrib["POS"])]=1
            features[self.featureSet.getId("internalTxt_"+sentenceGraph.getTokenText(edges[-1][0][1]))]=1
        else:
            features[self.featureSet.getId("internalPOS_"+edges[-1][0][0].attrib["POS"])]=1
            features[self.featureSet.getId("internalTxt_"+sentenceGraph.getTokenText(edges[-1][0][0]))]=1
        for i in range(1,len(edges)-1):
            features[self.featureSet.getId("internalPOS_"+edges[i][0][0].attrib["POS"])]=1
            features[self.featureSet.getId("internalTxt_"+sentenceGraph.getTokenText(edges[i][0][0]))]=1
            features[self.featureSet.getId("internalPOS_"+edges[i][0][1].attrib["POS"])]=1
            features[self.featureSet.getId("internalTxt_"+sentenceGraph.getTokenText(edges[i][0][1]))]=1
            features[self.featureSet.getId("internalDep_"+edges[i][0][2].attrib["type"])]=1
        for i in range(1,len(edges)):
            type1 = edges[i-1][0][2].attrib["type"]
            type2 = edges[i][0][2].attrib["type"]
            if edges[i-1][1] and edges[i][1]:
                features[self.featureSet.getId("dep_"+type1+">"+type2+">")] = 1
            elif edges[i-1][1] and edges[i][0]:
                features[self.featureSet.getId("dep_"+type1+">"+type2+"<")] = 1
            elif edges[i-1][0] and edges[i][0]:
                features[self.featureSet.getId("dep_"+type1+"<"+type2+"<")] = 1
            elif edges[i-1][0] and edges[i][1]:
                features[self.featureSet.getId("dep_"+type1+"<"+type2+">")] = 1
   
    def buildTerminusFeatures(self, token, prefix, sentenceGraph, features): 
        # Attached edges
        t1InEdges = sentenceGraph.dependencyGraph.in_edges(token)
        for edge in t1InEdges:
            features[self.featureSet.getId(prefix+"HangingIn_"+edge[2].attrib["type"])] = 1
            features[self.featureSet.getId(prefix+"HangingIn_"+edge[0].attrib["POS"])] = 1
            features[self.featureSet.getId("t1HangingIn_"+sentenceGraph.getTokenText(edge[0]))] = 1
        t1OutEdges = sentenceGraph.dependencyGraph.out_edges(token)
        for edge in t1OutEdges:
            features[self.featureSet.getId(prefix+"HangingOut_"+edge[2].attrib["type"])] = 1
            features[self.featureSet.getId(prefix+"HangingOut_"+edge[1].attrib["POS"])] = 1
            features[self.featureSet.getId("t1HangingOut_"+sentenceGraph.getTokenText(edge[1]))] = 1