Python PorterStemmer.stem примеры использования

Язык программирования: Python

Пространство имен/Пакет: Utils.Libraries

Класс/Тип: PorterStemmer

Метод/Функция: stem

Примеров на hotexamples.com: 17

Python PorterStemmer.stem — найдено 17 примеров. Это лучшие примеры Python-кода для Utils.Libraries.PorterStemmer.stem, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

stem(9)

Пример #1

Показать файл

Файл: ModifierExampleBuilder.py Проект: DUT-LiuYang/TEES

 def getTokenFeatures(self, token, sentenceGraph):
     """
     Returns a dictionary of features based on the attributes of a token.
     These can be used to define more complex features.

     The dictionary is cached in self.tokenFeatures under the token itself,
     so repeated calls for the same token return the same dictionary object.

     @param token: the token element; used as a dictionary key and queried
         with get("POS")
     @param sentenceGraph: the sentence the token belongs to; provides
         getTokenText(), tokenIsName and tokenIsEntityHead
     @return: a dictionary mapping feature names to 1
     """
     # These features are cached when this method is first called
     # for a token. The "in" operator replaces dict.has_key(), which is
     # deprecated in Python 2 and no longer exists in Python 3.
     if token in self.tokenFeatures:
         return self.tokenFeatures[token]
     tokTxt = sentenceGraph.getTokenText(token)
     features = {}
     features["_txt_" + tokTxt] = 1
     features["_POS_" + token.get("POS")] = 1
     if self.styles["speculation_words"]:
         # Exact match against the speculation word list
         if tokTxt in self.specWords:
             features["_spec"] = 1
             features["_spec_" + tokTxt] = 1
         # Stem match against the stemmed speculation word list
         tokStem = PorterStemmer.stem(tokTxt)
         if tokStem in self.specWordStems:
             features["_spec_stem"] = 1
             features["_spec_stem_" + tokStem] = 1
     if sentenceGraph.tokenIsName[token]:
         features["_given"] = 1
         # Only entities explicitly marked as given contribute their type
         for entity in sentenceGraph.tokenIsEntityHead[token]:
             if entity.get("given") == "True":
                 features["_annType_" + entity.get("type")] = 1
     # Gazetteer features are currently disabled: the loop body is a no-op
     # and is kept only so the feature can be switched back on.
     if self.gazetteer and tokTxt.lower() in self.gazetteer:
         for label, weight in self.gazetteer[tokTxt.lower()].items():
             pass
             #features["_knownLabel_"+label]=weight
     self.tokenFeatures[token] = features
     return features

Пример #2

Показать файл

 def getTokenFeatures(self, token, sentenceGraph):
     """
     Returns a dictionary of features based on the attributes of a token.
     These can be used to define more complex features.

     Results are memoized in self.tokenFeatures (keyed by the token), so a
     second call for the same token returns the cached dictionary.

     @param token: the token element; must be usable as a dictionary key
         and support get("POS")
     @param sentenceGraph: the owning sentence; getTokenText(), tokenIsName
         and tokenIsEntityHead are read from it
     @return: a dictionary of feature name -> 1
     """
     # Cache lookup. dict.has_key() was replaced by the "in" operator
     # because has_key() is deprecated in Python 2 and removed in Python 3.
     if token in self.tokenFeatures:
         return self.tokenFeatures[token]
     tokTxt = sentenceGraph.getTokenText(token)
     features = {}
     features["_txt_" + tokTxt] = 1
     features["_POS_" + token.get("POS")] = 1
     if self.styles["speculation_words"]:
         # The surface form and its Porter stem are matched independently
         if tokTxt in self.specWords:
             features["_spec"] = 1
             features["_spec_" + tokTxt] = 1
         tokStem = PorterStemmer.stem(tokTxt)
         if tokStem in self.specWordStems:
             features["_spec_stem"] = 1
             features["_spec_stem_" + tokStem] = 1
     if sentenceGraph.tokenIsName[token]:
         features["_given"] = 1
         for entity in sentenceGraph.tokenIsEntityHead[token]:
             # The attribute is compared as the string "True", not a boolean
             if entity.get("given") == "True":
                 features["_annType_" + entity.get("type")] = 1
     # The gazetteer lookup is intentionally inert: its only statement is
     # commented out, so no "_knownLabel_" features are produced.
     if self.gazetteer and tokTxt.lower() in self.gazetteer:
         for label, weight in self.gazetteer[tokTxt.lower()].items():
             pass
             #features["_knownLabel_"+label]=weight
     self.tokenFeatures[token] = features
     return features

Пример #3

Показать файл

Файл: DetectHeads.py Проект: ninjin/TEES

def mapSplits(splits, string, stringOffset):
    """
    Locates each substring inside a string, in order, and stems it.

    Each substring is searched for starting where the previous match ended.
    The result holds one (substring, stem, (offset, length)) tuple per
    substring. The stringOffset argument is accepted but not used here.
    """
    mapped = []
    cursor = 0
    for piece in splits:
        position = string.find(piece, cursor)
        # every substring has to be present after the previous match
        assert position != -1
        pieceLength = len(piece)
        stemmed = PorterStemmer.stem(piece)
        mapped.append((piece, stemmed, (position, pieceLength)))
        cursor = position + pieceLength
    return mapped

Пример #4

Показать файл

Файл: FeatureBuilder.py Проект: sbnlp/2017BioNLPEvaluation

 def getTokenFeatures(self, token, sentenceGraph, text=True, POS=True, annotatedType=True, stem=False, ontology=True):
     """
     Token features are features describing an isolated word token. These subfeatures are often merged into
     such features like n-grams. This method produces and caches a set of feature names for a token in
     the sentenceGraph sentence. The various flags can be used to choose which attributes will be included in the
     feature name list.

     The cache key is the token id concatenated with the string form of every flag, so each
     flag combination is cached separately in self.tokenFeatures.

     @type token: cElementTree.Element
     @param token: a word token
     @type sentenceGraph: SentenceGraph
     @param sentenceGraph: the sentence to which the token belongs
     @type text: boolean
     @param text: include "txt_" features
     @type POS: boolean
     @param POS: include "POS_" features
     @type annotatedType: boolean
     @param annotatedType: include "annType_" features (unless self.noAnnType is set)
     @type stem: boolean
     @param stem: include a "stem_" feature built with PorterStemmer
     @type ontology: boolean
     @param ontology: include the parents returned by self.ontologyFeatureBuilder (only with annotatedType)
     @return: the list of feature names
     """
     callId = token.get("id") + str(text) + str(POS) + str(annotatedType) + str(stem) + str(ontology)
     # "in" replaces dict.has_key(), which is deprecated in Python 2 and removed in Python 3
     if callId in self.tokenFeatures:
         return self.tokenFeatures[callId]

     featureList = []
     if text:
         featureList.append("txt_"+sentenceGraph.getTokenText(token))
         if (not self.maskNamedEntities) and sentenceGraph.tokenIsName[token]:
             featureList.append("txt_"+token.get("text"))
     if POS:
         pos = token.get("POS")
         # Fixed: the old test "pos.find('_') != None" was always true, because str.find()
         # returns -1 (never None) when nothing is found. A tag without an underscore was
         # therefore "split" into itself and "POS_"+pos was appended twice.
         if "_" in pos and self.maximum:
             for split in pos.split("_"):
                 featureList.append("POS_"+split)
         featureList.append("POS_"+pos)
         #if self.getPOSSuperType(pos) != "":
         #    featureList.append("POSX_"+self.getPOSSuperType(pos))
     if annotatedType and not self.noAnnType:
         annTypes = self.getTokenAnnotatedType(token, sentenceGraph)
         # The "noAnnType" placeholder is only kept when self.maximum is set
         if "noAnnType" in annTypes and not self.maximum:
             annTypes.remove("noAnnType")
         for annType in annTypes:
             featureList.append("annType_"+annType)
         if ontology and (self.ontologyFeatureBuilder is not None):
             for annType in annTypes:
                 featureList.extend(self.ontologyFeatureBuilder.getParents(annType))
     if stem:
         featureList.append("stem_" + PorterStemmer.stem(sentenceGraph.getTokenText(token)))

     if self.style is not None and self.style["metamap"]:
         # Keys are sorted so the feature order does not depend on dictionary ordering
         metamapFeatureDict = {}
         self.getMetaMapFeatures(token, sentenceGraph, metamapFeatureDict)
         featureList.extend(sorted(metamapFeatureDict.keys()))

     self.tokenFeatures[callId] = featureList
     return featureList

Пример #5

Показать файл

Файл: DetectHeads.py Проект: sbnlp/2017BioNLPEvaluation

def mapSplits(splits, string, stringOffset):
    """
    Finds consecutive substrings in a string and pairs each with its stem.

    The search for a substring starts at the end of the previous match.
    Returns a list of (substring, stem, (offset, length)) tuples; the
    stringOffset parameter is not read by this function.
    """
    results = []
    searchFrom = 0
    for part in splits:
        foundAt = string.find(part, searchFrom)
        # a substring that cannot be located is treated as a hard error
        assert foundAt != -1
        span = (foundAt, len(part))
        results.append((part, PorterStemmer.stem(part), span))
        searchFrom = foundAt + len(part)
    return results

Пример #6

Показать файл

Файл: ModifierExampleBuilder.py Проект: DUT-LiuYang/TEES

def readWords(words):
    """
    Builds a set of words and the set of their Porter stems.

    @param words: either the path of a text file with one word per line,
        or an iterable of words
    @return: a (wordSet, stemSet) tuple
    """
    # types.StringTypes only exists in Python 2; fall back to str elsewhere.
    stringTypes = getattr(types, "StringTypes", (str,))
    if isinstance(words, stringTypes):
        # Fixed: the file was opened through the undefined name "filename",
        # which raised a NameError; the path is the "words" argument itself.
        wordSet = set()
        # "with" closes the file even if reading fails
        with open(words) as f:
            for line in f:
                wordSet.add(line.strip())
    else: # assume it's a list
        wordSet = set(words)
    stemSet = set()
    for word in wordSet:
        stemSet.add(PorterStemmer.stem(word))
    return wordSet, stemSet

Пример #7

Показать файл

def readWords(words):
    """
    Collects words and their Porter stems into two sets.

    @param words: a path to a text file containing one word per line, or an
        iterable of words
    @return: a (wordSet, stemSet) tuple
    """
    # types.StringTypes is Python 2 only; use str when it is not available.
    stringTypes = getattr(types, "StringTypes", (str,))
    if isinstance(words, stringTypes):
        wordSet = set()
        # Fixed: open(filename) referenced a name that is never defined;
        # the path to read is the "words" argument. The context manager
        # also guarantees the file is closed if reading raises.
        with open(words) as f:
            for line in f:
                wordSet.add(line.strip())
    else:  # assume it's a list
        wordSet = set(words)
    stemSet = set()
    for word in wordSet:
        stemSet.add(PorterStemmer.stem(word))
    return wordSet, stemSet

Пример #8

Показать файл

Файл: DetectHeads.py Проект: ninjin/TEES

def getTriggers(corpus):
    """
    Returns a dictionary of "entity type"->"stemmed entity text"->"count"

    Entities whose "isName" attribute equals the string "True" are skipped.
    The text of every other entity is stemmed with PorterStemmer before it
    is counted.

    @param corpus: anything accepted by ETUtils.ETFromObj
    @return: a nested dictionary of counts
    """
    corpus = ETUtils.ETFromObj(corpus)
    trigDict = {}
    for entity in corpus.getroot().getiterator("entity"):
        if entity.get("isName") == "True":
            continue
        eType = entity.get("type")
        # setdefault()/get() replace the dict.has_key() checks; has_key()
        # is deprecated in Python 2 and removed in Python 3.
        typeCounts = trigDict.setdefault(eType, {})
        eText = PorterStemmer.stem(entity.get("text"))
        typeCounts[eText] = typeCounts.get(eText, 0) + 1
    return trigDict

Пример #9

Показать файл

Файл: DetectHeads.py Проект: sbnlp/2017BioNLPEvaluation

def getTriggers(corpus):
    """
    Returns a dictionary of "entity type"->"stemmed entity text"->"count"

    Entities whose "given" attribute equals the string "True" are skipped.
    The text of every remaining entity is stemmed with PorterStemmer and
    counted under its type.

    @param corpus: anything accepted by ETUtils.ETFromObj
    @return: a nested dictionary of counts
    """
    corpus = ETUtils.ETFromObj(corpus)
    trigDict = {}
    for entity in corpus.getroot().getiterator("entity"):
        if entity.get("given") == "True":
            continue
        eType = entity.get("type")
        # dict.has_key() is deprecated in Python 2 and removed in Python 3,
        # so the per-type dictionary and the counter use setdefault()/get().
        typeCounts = trigDict.setdefault(eType, {})
        eText = PorterStemmer.stem(entity.get("text"))
        typeCounts[eText] = typeCounts.get(eText, 0) + 1
    return trigDict

Пример #10

Показать файл

    def buildFeatures(self, token, linear=True, chains=True):
        """
        Sets the features that describe one token of self.sentenceGraph.

        The token is located in the sentence by equality and every feature
        is registered through self.setFeature(name, 1).

        @param token: the token to describe; must be equal to one of
            self.sentenceGraph.tokens
        @param linear: when true, also build linear order features for the
            tokens up to three positions before and after
        @param chains: when true, also call self.buildChains for the token
        """
        graph = self.sentenceGraph

        # Find the first sentence token that compares equal to the argument
        tokenIndex = None
        for position, candidate in enumerate(self.sentenceGraph.tokens):
            if token == candidate:
                tokenIndex = position
                break
        assert tokenIndex is not None
        token = self.sentenceGraph.tokens[tokenIndex]

        self.setFeature(self.namedEntityCountFeature, 1)

        # Surface form, part of speech, stem and the remainder after the stem
        text = token.get("text")
        self.setFeature("txt_" + text, 1)
        self.setFeature("POS_" + token.get("POS"), 1)
        wordStem = PorterStemmer.stem(text)
        self.setFeature("stem_" + wordStem, 1)
        self.setFeature("nonstem_" + text[len(wordStem):], 1)

        # Neighbouring tokens. The lower bound is strict, so the token at
        # index 0 is never used as a neighbour.
        if linear:
            for delta in (-3, -2, -1, 1, 2, 3):
                neighbour = tokenIndex + delta
                if 0 < neighbour < len(graph.tokens):
                    self.buildLinearOrderFeatures(graph, neighbour, str(delta))

        # Character level content; capitalisation is ignored for the first
        # token of the sentence
        if tokenIndex > 0 and text[0].isalpha() and text[0].isupper():
            self.setFeature("upper_case_start", 1)
        punctuationFeatures = {"-": "has_hyphen", "/": "has_fslash", "\\": "has_bslash"}
        for charIndex, char in enumerate(text):
            if charIndex > 0 and char.isalpha() and char.isupper():
                self.setFeature("upper_case_middle", 1)
            if char.isdigit():
                self.setFeature("has_digits", 1)
                if charIndex > 0 and text[charIndex - 1] == "-":
                    self.setFeature("has_hyphenated_digit", 1)
            elif char in punctuationFeatures:
                self.setFeature(punctuationFeatures[char], 1)
            # character bigrams and trigrams ending at this character
            if charIndex > 0:
                self.setFeature("dt_" + text[charIndex - 1:charIndex + 1].lower(), 1)
            if charIndex > 1:
                self.setFeature("tt_" + text[charIndex - 2:charIndex + 1].lower(), 1)

        if chains:
            self.buildChains(token, graph)

Пример #11

Показать файл

Файл: EntityExampleBuilder.py Проект: DUT-LiuYang/TEES

    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
        """
        Build one example for each token of the sentence

        Every token that is not filtered out gets a feature dictionary
        (feature id -> value) and is written to outfile through
        ExampleUtils.appendExamples as a tuple of
        (example id, class id, features, extra information).

        Side effects visible in this method: self.tokenFeatures,
        self.tokenFeatureWeights, self.inEdgesByToken, self.outEdgesByToken
        and self.edgeSetByToken are reset, self.exampleStats is updated for
        every token, and when the "names" style is on, every value of
        sentenceGraph.tokenIsName is set to False.

        @param sentenceGraph: the sentence whose tokens are turned into examples
        @param outfile: passed on to ExampleUtils.appendExamples
        @param goldGraph: not used in this method
        @param structureAnalyzer: optional; when given, it is asked whether the
            "GIVEN" group contains "ENTITY" to decide if sentences without
            given entities should still be processed
        @return: the number of examples written, or 0 if the sentence was skipped
        """
        # Sentences on the skip list produce no examples at all
        if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
            print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get("origId") 
            return 0 #[]
        
        #examples = []
        exampleIndex = 0
        
        # Per-sentence caches are started from scratch
        self.tokenFeatures = {}
        self.tokenFeatureWeights = {}
        
        # determine (manually or automatically) the setting for whether sentences with no given entities should be skipped
        # The three checks run in order, so "skip_for_nameless" has the final word
        buildForNameless = False
        if structureAnalyzer and not structureAnalyzer.hasGroupClass("GIVEN", "ENTITY"): # no given entities points to no separate NER program being used
            buildForNameless = True
        if self.styles["build_for_nameless"]: # manually force the setting
            buildForNameless = True
        if self.styles["skip_for_nameless"]: # manually force the setting
            buildForNameless = False
        
        # determine whether sentences with no given entities should be skipped
        namedEntityHeadTokens = []
        if not self.styles["names"]:
            namedEntityCount = 0
            for entity in sentenceGraph.entities:
                if entity.get("given") == "True": # known data which can be used for features
                    namedEntityCount += 1
            # Only defined on this branch; it is read again below under the same "names" condition
            namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
            # NOTE!!! This will change the number of examples and omit
            # all triggers (positive and negative) from sentences which
            # have no NE:s, possibly giving a too-optimistic performance
            # value. Such sentences can still have triggers from intersentence
            # interactions, but as such events cannot be recovered anyway,
            # looking for these triggers would be pointless.
            if namedEntityCount == 0 and not buildForNameless: # no names, no need for triggers
                return 0 #[]
            
            if self.styles["pos_pairs"]:
                namedEntityHeadTokens = self.getNamedEntityHeadTokens(sentenceGraph)
        else:
            # With the "names" style no token is treated as a name (this modifies sentenceGraph)
            for key in sentenceGraph.tokenIsName.keys():
                sentenceGraph.tokenIsName[key] = False
        
        # Count the token texts of the whole sentence (bag of words)
        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                # text already carries the "bow_" prefix, so the key becomes "ne_bow_<text>"
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
        # Feature ids are requested in sorted key order; the same dictionary is merged into every example
        bowFeatures = {}
        for k in sorted(bagOfWords.keys()):
            bowFeatures[self.featureSet.getId(k)] = bagOfWords[k]
        
        # Cache the incoming and outgoing dependency edges of every token
        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            #inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
            #fixedInEdges = []
            #for edge in inEdges:
            #    fixedInEdges.append( (edge[0], edge[1], edge[2]["element"]) )
            #inEdges = fixedInEdges
            inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
            #inEdges.sort(compareDependencyEdgesById)
            self.inEdgesByToken[token] = inEdges
            #outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True)
            #fixedOutEdges = []
            #for edge in outEdges:
            #    fixedOutEdges.append( (edge[0], edge[1], edge[2]["element"]) )
            #outEdges = fixedOutEdges
            outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
            #outEdges.sort(compareDependencyEdgesById)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)
        
        # Main loop: one candidate example per token
        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]

            # CLASS
            # Tokens that head no entity become negative ("neg") examples
            if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
                categoryName, entityIds = self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token])
            else:
                categoryName, entityIds = "neg", None
            # Every beginExample is paired with an endExample on each exit path of the iteration
            self.exampleStats.beginExample(categoryName)
            
            # Recognize only non-named entities (i.e. interaction words)
            if sentenceGraph.tokenIsName[token] and not self.styles["names"] and not self.styles["all_tokens"]:
                self.exampleStats.filter("name")
                self.exampleStats.endExample()
                continue
#            if "selftrain_limits" in self.styles:
#                # any predicted entity not part of the self-training set causes example to be rejected
#                filtered = False
#                for entity in sentenceGraph.tokenIsEntityHead[token]:
#                    if entity.get("selftrain") == "False":
#                        self.exampleStats.filter("selftrain_limits")
#                        self.exampleStats.endExample()
#                        filtered = True
#                        break
#                if filtered:
#                    continue
#            if "selftrain_group" in self.styles:
#                # any predicted entity not part of the self-training set causes example to be rejected
#                filtered = False
#                for entity in sentenceGraph.tokenIsEntityHead[token]:
#                    if entity.get("selftraingroup") not in self.selfTrainGroups:
#                        self.exampleStats.filter("selftrain_group")
#                        self.exampleStats.endExample()
#                        filtered = True
#                        break
#                if filtered:
#                    continue
            # With "pos_only" the negative examples are dropped
            if self.styles["pos_only"] and categoryName == "neg":
                self.exampleStats.filter("pos_only")
                self.exampleStats.endExample()
                continue

            # Tokens whose class name has no id in the class set are dropped
            category = self.classSet.getId(categoryName)
            if category == None:
                self.exampleStats.filter("undefined_class")
                self.exampleStats.endExample()
                continue           
            
            # This value is only read by the commented-out gazetteer code below;
            # tokenText is reassigned inside the edge loops further down
            tokenText = token.get("text").lower()
#            if "stem_gazetteer" in self.styles:
#                tokenText = PorterStemmer.stem(tokenText)
#            if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer:
#                features = {}
#                features[self.featureSet.getId("exclude_gazetteer")] = 1
#                extra = {"xtype":"token","t":token.get("id"),"excluded":"True"}
#                if entityIds != None:
#                    extra["goldIds"] = entityIds
#                #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
#                ExampleUtils.appendExamples([(sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)], outfile)
#                exampleIndex += 1
#                continue
            
            # FEATURES
            features = {}
            
            if not self.styles["names"]:
                features[self.featureSet.getId(namedEntityCountFeature)] = 1
            #for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # pre-calculate bow _features_
            features.update(bowFeatures)
            
#            for j in range(len(sentenceGraph.tokens)):
#                text = "bow_" + sentenceGraph.tokens[j].get("text")
#                if j < i:
#                    features[self.featureSet.getId("bf_" + text)] = 1
#                elif j > i:
#                    features[self.featureSet.getId("af_" + text)] = 1
        
            # Main features
            text = token.get("text")
            features[self.featureSet.getId("txt_"+text)] = 1
            features[self.featureSet.getId("POS_"+token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_"+stem)] = 1
            # "nonstem_" is whatever follows the first len(stem) characters of the text
            features[self.featureSet.getId("nonstem_"+text[len(stem):])] = 1

            # Normalized versions of the string (if same as non-normalized, overlap without effect)
            normalizedText = text.replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower()
            if normalizedText == "bound": # should be for all irregular verbs
                normalizedText = "bind"
            features[self.featureSet.getId("txt_"+normalizedText)] = 1
            # norStem is reused in the attached edge features below
            norStem = PorterStemmer.stem(normalizedText)
            features[self.featureSet.getId("stem_"+norStem)] = 1
            features[self.featureSet.getId("nonstem_"+normalizedText[len(norStem):])] = 1
            
            ## Subspan features
            #textLower = text.lower()
            #for i in range(1, len(textLower)):
            #    features[self.featureSet.getId("subspanbegin"+str(i)+"_"+textLower[0:i])] = 1
            #    features[self.featureSet.getId("subspanend"+str(i)+"_"+textLower[-i:])] = 1
            
            # Substring features
            # The text is split on hyphens; each part and its stem become features
            for string in text.split("-"):
                stringLower = string.lower()
                features[self.featureSet.getId("substring_"+stringLower)] = 1
                features[self.featureSet.getId("substringstem_"+PorterStemmer.stem(stringLower))] = 1
            
            # Linear order features
            # NOTE(review): the strict "> 0" means the token at index 0 is never used
            # as a neighbour - confirm whether ">= 0" was intended
            for index in [-3,-2,-1,1,2,3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)

            # Linear n-grams
            # Called with the start indices i-1 and i-2, clamped to 0
            if self.styles["linear_ngrams"]:
                self.buildLinearNGram(max(0, i-1), i, sentenceGraph, features)
                self.buildLinearNGram(max(0, i-2), i, sentenceGraph, features)
            
            if self.styles["phospho"]:
                # "hospho" matches regardless of the case of the leading letter
                if text.find("hospho") != -1:
                    features[self.featureSet.getId("phospho_found")] = 1
                features[self.featureSet.getId("begin_"+text[0:2].lower())] = 1
                features[self.featureSet.getId("begin_"+text[0:3].lower())] = 1
                
            if self.styles["bb_features"]:
                if text.lower() in self.bacteriaTokens:
                    features[self.featureSet.getId("lpsnBacToken")] = 1

            # Content
            # The capitalisation of the first token of the sentence (i == 0) is ignored
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j-1] == "-":
                        features[self.featureSet.getId("has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_"+text[j-1:j+1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_"+text[j-2:j+1].lower())] = 1
                # quadruplets (don't work, slight decrease (0.5 pp) on f-score
                #if j > 2:
                #    features[self.featureSet.getId("qt_"+text[j-3:j+1].lower())] = 1
            
            # Attached edges (Hanging in and out edges)
            # For incoming edges the other token is edge[0]; edge[2] carries the "type" attribute
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_"+edgeType)] = 1
                features[self.featureSet.getId("t1HIn_"+edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_"+tokenText)] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenText)] = 1
                tokenStem = PorterStemmer.stem(tokenText)
                features[self.featureSet.getId("t1HIn_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HIn_"+norStem+"_"+edgeType+"_"+tokenStem)] = 1
            # For outgoing edges the other token is edge[1]
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_"+edgeType)] = 1
                features[self.featureSet.getId("t1HOut_"+edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_"+tokenText)] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenText)] = 1
                tokenStem = PorterStemmer.stem(tokenText)
                features[self.featureSet.getId("t1HOut_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HOut_"+norStem+"_"+edgeType+"_"+tokenStem)] = 1
            
            # REL features
            # The external builders below are handed the feature dictionary and detached again afterwards
            if self.styles["rel_features"]:
                self.relFeatureBuilder.setFeatureVector(features)
                self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, i)
                self.relFeatureBuilder.setFeatureVector(None)
            
            # DDI13 features
            # Every prefix and every suffix of the normalized text becomes a feature
            if self.styles["ddi13_features"]:
                for index in range(len(normalizedText)):
                    features[self.featureSet.getId("ddi13_fromstart" + str(index) + "_" + normalizedText[:index+1])] = 1
                    features[self.featureSet.getId("ddi13_fromend" + str(index) + "_" + normalizedText[index:])] = 1
            if self.styles["drugbank_features"]:
                self.drugFeatureBuilder.setFeatureVector(features)
                self.drugFeatureBuilder.tag = "ddi_"
                self.drugFeatureBuilder.buildDrugFeatures(token)  
                self.drugFeatureBuilder.setFeatureVector(None)
            
            #self.wordNetFeatureBuilder.getTokenFeatures("show", "VBP")
            #tokTxt = token.get("text")
            #tokPOS = token.get("POS")
            #wordNetFeatures = []
            #wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            #self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            if self.styles["wordnet"]:
                tokTxt = token.get("text")
                tokPOS = token.get("POS")
                wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
                for wordNetFeature in wordNetFeatures:
                    #print wordNetFeature,
                    features[self.featureSet.getId("WN_"+wordNetFeature)] = 1
                #print
            
            if self.styles["giuliano"]:
                self.giulianoFeatureBuilder.setFeatureVector(features)
                self.giulianoFeatureBuilder.buildTriggerFeatures(token, sentenceGraph)
                self.giulianoFeatureBuilder.setFeatureVector(None)
                             
            # Extra information stored with the example
            extra = {"xtype":"token","t":token.get("id")}
            if self.styles["bb_features"]:
                extra["trigex"] = "bb" # Request trigger extension in ExampleWriter
            if self.styles["epi_merge_negated"]:
                extra["unmergeneg"] = "epi" # Request trigger type unmerging
            if entityIds != None:
                extra["goldIds"] = entityIds # The entities to which this example corresponds
            #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
            
            # chains
            self.buildChains(token, sentenceGraph, features)
            
            if self.styles["pos_pairs"]:
                self.buildPOSPairs(token, namedEntityHeadTokens, features)
            
            # The example id is the sentence id followed by ".x" and the running example index;
            # the example is written out immediately instead of being collected in a list
            example = (sentenceGraph.getSentenceId()+".x"+str(exampleIndex), category, features, extra)
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1
            self.exampleStats.endExample()
        #return examples
        # The index now equals the number of examples written for this sentence
        return exampleIndex

Пример #12

Показать файл

    def buildExamplesFromGraph(self,
                               sentenceGraph,
                               outfile,
                               goldGraph=None,
                               structureAnalyzer=None):
        """
        Build one example for each token of the sentence
        """
        if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
            print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get(
                "origId")
            return 0  #[]

        #examples = []
        exampleIndex = 0

        self.tokenFeatures = {}
        self.tokenFeatureWeights = {}

        # determine (manually or automatically) the setting for whether sentences with no given entities should be skipped
        buildForNameless = False
        if structureAnalyzer and not structureAnalyzer.hasGroupClass(
                "GIVEN", "ENTITY"
        ):  # no given entities points to no separate NER program being used
            buildForNameless = True
        if self.styles["build_for_nameless"]:  # manually force the setting
            buildForNameless = True
        if self.styles["skip_for_nameless"]:  # manually force the setting
            buildForNameless = False

        # determine whether sentences with no given entities should be skipped
        namedEntityHeadTokens = []
        if not self.styles["names"]:
            namedEntityCount = 0
            for entity in sentenceGraph.entities:
                if entity.get(
                        "given"
                ) == "True":  # known data which can be used for features
                    namedEntityCount += 1
            namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
            # NOTE!!! This will change the number of examples and omit
            # all triggers (positive and negative) from sentences which
            # have no NE:s, possibly giving a too-optimistic performance
            # value. Such sentences can still have triggers from intersentence
            # interactions, but as such events cannot be recovered anyway,
            # looking for these triggers would be pointless.
            if namedEntityCount == 0 and not buildForNameless:  # no names, no need for triggers
                return 0  #[]

            if self.styles["pos_pairs"]:
                namedEntityHeadTokens = self.getNamedEntityHeadTokens(
                    sentenceGraph)
        else:
            for key in sentenceGraph.tokenIsName.keys():
                sentenceGraph.tokenIsName[key] = False

        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
        bowFeatures = {}
        for k in sorted(bagOfWords.keys()):
            bowFeatures[self.featureSet.getId(k)] = bagOfWords[k]

        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            #inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
            #fixedInEdges = []
            #for edge in inEdges:
            #    fixedInEdges.append( (edge[0], edge[1], edge[2]["element"]) )
            #inEdges = fixedInEdges
            inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
            #inEdges.sort(compareDependencyEdgesById)
            self.inEdgesByToken[token] = inEdges
            #outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True)
            #fixedOutEdges = []
            #for edge in outEdges:
            #    fixedOutEdges.append( (edge[0], edge[1], edge[2]["element"]) )
            #outEdges = fixedOutEdges
            outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
            #outEdges.sort(compareDependencyEdgesById)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)

        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]

            # CLASS
            if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
                categoryName, entityIds = self.getMergedEntityType(
                    sentenceGraph.tokenIsEntityHead[token])
            else:
                categoryName, entityIds = "neg", None
            self.exampleStats.beginExample(categoryName)

            # Recognize only non-named entities (i.e. interaction words)
            if sentenceGraph.tokenIsName[token] and not self.styles[
                    "names"] and not self.styles["all_tokens"]:
                self.exampleStats.filter("name")
                self.exampleStats.endExample()
                continue
#            if "selftrain_limits" in self.styles:
#                # any predicted entity not part of the self-training set causes example to be rejected
#                filtered = False
#                for entity in sentenceGraph.tokenIsEntityHead[token]:
#                    if entity.get("selftrain") == "False":
#                        self.exampleStats.filter("selftrain_limits")
#                        self.exampleStats.endExample()
#                        filtered = True
#                        break
#                if filtered:
#                    continue
#            if "selftrain_group" in self.styles:
#                # any predicted entity not part of the self-training set causes example to be rejected
#                filtered = False
#                for entity in sentenceGraph.tokenIsEntityHead[token]:
#                    if entity.get("selftraingroup") not in self.selfTrainGroups:
#                        self.exampleStats.filter("selftrain_group")
#                        self.exampleStats.endExample()
#                        filtered = True
#                        break
#                if filtered:
#                    continue
            if self.styles["pos_only"] and categoryName == "neg":
                self.exampleStats.filter("pos_only")
                self.exampleStats.endExample()
                continue

            category = self.classSet.getId(categoryName)
            if category == None:
                self.exampleStats.filter("undefined_class")
                self.exampleStats.endExample()
                continue

            tokenText = token.get("text").lower()
            #            if "stem_gazetteer" in self.styles:
            #                tokenText = PorterStemmer.stem(tokenText)
            #            if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer:
            #                features = {}
            #                features[self.featureSet.getId("exclude_gazetteer")] = 1
            #                extra = {"xtype":"token","t":token.get("id"),"excluded":"True"}
            #                if entityIds != None:
            #                    extra["goldIds"] = entityIds
            #                #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
            #                ExampleUtils.appendExamples([(sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)], outfile)
            #                exampleIndex += 1
            #                continue

            # FEATURES
            features = {}

            if not self.styles["names"]:
                features[self.featureSet.getId(namedEntityCountFeature)] = 1
            #for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # pre-calculate bow _features_
            features.update(bowFeatures)

            #            for j in range(len(sentenceGraph.tokens)):
            #                text = "bow_" + sentenceGraph.tokens[j].get("text")
            #                if j < i:
            #                    features[self.featureSet.getId("bf_" + text)] = 1
            #                elif j > i:
            #                    features[self.featureSet.getId("af_" + text)] = 1

            # Main features
            text = token.get("text")
            features[self.featureSet.getId("txt_" + text)] = 1
            features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_" + stem)] = 1
            features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1

            # Normalized versions of the string (if same as non-normalized, overlap without effect)
            normalizedText = text.replace("-", "").replace("/", "").replace(
                ",", "").replace("\\", "").replace(" ", "").lower()
            if normalizedText == "bound":  # should be for all irregular verbs
                normalizedText = "bind"
            features[self.featureSet.getId("txt_" + normalizedText)] = 1
            norStem = PorterStemmer.stem(normalizedText)
            features[self.featureSet.getId("stem_" + norStem)] = 1
            features[self.featureSet.getId("nonstem_" +
                                           normalizedText[len(norStem):])] = 1

            ## Subspan features
            #textLower = text.lower()
            #for i in range(1, len(textLower)):
            #    features[self.featureSet.getId("subspanbegin"+str(i)+"_"+textLower[0:i])] = 1
            #    features[self.featureSet.getId("subspanend"+str(i)+"_"+textLower[-i:])] = 1

            # Substring features
            for string in text.split("-"):
                stringLower = string.lower()
                features[self.featureSet.getId("substring_" + stringLower)] = 1
                features[self.featureSet.getId(
                    "substringstem_" + PorterStemmer.stem(stringLower))] = 1

            if not self.styles["no_context"]:
                # Linear order features
                for index in [-3, -2, -1, 1, 2, 3]:
                    if i + index > 0 and i + index < len(sentenceGraph.tokens):
                        self.buildLinearOrderFeatures(sentenceGraph, i + index,
                                                      str(index), features)

                # Linear n-grams
                if self.styles["linear_ngrams"]:
                    self.buildLinearNGram(max(0, i - 1), i, sentenceGraph,
                                          features)
                    self.buildLinearNGram(max(0, i - 2), i, sentenceGraph,
                                          features)

            if self.styles["phospho"]:
                if text.find("hospho") != -1:
                    features[self.featureSet.getId("phospho_found")] = 1
                features[self.featureSet.getId("begin_" +
                                               text[0:2].lower())] = 1
                features[self.featureSet.getId("begin_" +
                                               text[0:3].lower())] = 1

            if self.styles["bb_features"]:
                if text.lower() in self.bacteriaTokens:
                    features[self.featureSet.getId("lpsnBacToken")] = 1

            # Content
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j - 1] == "-":
                        features[self.featureSet.getId(
                            "has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_" +
                                                   text[j - 1:j +
                                                        1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_" +
                                                   text[j - 2:j +
                                                        1].lower())] = 1
                # quadruplets (don't work, slight decrease (0.5 pp) on f-score
                #if j > 2:
                #    features[self.featureSet.getId("qt_"+text[j-3:j+1].lower())] = 1

            # Attached edges (Hanging in and out edges)
            if not self.styles["no_context"]:
                t1InEdges = self.inEdgesByToken[token]
                for edge in t1InEdges:
                    edgeType = edge[2].get("type")
                    features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
                    features[self.featureSet.getId("t1HIn_" +
                                                   edge[0].get("POS"))] = 1
                    features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                                   edge[0].get("POS"))] = 1
                    tokenText = sentenceGraph.getTokenText(edge[0])
                    features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
                    features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                                   tokenText)] = 1
                    tokenStem = PorterStemmer.stem(tokenText)
                    features[self.featureSet.getId("t1HIn_" + tokenStem)] = 1
                    features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                                   tokenStem)] = 1
                    features[self.featureSet.getId("t1HIn_" + norStem + "_" +
                                                   edgeType + "_" +
                                                   tokenStem)] = 1
                t1OutEdges = self.outEdgesByToken[token]
                for edge in t1OutEdges:
                    edgeType = edge[2].get("type")
                    features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
                    features[self.featureSet.getId("t1HOut_" +
                                                   edge[1].get("POS"))] = 1
                    features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                                   edge[1].get("POS"))] = 1
                    tokenText = sentenceGraph.getTokenText(edge[1])
                    features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
                    features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                                   tokenText)] = 1
                    tokenStem = PorterStemmer.stem(tokenText)
                    features[self.featureSet.getId("t1HOut_" + tokenStem)] = 1
                    features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                                   tokenStem)] = 1
                    features[self.featureSet.getId("t1HOut_" + norStem + "_" +
                                                   edgeType + "_" +
                                                   tokenStem)] = 1

            # REL features
            if self.styles["rel_features"]:
                self.relFeatureBuilder.setFeatureVector(features)
                self.relFeatureBuilder.buildAllFeatures(
                    sentenceGraph.tokens, i)
                self.relFeatureBuilder.setFeatureVector(None)

            # DDI13 features
            if self.styles["ddi13_features"]:
                for index in range(len(normalizedText)):
                    features[self.featureSet.getId("ddi13_fromstart" +
                                                   str(index) + "_" +
                                                   normalizedText[:index +
                                                                  1])] = 1
                    features[self.featureSet.getId("ddi13_fromend" +
                                                   str(index) + "_" +
                                                   normalizedText[index:])] = 1
            if self.styles["drugbank_features"]:
                self.drugFeatureBuilder.setFeatureVector(features)
                self.drugFeatureBuilder.tag = "ddi_"
                self.drugFeatureBuilder.buildDrugFeatures(token)
                self.drugFeatureBuilder.setFeatureVector(None)

            #self.wordNetFeatureBuilder.getTokenFeatures("show", "VBP")
            #tokTxt = token.get("text")
            #tokPOS = token.get("POS")
            #wordNetFeatures = []
            #wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            #self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            if self.styles["wordnet"]:
                tokTxt = token.get("text")
                tokPOS = token.get("POS")
                wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(
                    tokTxt, tokPOS)
                for wordNetFeature in wordNetFeatures:
                    #print wordNetFeature,
                    features[self.featureSet.getId("WN_" + wordNetFeature)] = 1
                #print

            if self.styles["giuliano"]:
                self.giulianoFeatureBuilder.setFeatureVector(features)
                self.giulianoFeatureBuilder.buildTriggerFeatures(
                    token, sentenceGraph)
                self.giulianoFeatureBuilder.setFeatureVector(None)

            if self.styles["ontobiotope_features"]:
                self.ontobiotopeFeatureBuilder.setFeatureVector(features)
                self.ontobiotopeFeatureBuilder.buildOBOFeaturesForToken(token)
                self.ontobiotopeFeatureBuilder.setFeatureVector(None)

            extra = {"xtype": "token", "t": token.get("id")}
            if self.styles["bb_features"]:
                extra[
                    "trigex"] = "bb"  # Request trigger extension in ExampleWriter
            if self.styles["epi_merge_negated"]:
                extra["unmergeneg"] = "epi"  # Request trigger type unmerging
            if entityIds != None:
                extra[
                    "goldIds"] = entityIds  # The entities to which this example corresponds
            #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )

            if self.styles["bb_spans"]:
                for span in sentenceGraph.sentenceElement.iter("span"):
                    if span.get("headOffset") != token.get("charOffset"):
                        continue
                    #if span.get("source") != "spec":
                    #    continue
                    #print span.get("headOffset"), token.get("charOffset"), span.get("source"), token.get("id")
                    features[self.featureSet.getId("span_found")] = 1
                    features[self.featureSet.getId(
                        "span_count")] = 1 + features.get(
                            self.featureSet.getId("span_count"), 0)
                    features[self.featureSet.getId("span_identifier" +
                                                   span.get("identifier"))] = 1
                    features[self.featureSet.getId("span_type" +
                                                   span.get("type"))] = 1
                    features[self.featureSet.getId("span_category" +
                                                   span.get("category"))] = 1
                    features[self.featureSet.getId("span_source" +
                                                   span.get("source"))] = 1

                    if "define_offset" in extra:
                        prevOffset = [
                            int(x) for x in extra["define_offset"].split("-")
                        ]
                        assert len(prevOffset) == 2
                        newOffset = [
                            int(x) for x in span.get("charOffset").split("-")
                        ]
                        assert len(newOffset) == 2
                        prevOffsetRange = abs(prevOffset[0] - prevOffset[1])
                        newOffsetRange = abs(newOffset[0] - newOffset[1])
                        if newOffsetRange > prevOffsetRange:
                            extra["define_offset"] = span.get("charOffset")
                    else:
                        extra["define_offset"] = span.get("charOffset")
                features[self.featureSet.getId("span_count_" + str(
                    features.get(self.featureSet.getId("span_count"), 0)))] = 1

            # chains
            if not self.styles["no_context"]:
                self.buildChains(token, sentenceGraph, features)

            if self.styles["pos_pairs"]:
                self.buildPOSPairs(token, namedEntityHeadTokens, features)

            if self.styles["wordvector"]:
                self.wordVectorFeatureBuilder.setFeatureVector(features)
                self.wordVectorFeatureBuilder.buildFeatures(token)
                self.wordVectorFeatureBuilder.setFeatureVector(None)

            example = (sentenceGraph.getSentenceId() + ".x" +
                       str(exampleIndex), category, features, extra)
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1
            self.exampleStats.endExample()
        #return examples
        return exampleIndex

Пример #13

Показать файл

Файл: ParseGraph.py Проект: sbnlp/2017BioNLPEvaluation

 def stemTokens(self):
     """
     Store a stem on every token held in self.tokensById.

     For each value of self.tokensById, the token's text attribute is
     passed to stemmer.stem() and the result is assigned to the token's
     stem attribute. `stemmer` is a name defined outside this method.
     Returns None.
     """
     allTokens = self.tokensById.values()
     for tok in allTokens:
         stemmed = stemmer.stem(tok.text)
         tok.stem = stemmed

Пример #14

Показать файл

    def buildExamplesFromGraph(self,
                               sentenceGraph,
                               outfile,
                               goldGraph=None,
                               structureAnalyzer=None):
        """
        Build one task 3 (negation/speculation) example for each entity of
        the sentence that is not marked as given, and append it to outfile.

        Parameters:
          sentenceGraph -- graph of the sentence. Its tokens, entities,
              tokenIsName, tokenIsEntityHead, entityHeadTokenByEntity and
              dependencyGraph members are read.
          outfile -- passed through to ExampleUtils.appendExamples.
          goldGraph -- optional graph with gold entities. When given, the
              "speculation" and "negation" classification styles take the
              class from the first gold entity mapped to each entity.
          structureAnalyzer -- not used by this method.

        Returns the number of examples that were appended to outfile.

        Raises ValueError if self.styles["classification"] is not one of
        "multiclass", "speculation" or "negation". The check happens when
        the first entity that is not given is processed.
        """
        exampleIndex = 0

        # Start the sentence with an empty token feature mapping
        # (presumably the cache used by getTokenFeatures -- confirm).
        self.tokenFeatures = {}

        entityToGold = None
        if goldGraph is not None:
            entityToGold = EvaluateInteractionXML.mapEntities(
                sentenceGraph.entities, goldGraph.entities)

        namedEntityCount = 0
        entityCount = 0
        for entity in sentenceGraph.entities:
            if entity.get(
                    "given"
            ) == "True":  # known data which can be used for features
                namedEntityCount += 1
            else:  # every other entity, i.e. a candidate for an example
                entityCount += 1
        namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
        entityCountFeature = "entityCount_" + str(entityCount)

        # Sentence-level bag-of-words counts, shared by all examples.
        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            bagOfWords[text] = bagOfWords.get(text, 0) + 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                bagOfWords[text] = bagOfWords.get(text, 0) + 1
            # NOTE(review): this tests whether the whole tokenIsEntityHead
            # mapping is non-empty, not whether this token heads an entity.
            # Possibly tokenIsEntityHead[token] was intended; left unchanged
            # because altering it would change the generated features.
            if len(sentenceGraph.tokenIsEntityHead) > 0:
                text = "ge_" + text
                bagOfWords[text] = bagOfWords.get(text, 0) + 1

            text = token.get("text")
            if self.styles["speculation_words"] and text in self.specWords:
                specKey = "spec_bow_" + text
                bagOfWords[specKey] = bagOfWords.get(specKey, 0) + 1
                bagOfWords["spec_sentence"] = 1

        # pre-calculate bow _features_
        bowFeatures = {}
        for k, v in bagOfWords.items():
            bowFeatures[self.featureSet.getId(k)] = v

        # Index the dependency edges of every token for the feature code.
        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
            self.inEdgesByToken[token] = inEdges
            outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)

        for entity in sentenceGraph.entities:
            token = sentenceGraph.entityHeadTokenByEntity[entity]
            # Recognize only non-named entities (i.e. interaction words)
            if entity.get("given") == "True":
                continue

            # CLASS
            # NOTE(review): the literal category 1 used below is presumably
            # the id of the negative class in classSet -- confirm.
            classification = self.styles["classification"]
            if classification == "multiclass":
                task3Type = "multiclass"
                categoryName = ""
                if entity.get("negation") == "True":
                    categoryName += "negation"
                if entity.get("speculation") == "True":
                    if categoryName != "":
                        categoryName += "---"
                    categoryName += "speculation"
                if categoryName == "":
                    categoryName = "neg"
                category = self.classSet.getId(categoryName)
            elif classification == "speculation":
                task3Type = "speculation"
                if entity.get("speculation") == "True":
                    category = self.classSet.getId("speculation")
                else:
                    category = 1
                if goldGraph is not None:
                    # The gold annotation overrides the entity's own value
                    goldEntities = entityToGold[entity]
                    if len(goldEntities) > 0 and goldEntities[0].get(
                            "speculation") == "True":
                        category = self.classSet.getId("speculation")
                    else:
                        category = 1
                categoryName = self.classSet.getName(category)
            elif classification == "negation":
                task3Type = "negation"
                if entity.get("negation") == "True":
                    category = self.classSet.getId("negation")
                else:
                    category = 1
                if goldGraph is not None:
                    # The gold annotation overrides the entity's own value
                    goldEntities = entityToGold[entity]
                    if len(goldEntities) > 0 and goldEntities[0].get(
                            "negation") == "True":
                        category = self.classSet.getId("negation")
                    else:
                        category = 1
                categoryName = self.classSet.getName(category)
            else:
                # Without this branch the class variables would be unbound
                # and the next statement would fail with an opaque error.
                raise ValueError("Unsupported classification style: %r" %
                                 (classification, ))
            self.exampleStats.beginExample(categoryName)

            # FEATURES
            features = {}

            # ENTITY TYPE
            #entityType = self.classSet.getId(self.getMergedEntityType(entity))
            #del self.classSet.Ids[self.getMergedEntityType(entity)]
            #IF LOCAL
            # There's a mistake here. The entityType should be the string, not
            # the id of the type. But there's also another issue. getMergedEntityType
            # expects a list, not an item. Therefore the type is always empty ->
            # types don't get used in classification. But this is the code used in
            # the publication, so it will now be published as is, and fixed in a later
            # release.
            #
            # Besides, using the classSet here generates an unneeded
            # additional class, that shows up in evaluations etc. However, to be
            # able to publish the exact models used for the publication experiments,
            # this can't be fixed so it breaks feature id consistency. Therefore I'll
            # now just remove the redundant class id from the classSet.
            #ENDIF
            #features[self.featureSet.getId(entityType)] = 1

            features[self.featureSet.getId(namedEntityCountFeature)] = 1
            features[self.featureSet.getId(entityCountFeature)] = 1
            features.update(bowFeatures)

            # Main features
            text = token.get("text")
            features[self.featureSet.getId("txt_" + text)] = 1
            features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_" + stem)] = 1
            features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1

            if self.styles["speculation_words"]:
                if text in self.specWords:
                    features[self.featureSet.getId("ent_spec")] = 1
                if stem in self.specWordStems:
                    features[self.featureSet.getId("ent_spec_stem")] = 1

            # Linear order features
            # Locate the head token; i keeps the last index if no token
            # compares equal.
            for i, candidate in enumerate(sentenceGraph.tokens):
                if token == candidate:
                    break
            # NOTE(review): "> 0" means the token at position 0 is never
            # used as a neighbour. Possibly ">= 0" was intended; left
            # unchanged because altering it would change the features.
            for index in [-3, -2, -1, 1, 2, 3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index,
                                                  str(index), features)

            # Content
            # An upper case first letter is ignored for the token at
            # position 0 of the sentence.
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j, char in enumerate(text):
                if j > 0 and char.isalpha() and char.isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if char.isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j - 1] == "-":
                        features[self.featureSet.getId(
                            "has_hyphenated_digit")] = 1
                elif char == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif char == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif char == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_" +
                                                   text[j - 1:j +
                                                        1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_" +
                                                   text[j - 2:j +
                                                        1].lower())] = 1

            # Attached edges (Hanging in and out edges)
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
                features[self.featureSet.getId("t1HIn_" +
                                               edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                               edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                               tokenText)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
                features[self.featureSet.getId("t1HOut_" +
                                               edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                               edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                               tokenText)] = 1

            self.buildChains(token, sentenceGraph, features)

            extra = {
                "xtype": "task3",
                "t3type": task3Type,
                "t": token.get("id"),
                "entity": entity.get("id")
            }
            example = (sentenceGraph.getSentenceId() + ".x" +
                       str(exampleIndex), category, features, extra)
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1
            self.exampleStats.endExample()
        return exampleIndex

Пример #15

Показать файл

Файл: ModifierExampleBuilder.py Проект: DUT-LiuYang/TEES

    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
        """
        Build one example for each token of the sentence
        """
        examples = []
        exampleIndex = 0
        
        self.tokenFeatures = {}

        if goldGraph != None:
            entityToGold = EvaluateInteractionXML.mapEntities(sentenceGraph.entities, goldGraph.entities)
        
        namedEntityCount = 0
        entityCount = 0
        for entity in sentenceGraph.entities:
            if entity.get("given") == "True": # known data which can be used for features
                namedEntityCount += 1
            else: # known data which can be used for features
                entityCount += 1
        namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
        entityCountFeature = "entityCount_" + str(entityCount)
        
        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
            if len(sentenceGraph.tokenIsEntityHead) > 0:
                text = "ge_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
            
            text = token.get("text")
            if self.styles["speculation_words"] and text in self.specWords:
                if not bagOfWords.has_key("spec_bow_"+text):
                    bagOfWords["spec_bow_"+text] = 0
                bagOfWords["spec_bow_"+text] += 1
                bagOfWords["spec_sentence"] = 1
        
        bowFeatures = {}
        for k,v in bagOfWords.iteritems():
            bowFeatures[self.featureSet.getId(k)] = v
        
        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
            self.inEdgesByToken[token] = inEdges
            outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)
        
        for entity in sentenceGraph.entities:
            #token = sentenceGraph.tokens[i]
            token = sentenceGraph.entityHeadTokenByEntity[entity]
            # Recognize only non-named entities (i.e. interaction words)
            if entity.get("given") == "True":
                continue
            
            # CLASS
            if self.styles["classification"] == "multiclass":
                task3Type = "multiclass"
                categoryName = ""
                if entity.get("negation") == "True":
                    categoryName += "negation"
                if entity.get("speculation") == "True":
                    if categoryName != "":
                        categoryName += "---"
                    categoryName += "speculation"
                if categoryName == "":
                    categoryName = "neg"
                category = self.classSet.getId(categoryName)  
            elif self.styles["classification"] == "speculation":
                task3Type = "speculation"
                if entity.get("speculation") == "True":
                    category = self.classSet.getId("speculation")
                else:
                    category = 1
                if goldGraph != None:
                    if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("speculation") == "True":
                        category = self.classSet.getId("speculation")
                    else:
                        category = 1
                categoryName = self.classSet.getName(category)
            elif self.styles["classification"] == "negation":
                task3Type = "negation"
                if entity.get("negation") == "True":
                    category = self.classSet.getId("negation")
                else:
                    category = 1
                if goldGraph != None:
                    if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("negation") == "True":
                        category = self.classSet.getId("negation")
                    else:
                        category = 1
                categoryName = self.classSet.getName(category)
            self.exampleStats.beginExample(categoryName)

            # FEATURES
            features = {}

            # ENTITY TYPE
            #entityType = self.classSet.getId(self.getMergedEntityType(entity))
            #del self.classSet.Ids[self.getMergedEntityType(entity)]
#IF LOCAL
            # There's a mistake here. The entityType should be the string, not
            # the id of the type. But there's also another issue. getMergedEntityType
            # expects a list, not an item. Therefore the type is always empty ->
            # types don't get used in classification. But this is the code used in
            # the publication, so it will now be published as is, and fixed in a later
            # release.
            #
            # Besides, using the classSet here generates an unneeded
            # additional class, that shows up in evaluations etc. However, to be 
            # able to publish the exact models used for the publication experiments,
            # this can't be fixed so it breaks feature id consistency. Therefore I'll
            # now just remove the redundant class id from the classSet.
#ENDIF            
            #features[self.featureSet.getId(entityType)] = 1
            
            features[self.featureSet.getId(namedEntityCountFeature)] = 1
            features[self.featureSet.getId(entityCountFeature)] = 1
            #for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # pre-calculate bow _features_
            features.update(bowFeatures)
            
#            for j in range(len(sentenceGraph.tokens)):
#                text = "bow_" + sentenceGraph.tokens[j].get("text")
#                if j < i:
#                    features[self.featureSet.getId("bf_" + text)] = 1
#                elif j > i:
#                    features[self.featureSet.getId("af_" + text)] = 1
        
            # Main features
            text = token.get("text")
            features[self.featureSet.getId("txt_"+text)] = 1
            features[self.featureSet.getId("POS_"+token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_"+stem)] = 1
            features[self.featureSet.getId("nonstem_"+text[len(stem):])] = 1
            
            if self.styles["speculation_words"]:
                if text in self.specWords:
                    features[self.featureSet.getId("ent_spec")] = 1
                if stem in self.specWordStems:
                    features[self.featureSet.getId("ent_spec_stem")] = 1
            
            # Linear order features
            for i in range(len(sentenceGraph.tokens)):
                if token == sentenceGraph.tokens[i]:
                    break
            for index in [-3,-2,-1,1,2,3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)
            
            # Content
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j-1] == "-":
                        features[self.featureSet.getId("has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_"+text[j-1:j+1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_"+text[j-2:j+1].lower())] = 1
            
            # Attached edges (Hanging in and out edges)
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_"+edgeType)] = 1
                features[self.featureSet.getId("t1HIn_"+edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_"+tokenText)] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenText)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_"+edgeType)] = 1
                features[self.featureSet.getId("t1HOut_"+edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_"+tokenText)] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenText)] = 1

            self.buildChains(token, sentenceGraph, features)
             
            extra = {"xtype":"task3","t3type":task3Type,"t":token.get("id"),"entity":entity.get("id")}
            #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
            example = (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1            
            self.exampleStats.endExample()
        #return examples
        return exampleIndex

Пример #16

Показать файл

Файл: TriggerFeatureBuilder.py Проект: DUT-LiuYang/TEES

    def buildFeatures(self, token, linear=True, chains=True):
        """
        Generate the features of a single token and record them through
        self.setFeature.

        The features cover the token's text, POS tag and Porter stem,
        optionally its linear-order neighbours, the character-level content
        of its text (case, digits, separators, character bigrams and
        trigrams) and optionally the chains built by self.buildChains.

        token -- a token of self.sentenceGraph.tokens. An AssertionError is
                 raised if it is not found in that list.
        linear -- if true, call self.buildLinearOrderFeatures for the
                  neighbouring tokens at offsets -3..-1 and 1..3.
        chains -- if true, call self.buildChains for the token.
        """
        sentenceGraph = self.sentenceGraph
        tokens = sentenceGraph.tokens

        # Locate the token in the sentence; the position drives the
        # linear-order and capitalization features below.
        tokenIndex = None
        for position, candidate in enumerate(tokens):
            if token == candidate:
                tokenIndex = position
                break
        assert tokenIndex is not None
        token = tokens[tokenIndex]

        # Set unconditionally (a former check on the "names" style is disabled).
        self.setFeature(self.namedEntityCountFeature, 1)

        # The bag-of-words features (self.bowFeatures) are not merged in here;
        # the original update call was disabled with the remark that those
        # features would not get tagged.

        # Main features
        text = token.get("text")
        self.setFeature("txt_" + text, 1)
        self.setFeature("POS_" + token.get("POS"), 1)
        stem = PorterStemmer.stem(text)
        self.setFeature("stem_" + stem, 1)
        # Whatever part of the text extends beyond the length of the stem.
        self.setFeature("nonstem_" + text[len(stem):], 1)

        # Linear order features
        if linear:
            for offset in (-3, -2, -1, 1, 2, 3):
                neighbourIndex = tokenIndex + offset
                # NOTE(review): the strict lower bound means the token at
                # index 0 is never used as a neighbour. This looks like an
                # off-by-one (">= 0"), but it is kept because changing it would
                # alter the generated feature set -- confirm before fixing.
                if 0 < neighbourIndex < len(tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, neighbourIndex, str(offset))

        # Content
        # The initial capital is not reported for the first token of the
        # sentence (index 0) -- presumably because sentence-initial
        # capitalization carries no information; verify.
        if tokenIndex > 0 and text[0].isalpha() and text[0].isupper():
            self.setFeature("upper_case_start", 1)
        for charIndex, char in enumerate(text):
            if charIndex > 0 and char.isalpha() and char.isupper():
                self.setFeature("upper_case_middle", 1)
            # numbers and special characters
            if char.isdigit():
                self.setFeature("has_digits", 1)
                if charIndex > 0 and text[charIndex - 1] == "-":
                    self.setFeature("has_hyphenated_digit", 1)
            elif char == "-":
                self.setFeature("has_hyphen", 1)
            elif char == "/":
                self.setFeature("has_fslash", 1)
            elif char == "\\":
                self.setFeature("has_bslash", 1)
            # duplets: the current character with its predecessor
            if charIndex > 0:
                self.setFeature("dt_" + text[charIndex - 1:charIndex + 1].lower(), 1)
            # triplets: the current character with its two predecessors
            if charIndex > 1:
                self.setFeature("tt_" + text[charIndex - 2:charIndex + 1].lower(), 1)

        # chains
        if chains:
            self.buildChains(token, sentenceGraph)

Пример #17

Показать файл

Файл: ParseGraph.py Проект: DUT-LiuYang/TEES

 def stemTokens(self):
     """
     Store a stem on every token held in self.tokensById.

     Each token's text attribute is passed to stemmer.stem and the result
     is written to the token's stem attribute.
     """
     allTokens = self.tokensById.values()
     for currentToken in allTokens:
         stemmedText = stemmer.stem(currentToken.text)
         currentToken.stem = stemmedText