Example #1
0
 def getTokenFeatures(self, token, sentenceGraph):
     """
     Return the cached feature dictionary for a token.

     Features are based on the attributes of the token (text, POS tag,
     speculation-word lists and named-entity annotations) and can be
     used to define more complex features.

     token -- the token element whose features are built
     sentenceGraph -- graph providing token text and entity information
     Returns a dict mapping feature-name string -> numeric value.
     """
     # These features are cached when this method is first called
     # for a token.
     if token in self.tokenFeatures:  # 'in' instead of has_key (removed in Python 3)
         return self.tokenFeatures[token]
     tokTxt=sentenceGraph.getTokenText(token)
     features = {}
     features["_txt_"+tokTxt]=1
     features["_POS_"+token.get("POS")]=1
     if self.styles["speculation_words"]:
         # Surface-form and stem matches against the speculation word lists.
         if tokTxt in self.specWords:
             features["_spec"]=1
             features["_spec_"+tokTxt]=1
         tokStem = PorterStemmer.stem(tokTxt)
         if tokStem in self.specWordStems:
             features["_spec_stem"]=1
             features["_spec_stem_"+tokStem]=1
     if sentenceGraph.tokenIsName[token]:
         features["_isName"]=1
         for entity in sentenceGraph.tokenIsEntityHead[token]:
             if entity.get("isName") == "True":
                 features["_annType_"+entity.get("type")]=1
     # NOTE(review): the original iterated self.gazetteer[tokTxt.lower()] with
     # a pass-only body (the feature line was commented out), so the loop had
     # no effect; the dead loop has been removed.
     self.tokenFeatures[token] = features
     return features
 def getTokenFeatures(self, token, sentenceGraph):
     """
     Return the cached feature dictionary for a token.

     Features are based on the attributes of the token (text, POS tag,
     named-entity annotations and, optionally, gazetteer labels) and can
     be used to define more complex features.

     token -- the token element whose features are built
     sentenceGraph -- graph providing token text and entity information
     Returns a dict mapping feature-name string -> numeric value.
     """
     # These features are cached when this method is first called
     # for a token.
     if token in self.tokenFeatures:  # 'in' instead of has_key (removed in Python 3)
         return self.tokenFeatures[token]
     tokTxt=sentenceGraph.getTokenText(token)
     features = {}
     features["_txt_"+tokTxt]=1
     features["_POS_"+token.get("POS")]=1
     if sentenceGraph.tokenIsName[token]:
         features["_isName"]=1
         for entity in sentenceGraph.tokenIsEntityHead[token]:
             if entity.get("isName") == "True":
                 features["_annType_"+entity.get("type")]=1
     # Filip's gazetteer based features (can be used separately from exclude_gazetteer)
     if "gazetteer_features" in self.styles:
         tokTxtLower = tokTxt.lower()
         if "stem_gazetteer" in self.styles:
             tokTxtLower = PorterStemmer.stem(tokTxtLower)
         if self.gazetteer and tokTxtLower in self.gazetteer:
             for label,weight in self.gazetteer[tokTxtLower].items():
                 features["_knownLabel_"+label]=weight # 1 performs slightly worse
     self.tokenFeatures[token] = features
     return features
Example #3
0
    def getGazetteerMatch(self, string):
        """
        Return the gazetteer key that matches string, or None.

        Candidates are tried in order: the (optionally stemmed) string
        itself, the string with all hyphens removed, and the last
        hyphen-separated part. Both hits and misses are cached in
        self.gazMatchCache under the original, unstemmed string.
        """
        if string in self.gazMatchCache:
            return self.gazMatchCache[string]

        cacheKey = string
        if "stem_gazetteer" in self.styles:
            string = PorterStemmer.stem(string)

        candidates = [string]
        if "-" in string:
            candidates.append(string.replace("-", ""))
            candidates.append(string.rsplit("-", 1)[-1])

        for candidate in candidates:
            if candidate in self.gazetteer:
                self.gazMatchCache[cacheKey] = candidate
                return candidate

        self.gazMatchCache[cacheKey] = None
        return None
Example #4
0
    def getTokenFeatures(self, token, sentenceGraph):
        """
        Return the cached feature dictionary for a token.

        Features are based on the attributes of the token (text, POS tag,
        named-entity annotations and, optionally, gazetteer labels) and can
        be used to define more complex features.

        token -- the token element whose features are built
        sentenceGraph -- graph providing token text and entity information
        Returns a dict mapping feature-name string -> numeric value.
        """
        # These features are cached when this method is first called
        # for a token.
        if token in self.tokenFeatures:  # 'in' instead of has_key (removed in Python 3)
            return self.tokenFeatures[token]
        tokTxt = sentenceGraph.getTokenText(token)
        features = {}
        features["_txt_" + tokTxt] = 1

        # An earlier experiment with normalized-text features (special
        # characters stripped, lower-cased, plus the stem) lowered the
        # F-score 69.35 -> 68.22 and was removed.

        features["_POS_" + token.get("POS")] = 1
        if sentenceGraph.tokenIsName[token]:
            features["_isName"] = 1
            for entity in sentenceGraph.tokenIsEntityHead[token]:
                if entity.get("isName") == "True":
                    features["_annType_" + entity.get("type")] = 1
        # Filip's gazetteer based features (can be used separately from exclude_gazetteer)
        if "gazetteer_features" in self.styles:
            tokTxtLower = tokTxt.lower()
            if "stem_gazetteer" in self.styles:
                tokTxtLower = PorterStemmer.stem(tokTxtLower)
            if self.gazetteer and tokTxtLower in self.gazetteer:
                for label, weight in self.gazetteer[tokTxtLower].items():
                    features["_knownLabel_" +
                             label] = weight  # 1 performs slightly worse
        self.tokenFeatures[token] = features
        return features
Example #5
0
 def getGazetteerMatch(self, string):
     """
     Return the gazetteer key that matches string, or None.

     Tries, in order: the (optionally stemmed) string itself, the string
     with hyphens removed, and the last hyphen-separated part. Results,
     including misses, are cached in self.gazMatchCache under the
     original (unstemmed) string.
     """
     if string in self.gazMatchCache:
         return self.gazMatchCache[string]

     cacheKey = string
     if "stem_gazetteer" in self.styles:
         string = PorterStemmer.stem(string)

     match = None
     if string in self.gazetteer:
         match = string
     elif "-" in string:
         noHyphen = string.replace("-", "")
         if noHyphen in self.gazetteer:
             match = noHyphen
         else:
             lastPart = string.rsplit("-", 1)[-1]
             if lastPart in self.gazetteer:
                 match = lastPart

     self.gazMatchCache[cacheKey] = match
     return match
Example #6
0
 def getTokenFeatures(self, token, sentenceGraph):
     """
     Return the cached feature dictionary for a token.

     Features are built from the token's text, POS tag, speculation-word
     lists and named-entity annotations, and can be used to define more
     complex features.

     token -- the token element whose features are built
     sentenceGraph -- graph providing token text and entity information
     Returns a dict mapping feature-name string -> numeric value.
     """
     # These features are cached when this method is first called
     # for a token.
     if token in self.tokenFeatures:  # 'in' instead of has_key (removed in Python 3)
         return self.tokenFeatures[token]
     tokTxt = sentenceGraph.getTokenText(token)
     features = {}
     features["_txt_" + tokTxt] = 1
     features["_POS_" + token.get("POS")] = 1
     if self.styles["speculation_words"]:
         # Surface-form and stem matches against the speculation word lists.
         if tokTxt in self.specWords:
             features["_spec"] = 1
             features["_spec_" + tokTxt] = 1
         tokStem = PorterStemmer.stem(tokTxt)
         if tokStem in self.specWordStems:
             features["_spec_stem"] = 1
             features["_spec_stem_" + tokStem] = 1
     if sentenceGraph.tokenIsName[token]:
         features["_isName"] = 1
         for entity in sentenceGraph.tokenIsEntityHead[token]:
             if entity.get("isName") == "True":
                 features["_annType_" + entity.get("type")] = 1
     # NOTE(review): the original iterated self.gazetteer[tokTxt.lower()] with
     # a pass-only body (the feature line was commented out), so the loop had
     # no effect; the dead loop has been removed.
     self.tokenFeatures[token] = features
     return features
Example #7
0
    def printStats(self):
        """
        Write per-event details of missed events to the file
        "missed-events" and print per-type summary statistics to stderr.
        """
        # Count events by type.
        eventsByType = {}
        for event in self.eventsByOrigId.values():
            eventsByType[event.get("type")] = eventsByType.get(event.get("type"), 0) + 1

        f = open("missed-events", "wt")
        # Group the ids of events with no generated examples by event type.
        missedEvents = {}
        for key in self.examplesByEventOrigId.keys():
            if self.examplesByEventOrigId[key] == 0:
                # 'in' instead of dict.has_key (removed in Python 3)
                if self.eventsByOrigId[key].get("type") not in missedEvents:
                    missedEvents[self.eventsByOrigId[key].get("type")] = []
                missedEvents[self.eventsByOrigId[key].get("type")].append(key)

        for key in sorted(missedEvents.keys()):
            f.write(key + "\n")
            for id in sorted(missedEvents[key]):
                f.write(" " + id + " ")
                if id in self.interSentenceEvents:
                    f.write("intersentence ")
                text = self.headTokensByOrigId[id].get("text").lower()
                if not self.isInGazetteer(text):
                    # BUG FIX: 'stemmed' used to be assigned only inside the
                    # "stem_gazetteer" branch, raising UnboundLocalError on the
                    # write below whenever that style was inactive; default to
                    # the unstemmed text.
                    stemmed = text
                    if "stem_gazetteer" in self.styles:
                        stemmed = PorterStemmer.stem(text)
                    f.write("not-in-gazetteer (" + text + " / " + stemmed + ")")
                f.write("\n")
        f.close()

        # sys.stderr.write(s + "\n") is equivalent to the old
        # 'print >> sys.stderr, s' and keeps this method importable on Python 3.
        sys.stderr.write("Example selection missed events (other, intersentence, non-gazetteer)\n")
        for key in sorted(eventsByType.keys()):
            inter = 0
            other = 0
            nongaz = 0
            if key in missedEvents:
                for id in missedEvents[key]:
                    tokText = self.headTokensByOrigId[id].get("text").lower()
                    if id in self.interSentenceEvents:
                        inter += 1
                    elif not self.isInGazetteer(tokText):
                        nongaz += 1
                    else:
                        other += 1
            if inter == other == nongaz == 0:
                sys.stderr.write(" " + key + " (" + str(eventsByType[key]) + "): missed none\n")
            else:
                sys.stderr.write(" " + key + " (" + str(eventsByType[key]) + "): " + str(other) + ", " + str(inter) + ", " + str(nongaz) + "\n")
        sys.stderr.write("Example generation (total, built/skipped)\n")
        # list() around keys() keeps the concatenation working on Python 3 views.
        for key in sorted(list(set(list(self.skippedByType.keys()) + list(self.builtByType.keys())))):
            string = " " + key + ": (" + str(self.builtByType.get(key, 0) + self.skippedByType.get(key, 0)) + ", " + str(self.builtByType.get(key, 0)) + "/" + str(self.skippedByType.get(key, 0)) + ") ["
            for key2 in sorted(self.skippedByTypeAndReason[key].keys()):
                string += key2 + ":" + str(self.skippedByTypeAndReason[key][key2]) + " "
            string += "]"
            sys.stderr.write(string + "\n")
Example #8
0
def mapSplits(splits, string, stringOffset):
    """
    Maps substrings to a string, and stems them

    Each substring is located left-to-right within string, searching
    onward from the end of the previous match. Returns a list of
    (substring, stem, (offset, length)) tuples.
    NOTE(review): stringOffset is currently unused.
    """
    cursor = 0
    mapped = []
    for part in splits:
        position = string.find(part, cursor)
        assert position != -1
        mapped.append((part, PorterStemmer.stem(part), (position, len(part))))
        cursor = position + len(part)
    return mapped
Example #9
0
def mapSplits(splits, string, stringOffset):
    """
    Maps substrings to a string, and stems them

    Substrings are matched in order, each search starting where the
    previous match ended. Returns (substring, stem, (offset, length))
    tuples. NOTE(review): stringOffset is currently unused.
    """
    searchFrom = 0
    result = []
    for sub in splits:
        at = string.find(sub, searchFrom)
        assert at != -1
        stemmed = PorterStemmer.stem(sub)
        result.append((sub, stemmed, (at, len(sub))))
        searchFrom = at + len(sub)
    return result
Example #10
0
def readWords(words):
    """
    Build a word set and the corresponding stem set.

    words -- either the name of a file containing one word per line,
             or an iterable of words
    Returns the tuple (wordSet, stemSet).
    """
    # BUG FIX: the original called open(filename) on an undefined name;
    # when 'words' is a string it IS the filename.
    if isinstance(words, getattr(types, "StringTypes", (str,))):
        wordSet = set()
        f = open(words)
        for line in f.readlines():
            wordSet.add(line.strip())
        f.close()
    else:  # assume it's a list
        wordSet = set(words)
    stemSet = set()
    for word in wordSet:
        stemSet.add(PorterStemmer.stem(word))
    return wordSet, stemSet
Example #11
0
def readWords(words):
    """
    Read a set of words and compute the set of their stems.

    words -- a filename (string; one word per line) or an iterable of words
    Returns (wordSet, stemSet).
    """
    # BUG FIX: 'filename' was undefined in the original; the string
    # argument 'words' is the file to read.
    if isinstance(words, getattr(types, "StringTypes", (str,))):
        wordSet = set()
        f = open(words)
        for line in f.readlines():
            wordSet.add(line.strip())
        f.close()
    else: # assume it's a list
        wordSet = set(words)
    stemSet = set()
    for word in wordSet:
        stemSet.add(PorterStemmer.stem(word))
    return wordSet, stemSet
Example #12
0
def getTriggers(corpus):
    """
    Returns a dictionary of "entity type"->"entity text"->"count"

    Named entities (isName == "True") are skipped; entity texts are
    stemmed with PorterStemmer before counting.
    """
    corpus = ETUtils.ETFromObj(corpus)
    trigDict = {}
    for entity in corpus.getroot().getiterator("entity"):
        if entity.get("isName") == "True":
            continue
        eType = entity.get("type")
        if eType not in trigDict:  # 'in' instead of has_key (removed in Python 3)
            trigDict[eType] = {}
        eText = entity.get("text")
        eText = PorterStemmer.stem(eText)
        if eText not in trigDict[eType]:
            trigDict[eType][eText] = 0
        trigDict[eType][eText] += 1
    return trigDict
Example #13
0
def getTriggers(corpus):
    """
    Returns a dictionary of "entity type"->"entity text"->"count"

    Only trigger entities are counted (named entities are skipped);
    texts are stemmed before counting.
    """
    corpus = ETUtils.ETFromObj(corpus)
    trigDict = {}
    for entity in corpus.getroot().getiterator("entity"):
        if entity.get("isName") == "True":
            continue
        eType = entity.get("type")
        if eType not in trigDict:  # 'in' instead of has_key (removed in Python 3)
            trigDict[eType] = {}
        eText = entity.get("text")
        eText = PorterStemmer.stem(eText)
        if eText not in trigDict[eType]:
            trigDict[eType][eText] = 0
        trigDict[eType][eText] += 1
    return trigDict
Example #14
0
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
        """
        Build task 3 (speculation/negation) classification examples.

        One example is built for each non-name entity of the sentence and
        appended to outfile; isName entities are skipped.

        sentenceGraph -- sentence graph the examples are built from
        outfile -- open output file passed to ExampleUtils.appendExamples
        goldGraph -- optional gold-standard graph; when given, the example
                     class is taken from the mapped gold entity
        Returns the number of examples built.

        NOTE(review): despite the former summary ("one example for each
        token"), examples are generated per entity -- see the entity loop
        below. This block is Python 2 (has_key, iteritems) and is
        documented as-is because feature/class ids are assigned by getId()
        in statement order, so reordering anything here would silently
        change the generated models.
        """
        examples = []
        exampleIndex = 0
        
        # Per-sentence cache used by getTokenFeatures().
        self.tokenFeatures = {}

        if goldGraph != None:
            entityToGold = EvaluateInteractionXML.mapEntities(sentenceGraph.entities, goldGraph.entities)
        
        # Count named vs. other entities; the counts become whole-sentence
        # features shared by every example of this sentence.
        namedEntityCount = 0
        entityCount = 0
        for entity in sentenceGraph.entities:
            if entity.get("isName") == "True": # known data which can be used for features
                namedEntityCount += 1
            else: # known data which can be used for features
                entityCount += 1
        namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
        entityCountFeature = "entityCount_" + str(entityCount)
        
        # Bag-of-words over sentence tokens, with extra "ne_"-prefixed counts
        # for named-entity tokens and "ge_"-prefixed ones for entity heads.
        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
            # NOTE(review): this tests that the whole tokenIsEntityHead map is
            # non-empty, not that THIS token is an entity head -- verify intent.
            if len(sentenceGraph.tokenIsEntityHead) > 0:
                text = "ge_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
            
            text = token.get("text")
            if self.styles["speculation_words"] and text in self.specWords:
                if not bagOfWords.has_key("spec_bow_"+text):
                    bagOfWords["spec_bow_"+text] = 0
                bagOfWords["spec_bow_"+text] += 1
                bagOfWords["spec_sentence"] = 1
        
        # Convert bag-of-words strings to feature ids once, up front.
        bowFeatures = {}
        for k,v in bagOfWords.iteritems():
            bowFeatures[self.featureSet.getId(k)] = v
        
        # Cache the dependency edges of every token (also used by buildChains).
        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
            self.inEdgesByToken[token] = inEdges
            outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)
        
        for entity in sentenceGraph.entities:
            #token = sentenceGraph.tokens[i]
            token = sentenceGraph.entityHeadTokenByEntity[entity]
            # Recognize only non-named entities (i.e. interaction words)
            if entity.get("isName") == "True":
                continue
            
            # CLASS
            # NOTE(review): if styles["classification"] is none of
            # "multiclass"/"speculation"/"negation", categoryName, category and
            # task3Type stay unbound and the code below raises NameError --
            # presumably the style is validated upstream; confirm.
            if self.styles["classification"] == "multiclass":
                task3Type = "multiclass"
                categoryName = ""
                if entity.get("negation") == "True":
                    categoryName += "negation"
                if entity.get("speculation") == "True":
                    if categoryName != "":
                        categoryName += "---"
                    categoryName += "speculation"
                if categoryName == "":
                    categoryName = "neg"
                category = self.classSet.getId(categoryName)  
            elif self.styles["classification"] == "speculation":
                task3Type = "speculation"
                if entity.get("speculation") == "True":
                    category = self.classSet.getId("speculation")
                else:
                    category = 1
                # The gold class overrides the predicted one when available.
                if goldGraph != None:
                    if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("speculation") == "True":
                        category = self.classSet.getId("speculation")
                    else:
                        category = 1
                categoryName = self.classSet.getName(category)
            elif self.styles["classification"] == "negation":
                task3Type = "negation"
                if entity.get("negation") == "True":
                    category = self.classSet.getId("negation")
                else:
                    category = 1
                # The gold class overrides the predicted one when available.
                if goldGraph != None:
                    if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("negation") == "True":
                        category = self.classSet.getId("negation")
                    else:
                        category = 1
                categoryName = self.classSet.getName(category)
            self.exampleStats.beginExample(categoryName)

            # FEATURES
            features = {}

            # ENTITY TYPE
            #entityType = self.classSet.getId(self.getMergedEntityType(entity))
            #del self.classSet.Ids[self.getMergedEntityType(entity)]
#IF LOCAL
            # There's a mistake here. The entityType should be the string, not
            # the id of the type. But there's also another issue. getMergedEntityType
            # expects a list, not an item. Therefore the type is always empty ->
            # types don't get used in classification. But this is the code used in
            # the publication, so it will now be published as is, and fixed in a later
            # release.
            #
            # Besides, using the classSet here generates an unneeded
            # additional class, that shows up in evaluations etc. However, to be 
            # able to publish the exact models used for the publication experiments,
            # this can't be fixed so it breaks feature id consistency. Therefore I'll
            # now just remove the redundant class id from the classSet.
#ENDIF            
            #features[self.featureSet.getId(entityType)] = 1
            
            features[self.featureSet.getId(namedEntityCountFeature)] = 1
            features[self.featureSet.getId(entityCountFeature)] = 1
            #for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # pre-calculate bow _features_
            features.update(bowFeatures)
            
#            for j in range(len(sentenceGraph.tokens)):
#                text = "bow_" + sentenceGraph.tokens[j].get("text")
#                if j < i:
#                    features[self.featureSet.getId("bf_" + text)] = 1
#                elif j > i:
#                    features[self.featureSet.getId("af_" + text)] = 1
        
            # Main features
            text = token.get("text")
            features[self.featureSet.getId("txt_"+text)] = 1
            features[self.featureSet.getId("POS_"+token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_"+stem)] = 1
            # The suffix that stemming removed from the token text.
            features[self.featureSet.getId("nonstem_"+text[len(stem):])] = 1
            
            if self.styles["speculation_words"]:
                if text in self.specWords:
                    features[self.featureSet.getId("ent_spec")] = 1
                if stem in self.specWordStems:
                    features[self.featureSet.getId("ent_spec_stem")] = 1
            
            # Linear order features
            # Find the linear index i of the entity's head token.
            for i in range(len(sentenceGraph.tokens)):
                if token == sentenceGraph.tokens[i]:
                    break
            for index in [-3,-2,-1,1,2,3]:
                # NOTE(review): "i + index > 0" never builds a feature for
                # position 0; ">= 0" may have been intended -- preserved as-is.
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)
            
            # Content
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j-1] == "-":
                        features[self.featureSet.getId("has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_"+text[j-1:j+1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_"+text[j-2:j+1].lower())] = 1
            
            # Attached edges (Hanging in and out edges)
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_"+edgeType)] = 1
                features[self.featureSet.getId("t1HIn_"+edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_"+tokenText)] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenText)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_"+edgeType)] = 1
                features[self.featureSet.getId("t1HOut_"+edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_"+tokenText)] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenText)] = 1

            self.buildChains(token, sentenceGraph, features)
             
            extra = {"xtype":"task3","t3type":task3Type,"t":token.get("id"),"entity":entity.get("id")}
            #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
            # Examples are streamed to outfile instead of collected in memory.
            example = (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1            
            self.exampleStats.endExample()
        #return examples
        return exampleIndex
Example #15
0
        res = re.sub(r'[^A-Za-z]', " ",
                     txt1)  # Removes numbers and special characters
        res1 = res.lower()  # Lower-case the tokens
        tokens = res1.split()
        doclen = len(tokens)
        doclst.append((index, doclen))
        # tokens = word_tokenize(res1)
        token.extend(tokens)
        # if (len(tokens) > 0):
        # print(tokens)
        # folder = os.listdir(path)

        f_token = [w for w in tokens if w not in stopwords.words('english')]

        #---------STEMS---------#
        ps = PorterStemmer()
        stem = []
        for ft in f_token:
            stem.append(ps.stem(ft, 0, len(ft) - 1))

        stemc = collections.Counter(stem)
        max_tfstem = stemc.most_common(1)
        for key, value in max_tfstem:
            max_tfs = value

        for t, tf in stemc.items():
            dst = [index, tf, max_tfs, doclen]
            slist = []
            if t in ps_stem:
                ps_stem[t].append(dst)
            else:
Example #16
0
    def printStats(self):
        """
        Write per-event details of missed events to the file
        "missed-events" and print per-type summary statistics to stderr.
        """
        # Count events by type.
        eventsByType = {}
        for event in self.eventsByOrigId.values():
            eventsByType[event.get("type")] = eventsByType.get(
                event.get("type"), 0) + 1

        f = open("missed-events", "wt")
        # Group the ids of events with no generated examples by event type.
        missedEvents = {}
        for key in self.examplesByEventOrigId.keys():
            if self.examplesByEventOrigId[key] == 0:
                # 'in' instead of dict.has_key (removed in Python 3)
                if self.eventsByOrigId[key].get("type") not in missedEvents:
                    missedEvents[self.eventsByOrigId[key].get("type")] = []
                missedEvents[self.eventsByOrigId[key].get("type")].append(key)

        for key in sorted(missedEvents.keys()):
            f.write(key + "\n")
            for id in sorted(missedEvents[key]):
                f.write(" " + id + " ")
                if id in self.interSentenceEvents:
                    f.write("intersentence ")
                text = self.headTokensByOrigId[id].get("text").lower()
                if not self.isInGazetteer(text):
                    # BUG FIX: 'stemmed' used to be assigned only inside the
                    # "stem_gazetteer" branch, raising UnboundLocalError on the
                    # write below whenever that style was inactive; default to
                    # the unstemmed text.
                    stemmed = text
                    if "stem_gazetteer" in self.styles:
                        stemmed = PorterStemmer.stem(text)
                    f.write("not-in-gazetteer (" + text + " / " + stemmed +
                            ")")
                f.write("\n")
        f.close()

        # sys.stderr.write(s + "\n") is equivalent to the old
        # 'print >> sys.stderr, s' and keeps this method importable on Python 3.
        sys.stderr.write("Example selection missed events (other, intersentence, non-gazetteer)\n")
        for key in sorted(eventsByType.keys()):
            inter = 0
            other = 0
            nongaz = 0
            if key in missedEvents:
                for id in missedEvents[key]:
                    tokText = self.headTokensByOrigId[id].get("text").lower()
                    if id in self.interSentenceEvents:
                        inter += 1
                    elif not self.isInGazetteer(tokText):
                        nongaz += 1
                    else:
                        other += 1
            if inter == other == nongaz == 0:
                sys.stderr.write(" " + key + " (" + str(eventsByType[key]) +
                                 "): missed none\n")
            else:
                sys.stderr.write(" " + key + " (" + str(eventsByType[key]) +
                                 "): " + str(other) + ", " + str(inter) +
                                 ", " + str(nongaz) + "\n")
        sys.stderr.write("Example generation (total, built/skipped)\n")
        # list() around keys() keeps the concatenation working on Python 3 views.
        for key in sorted(
                list(set(list(self.skippedByType.keys()) +
                         list(self.builtByType.keys())))):
            string = " " + key + ": (" + str(
                self.builtByType.get(key, 0) +
                self.skippedByType.get(key, 0)) + ", " + str(
                    self.builtByType.get(key, 0)) + "/" + str(
                        self.skippedByType.get(key, 0)) + ") ["
            for key2 in sorted(self.skippedByTypeAndReason[key].keys()):
                string += key2 + ":" + str(
                    self.skippedByTypeAndReason[key][key2]) + " "
            string += "]"
            sys.stderr.write(string + "\n")
Example #17
0
    def buildExamples(self, sentenceGraph):
        """
        Build one binary token-classification example per sentence token.

        The class is 1 for tokens that are part of a name
        (sentenceGraph.tokenIsName) and -1 otherwise. Features cover the
        token itself (text, POS, stem), its linear context (two tokens in
        each direction), character content, and attached dependency edges.

        sentenceGraph -- sentence graph the examples are built from
        Returns a list of (exampleId, category, features, extra) tuples.
        """
        examples = []
        exampleIndex = 0

        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]
            # CLASS
            if sentenceGraph.tokenIsName[token]:
                category = 1
            else:
                category = -1

            # FEATURES
            # NOTE(review): featureSet.getId() is assumed to assign ids in
            # call order, so the statement order below is significant and is
            # preserved as-is.
            features = {}
            # Main features
            text = token.attrib["text"]
            features[self.featureSet.getId("txt_" + text)] = 1
            features[self.featureSet.getId("POS_" + token.attrib["POS"])] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_" + stem)] = 1
            # The suffix that stemming removed from the token text.
            features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1
            # Linear order features: text and POS of the two preceding and
            # two following tokens, when they exist.
            if i > 0:
                features[self.featureSet.getId(
                    "linear_-1_txt_" +
                    sentenceGraph.tokens[i - 1].attrib["text"])] = 1
                features[self.featureSet.getId(
                    "linear_-1_POS_" +
                    sentenceGraph.tokens[i - 1].attrib["POS"])] = 1
            if i > 1:
                features[self.featureSet.getId(
                    "linear_-2_txt_" +
                    sentenceGraph.tokens[i - 2].attrib["text"])] = 1
                features[self.featureSet.getId(
                    "linear_-2_POS_" +
                    sentenceGraph.tokens[i - 2].attrib["POS"])] = 1
            if i < len(sentenceGraph.tokens) - 1:
                features[self.featureSet.getId(
                    "linear_+1_txt_" +
                    sentenceGraph.tokens[i + 1].attrib["text"])] = 1
                features[self.featureSet.getId(
                    "linear_+1_POS_" +
                    sentenceGraph.tokens[i + 1].attrib["POS"])] = 1
            if i < len(sentenceGraph.tokens) - 2:
                features[self.featureSet.getId(
                    "linear_+2_txt_" +
                    sentenceGraph.tokens[i + 2].attrib["text"])] = 1
                features[self.featureSet.getId(
                    "linear_+2_POS_" +
                    sentenceGraph.tokens[i + 2].attrib["POS"])] = 1
            # Content
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j - 1] == "-":
                        features[self.featureSet.getId(
                            "has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_" +
                                                   text[j - 1:j +
                                                        1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_" +
                                                   text[j - 2:j +
                                                        1].lower())] = 1
            # Attached edges: type and endpoint (edge[0]/edge[1]) attributes
            # of every dependency touching this token.
            t1InEdges = sentenceGraph.dependencyGraph.in_edges(token)
            for edge in t1InEdges:
                features[self.featureSet.getId("t1HangingIn_" +
                                               edge[2].attrib["type"])] = 1
                features[self.featureSet.getId("t1HangingIn_" +
                                               edge[0].attrib["POS"])] = 1
                features[self.featureSet.getId("t1HangingIn_" +
                                               edge[0].attrib["text"])] = 1
            t1OutEdges = sentenceGraph.dependencyGraph.out_edges(token)
            for edge in t1OutEdges:
                features[self.featureSet.getId("t1HangingOut_" +
                                               edge[2].attrib["type"])] = 1
                features[self.featureSet.getId("t1HangingOut_" +
                                               edge[1].attrib["POS"])] = 1
                features[self.featureSet.getId("t1HangingOut_" +
                                               edge[1].attrib["text"])] = 1

            extra = {"xtype": "token", "t": token}
            examples.append(
                (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex),
                 category, features, extra))
            exampleIndex += 1
        return examples
Example #18
0
def tokTxt(b, e, sNode, stem=False):
    """Return the text of sNode over the inclusive character range [b, e].

    The substring is taken from the node's "text" attribute. When stem is
    True, the substring is passed through the Porter stemmer before being
    returned.
    """
    span = sNode.get("text")[b:e + 1]
    if stem:
        span = PorterStemmer.stem(span)
    return span
# Example #19 (score: 0)
    def buildExamples(self, sentenceGraph, exampleIndex = 0):
        """Build one classification example per non-name token of the sentence.

        A token heading an entity gets category 1, any other non-name token
        gets -1; named-entity tokens are skipped. Returns a list of
        (exampleId, category, features, extra) tuples; exampleIndex gives the
        numeric suffix of the first generated example id.
        """
        examples = []

        # Count the sentence's named-entity tokens once; the count becomes a
        # feature of every example below.
        namedEntityCount = 0
        for tok in sentenceGraph.tokens:
            if sentenceGraph.tokenIsName[tok]:
                namedEntityCount += 1

        for tokenIndex, token in enumerate(sentenceGraph.tokens):
            # Recognize only non-named entities (i.e. interaction words)
            if sentenceGraph.tokenIsName[token]:
                continue

            # CLASS: positive iff this token is the head of an entity
            category = 1 if sentenceGraph.tokenIsEntityHead[token] != None else -1

            # FEATURES
            features = {}
            # Main features: surface form, POS tag and Porter stem
            textUpper = token.get("text")
            text = textUpper.lower()
            features[self.featureSet.getId("txt_" + text)] = 1
            features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_" + stem)] = 1
            features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1
            # Dictionary features for known interaction words
            if text in intWords:
                features[self.featureSet.getId("dict")] = 1
                features[self.featureSet.getId("dict_def_" + wordDict[text])] = 1
            # Named entity count
            features[self.featureSet.getId("neCount")] = namedEntityCount
            # Linear order and character-content features via the entity builder
            self.entityFeatureBuilder.setFeatureVector(features)
            self.entityFeatureBuilder.buildLinearOrderFeatures(tokenIndex, sentenceGraph, 3, 3)
            self.entityFeatureBuilder.buildContentFeatures(tokenIndex, textUpper, duplets=True, triplets=True)
            self.entityFeatureBuilder.setFeatureVector(None)
            # Features of the dependency edges attached to this token
            self.edgeFeatureBuilder.setFeatureVector(features)
            for edge in sentenceGraph.dependencyGraph.in_edges(token):
                self.edgeFeatureBuilder.buildEdgeFeatures(edge, sentenceGraph, "in_", text=True, POS=True, annType=False, maskNames=True)
            for edge in sentenceGraph.dependencyGraph.out_edges(token):
                self.edgeFeatureBuilder.buildEdgeFeatures(edge, sentenceGraph, "out_", text=True, POS=True, annType=False, maskNames=True)
            self.edgeFeatureBuilder.setFeatureVector(None)

            extra = {"xtype":"token","t":token}
            examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
            exampleIndex += 1
        return examples
# Example #20 (score: 0)
def tokTxt(b, e, sNode, stem=False):
    """Slice the "text" attribute of sNode over the inclusive range [b, e].

    With stem=True the slice is Porter-stemmed before it is returned.
    """
    text = sNode.get("text")[b : e + 1]
    return PorterStemmer.stem(text) if stem else text
# Example #21 (score: 0)
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
        """
        Build one example for each token of the sentence.

        Examples are appended to 'outfile' via ExampleUtils rather than
        returned; the return value is the number of examples written
        (0 if the sentence was skipped). 'goldGraph' is accepted for
        interface compatibility but is not used in this method body.
        """
        # Sentences on the skiplist are ignored entirely.
        if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
            print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get(
                "origId")
            return 0  #[]

        #examples = []
        exampleIndex = 0

        # Per-sentence caches, reset for every call.
        self.tokenFeatures = {}
        self.tokenFeatureWeights = {}

        namedEntityHeadTokens = []
        if not self.styles["names"]:
            namedEntityCount = 0
            for entity in sentenceGraph.entities:
                if entity.get(
                        "isName"
                ) == "True":  # known data which can be used for features
                    namedEntityCount += 1
            namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
            # NOTE!!! This will change the number of examples and omit
            # all triggers (positive and negative) from sentences which
            # have no NE:s, possibly giving a too-optimistic performance
            # value. Such sentences can still have triggers from intersentence
            # interactions, but as such events cannot be recovered anyway,
            # looking for these triggers would be pointless.
            if namedEntityCount == 0 and not self.styles[
                    "build_for_nameless"]:  # no names, no need for triggers
                return 0  #[]

            if self.styles["pos_pairs"]:
                namedEntityHeadTokens = self.getNamedEntityHeadTokens(
                    sentenceGraph)

        # Bag-of-words counts over the whole sentence; named-entity tokens are
        # counted a second time under an additional "ne_" prefix.
        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
        bowFeatures = {}
        for k in sorted(bagOfWords.keys()):
            bowFeatures[self.featureSet.getId(k)] = bagOfWords[k]

        # Cache the dependency edges attached to each token so the main loop
        # below does not repeat graph queries.
        # NOTE(review): edge indexing later (edge[0], edge[1], edge[2]) suggests
        # (src, dst, depElement) triples -- confirm against getInEdges.
        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            #inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
            #fixedInEdges = []
            #for edge in inEdges:
            #    fixedInEdges.append( (edge[0], edge[1], edge[2]["element"]) )
            #inEdges = fixedInEdges
            inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
            #inEdges.sort(compareDependencyEdgesById)
            self.inEdgesByToken[token] = inEdges
            #outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True)
            #fixedOutEdges = []
            #for edge in outEdges:
            #    fixedOutEdges.append( (edge[0], edge[1], edge[2]["element"]) )
            #outEdges = fixedOutEdges
            outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
            #outEdges.sort(compareDependencyEdgesById)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)

        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]

            # CLASS
            if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
                categoryName, entityIds = self.getMergedEntityType(
                    sentenceGraph.tokenIsEntityHead[token])
            else:
                categoryName, entityIds = "neg", None
            self.exampleStats.beginExample(categoryName)

            # Recognize only non-named entities (i.e. interaction words)
            if sentenceGraph.tokenIsName[token] and not self.styles[
                    "names"] and not self.styles["all_tokens"]:
                self.exampleStats.filter("name")
                self.exampleStats.endExample()
                continue
#            if "selftrain_limits" in self.styles:
#                # any predicted entity not part of the self-training set causes example to be rejected
#                filtered = False
#                for entity in sentenceGraph.tokenIsEntityHead[token]:
#                    if entity.get("selftrain") == "False":
#                        self.exampleStats.filter("selftrain_limits")
#                        self.exampleStats.endExample()
#                        filtered = True
#                        break
#                if filtered:
#                    continue
#            if "selftrain_group" in self.styles:
#                # any predicted entity not part of the self-training set causes example to be rejected
#                filtered = False
#                for entity in sentenceGraph.tokenIsEntityHead[token]:
#                    if entity.get("selftraingroup") not in self.selfTrainGroups:
#                        self.exampleStats.filter("selftrain_group")
#                        self.exampleStats.endExample()
#                        filtered = True
#                        break
#                if filtered:
#                    continue
            if self.styles["pos_only"] and categoryName == "neg":
                self.exampleStats.filter("pos_only")
                self.exampleStats.endExample()
                continue

            category = self.classSet.getId(categoryName)

            # NOTE(review): tokenText is only read by the commented-out
            # gazetteer code below and is later reassigned in the edge loops.
            tokenText = token.get("text").lower()
            #            if "stem_gazetteer" in self.styles:
            #                tokenText = PorterStemmer.stem(tokenText)
            #            if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer:
            #                features = {}
            #                features[self.featureSet.getId("exclude_gazetteer")] = 1
            #                extra = {"xtype":"token","t":token.get("id"),"excluded":"True"}
            #                if entityIds != None:
            #                    extra["goldIds"] = entityIds
            #                #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
            #                ExampleUtils.appendExamples([(sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)], outfile)
            #                exampleIndex += 1
            #                continue

            # FEATURES
            features = {}

            if not self.styles["names"]:
                features[self.featureSet.getId(namedEntityCountFeature)] = 1
            #for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # pre-calculate bow _features_
            features.update(bowFeatures)

            #            for j in range(len(sentenceGraph.tokens)):
            #                text = "bow_" + sentenceGraph.tokens[j].get("text")
            #                if j < i:
            #                    features[self.featureSet.getId("bf_" + text)] = 1
            #                elif j > i:
            #                    features[self.featureSet.getId("af_" + text)] = 1

            # Main features: surface form, POS tag and Porter stem
            text = token.get("text")
            features[self.featureSet.getId("txt_" + text)] = 1
            features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_" + stem)] = 1
            features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1

            # Normalized versions of the string (if same as non-normalized, overlap without effect)
            normalizedText = text.replace("-", "").replace("/", "").replace(
                ",", "").replace("\\", "").replace(" ", "").lower()
            if normalizedText == "bound":  # should be for all irregular verbs
                normalizedText = "bind"
            features[self.featureSet.getId("txt_" + normalizedText)] = 1
            norStem = PorterStemmer.stem(normalizedText)
            features[self.featureSet.getId("stem_" + norStem)] = 1
            features[self.featureSet.getId("nonstem_" +
                                           normalizedText[len(norStem):])] = 1

            ## Subspan features
            #textLower = text.lower()
            #for i in range(1, len(textLower)):
            #    features[self.featureSet.getId("subspanbegin"+str(i)+"_"+textLower[0:i])] = 1
            #    features[self.featureSet.getId("subspanend"+str(i)+"_"+textLower[-i:])] = 1

            # Substring features: each hyphen-separated piece and its stem
            for string in text.split("-"):
                stringLower = string.lower()
                features[self.featureSet.getId("substring_" + stringLower)] = 1
                features[self.featureSet.getId(
                    "substringstem_" + PorterStemmer.stem(stringLower))] = 1

            # Linear order features for a window of 3 tokens on each side
            for index in [-3, -2, -1, 1, 2, 3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index,
                                                  str(index), features)

            # Linear n-grams
            if self.styles["linear_ngrams"]:
                self.buildLinearNGram(max(0, i - 1), i, sentenceGraph,
                                      features)
                self.buildLinearNGram(max(0, i - 2), i, sentenceGraph,
                                      features)

            if self.styles["phospho"]:
                if text.find("hospho") != -1:
                    features[self.featureSet.getId("phospho_found")] = 1
                features[self.featureSet.getId("begin_" +
                                               text[0:2].lower())] = 1
                features[self.featureSet.getId("begin_" +
                                               text[0:3].lower())] = 1

            if self.styles["bb_features"]:
                if text.lower() in self.bacteriaTokens:
                    features[self.featureSet.getId("lpsnBacToken")] = 1

            # Content
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j - 1] == "-":
                        features[self.featureSet.getId(
                            "has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_" +
                                                   text[j - 1:j +
                                                        1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_" +
                                                   text[j - 2:j +
                                                        1].lower())] = 1
                # quadruplets (don't work, slight decrease (0.5 pp) on f-score
                #if j > 2:
                #    features[self.featureSet.getId("qt_"+text[j-3:j+1].lower())] = 1

            # Attached edges (Hanging in and out edges)
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
                features[self.featureSet.getId("t1HIn_" +
                                               edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                               edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                               tokenText)] = 1
                tokenStem = PorterStemmer.stem(tokenText)
                features[self.featureSet.getId("t1HIn_" + tokenStem)] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                               tokenStem)] = 1
                features[self.featureSet.getId("t1HIn_" + norStem + "_" +
                                               edgeType + "_" + tokenStem)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
                features[self.featureSet.getId("t1HOut_" +
                                               edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                               edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                               tokenText)] = 1
                tokenStem = PorterStemmer.stem(tokenText)
                features[self.featureSet.getId("t1HOut_" + tokenStem)] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                               tokenStem)] = 1
                features[self.featureSet.getId("t1HOut_" + norStem + "_" +
                                               edgeType + "_" + tokenStem)] = 1

            # REL features
            if self.styles["rel_features"]:
                self.relFeatureBuilder.setFeatureVector(features)
                self.relFeatureBuilder.buildAllFeatures(
                    sentenceGraph.tokens, i)
                self.relFeatureBuilder.setFeatureVector(None)

            #self.wordNetFeatureBuilder.getTokenFeatures("show", "VBP")
            #tokTxt = token.get("text")
            #tokPOS = token.get("POS")
            #wordNetFeatures = []
            #wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            #self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            if self.styles["wordnet"]:
                tokTxt = token.get("text")
                tokPOS = token.get("POS")
                wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(
                    tokTxt, tokPOS)
                for wordNetFeature in wordNetFeatures:
                    #print wordNetFeature,
                    features[self.featureSet.getId("WN_" + wordNetFeature)] = 1
                #print

            if self.styles["giuliano"]:
                self.giulianoFeatureBuilder.setFeatureVector(features)
                self.giulianoFeatureBuilder.buildTriggerFeatures(
                    token, sentenceGraph)
                self.giulianoFeatureBuilder.setFeatureVector(None)

            extra = {"xtype": "token", "t": token.get("id")}
            if self.styles["bb_features"]:
                extra[
                    "trigex"] = "bb"  # Request trigger extension in ExampleWriter
            if self.styles["epi_merge_negated"]:
                extra["unmergeneg"] = "epi"  # Request trigger type unmerging
            if entityIds != None:
                extra[
                    "goldIds"] = entityIds  # The entities to which this example corresponds
            #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )

            # chains
            self.buildChains(token, sentenceGraph, features)

            if self.styles["pos_pairs"]:
                self.buildPOSPairs(token, namedEntityHeadTokens, features)

            # Examples are streamed to outfile rather than collected in memory.
            example = (sentenceGraph.getSentenceId() + ".x" +
                       str(exampleIndex), category, features, extra)
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1
            self.exampleStats.endExample()
        #return examples
        return exampleIndex
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
        """
        Build one example for each token of the sentence
        """       
        if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
            print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get("origId") 
            return 0 #[]
        
        #examples = []
        exampleIndex = 0
        
        self.tokenFeatures = {}
        self.tokenFeatureWeights = {}
        
        namedEntityHeadTokens = []
        if not self.styles["names"]:
            namedEntityCount = 0
            for entity in sentenceGraph.entities:
                if entity.get("isName") == "True": # known data which can be used for features
                    namedEntityCount += 1
            namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
            # NOTE!!! This will change the number of examples and omit
            # all triggers (positive and negative) from sentences which
            # have no NE:s, possibly giving a too-optimistic performance
            # value. Such sentences can still have triggers from intersentence
            # interactions, but as such events cannot be recovered anyway,
            # looking for these triggers would be pointless.
            if namedEntityCount == 0 and not self.styles["build_for_nameless"]: # no names, no need for triggers
                return 0 #[]
            
            if self.styles["pos_pairs"]:
                namedEntityHeadTokens = self.getNamedEntityHeadTokens(sentenceGraph)
        
        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
        bowFeatures = {}
        for k in sorted(bagOfWords.keys()):
            bowFeatures[self.featureSet.getId(k)] = bagOfWords[k]
        
        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            #inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
            #fixedInEdges = []
            #for edge in inEdges:
            #    fixedInEdges.append( (edge[0], edge[1], edge[2]["element"]) )
            #inEdges = fixedInEdges
            inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
            #inEdges.sort(compareDependencyEdgesById)
            self.inEdgesByToken[token] = inEdges
            #outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True)
            #fixedOutEdges = []
            #for edge in outEdges:
            #    fixedOutEdges.append( (edge[0], edge[1], edge[2]["element"]) )
            #outEdges = fixedOutEdges
            outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
            #outEdges.sort(compareDependencyEdgesById)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)
        
        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]

            # CLASS
            if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
                categoryName, entityIds = self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token])
            else:
                categoryName, entityIds = "neg", None
            self.exampleStats.beginExample(categoryName)
            
            # Recognize only non-named entities (i.e. interaction words)
            if sentenceGraph.tokenIsName[token] and not self.styles["names"] and not self.styles["all_tokens"]:
                self.exampleStats.filter("name")
                self.exampleStats.endExample()
                continue
#            if "selftrain_limits" in self.styles:
#                # any predicted entity not part of the self-training set causes example to be rejected
#                filtered = False
#                for entity in sentenceGraph.tokenIsEntityHead[token]:
#                    if entity.get("selftrain") == "False":
#                        self.exampleStats.filter("selftrain_limits")
#                        self.exampleStats.endExample()
#                        filtered = True
#                        break
#                if filtered:
#                    continue
#            if "selftrain_group" in self.styles:
#                # any predicted entity not part of the self-training set causes example to be rejected
#                filtered = False
#                for entity in sentenceGraph.tokenIsEntityHead[token]:
#                    if entity.get("selftraingroup") not in self.selfTrainGroups:
#                        self.exampleStats.filter("selftrain_group")
#                        self.exampleStats.endExample()
#                        filtered = True
#                        break
#                if filtered:
#                    continue
            if self.styles["pos_only"] and categoryName == "neg":
                self.exampleStats.filter("pos_only")
                self.exampleStats.endExample()
                continue

            category = self.classSet.getId(categoryName)            
            
            tokenText = token.get("text").lower()
#            if "stem_gazetteer" in self.styles:
#                tokenText = PorterStemmer.stem(tokenText)
#            if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer:
#                features = {}
#                features[self.featureSet.getId("exclude_gazetteer")] = 1
#                extra = {"xtype":"token","t":token.get("id"),"excluded":"True"}
#                if entityIds != None:
#                    extra["goldIds"] = entityIds
#                #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
#                ExampleUtils.appendExamples([(sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)], outfile)
#                exampleIndex += 1
#                continue
            
            # FEATURES
            features = {}
            
            if not self.styles["names"]:
                features[self.featureSet.getId(namedEntityCountFeature)] = 1
            #for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # pre-calculate bow _features_
            features.update(bowFeatures)
            
#            for j in range(len(sentenceGraph.tokens)):
#                text = "bow_" + sentenceGraph.tokens[j].get("text")
#                if j < i:
#                    features[self.featureSet.getId("bf_" + text)] = 1
#                elif j > i:
#                    features[self.featureSet.getId("af_" + text)] = 1
        
            # Main features
            text = token.get("text")
            features[self.featureSet.getId("txt_"+text)] = 1
            features[self.featureSet.getId("POS_"+token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_"+stem)] = 1
            features[self.featureSet.getId("nonstem_"+text[len(stem):])] = 1

            # Normalized versions of the string (if same as non-normalized, overlap without effect)
            normalizedText = text.replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower()
            if normalizedText == "bound": # should be for all irregular verbs
                normalizedText = "bind"
            features[self.featureSet.getId("txt_"+normalizedText)] = 1
            norStem = PorterStemmer.stem(normalizedText)
            features[self.featureSet.getId("stem_"+norStem)] = 1
            features[self.featureSet.getId("nonstem_"+normalizedText[len(norStem):])] = 1
            
            ## Subspan features
            #textLower = text.lower()
            #for i in range(1, len(textLower)):
            #    features[self.featureSet.getId("subspanbegin"+str(i)+"_"+textLower[0:i])] = 1
            #    features[self.featureSet.getId("subspanend"+str(i)+"_"+textLower[-i:])] = 1
            
            # Substring features
            for string in text.split("-"):
                stringLower = string.lower()
                features[self.featureSet.getId("substring_"+stringLower)] = 1
                features[self.featureSet.getId("substringstem_"+PorterStemmer.stem(stringLower))] = 1
            
            # Linear order features
            for index in [-3,-2,-1,1,2,3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)

            # Linear n-grams
            if self.styles["linear_ngrams"]:
                self.buildLinearNGram(max(0, i-1), i, sentenceGraph, features)
                self.buildLinearNGram(max(0, i-2), i, sentenceGraph, features)
            
            if self.styles["phospho"]:
                if text.find("hospho") != -1:
                    features[self.featureSet.getId("phospho_found")] = 1
                features[self.featureSet.getId("begin_"+text[0:2].lower())] = 1
                features[self.featureSet.getId("begin_"+text[0:3].lower())] = 1
                
            if self.styles["bb_features"]:
                if text.lower() in self.bacteriaTokens:
                    features[self.featureSet.getId("lpsnBacToken")] = 1

            # Content
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j-1] == "-":
                        features[self.featureSet.getId("has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_"+text[j-1:j+1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_"+text[j-2:j+1].lower())] = 1
                # quadruplets (don't work, slight decrease (0.5 pp) on f-score
                #if j > 2:
                #    features[self.featureSet.getId("qt_"+text[j-3:j+1].lower())] = 1
            
            # Attached edges (Hanging in and out edges)
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_"+edgeType)] = 1
                features[self.featureSet.getId("t1HIn_"+edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_"+tokenText)] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenText)] = 1
                tokenStem = PorterStemmer.stem(tokenText)
                features[self.featureSet.getId("t1HIn_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HIn_"+norStem+"_"+edgeType+"_"+tokenStem)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_"+edgeType)] = 1
                features[self.featureSet.getId("t1HOut_"+edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_"+tokenText)] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenText)] = 1
                tokenStem = PorterStemmer.stem(tokenText)
                features[self.featureSet.getId("t1HOut_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HOut_"+norStem+"_"+edgeType+"_"+tokenStem)] = 1
            
            # REL features
            if self.styles["rel_features"]:
                self.relFeatureBuilder.setFeatureVector(features)
                self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, i)
                self.relFeatureBuilder.setFeatureVector(None)
            
            #self.wordNetFeatureBuilder.getTokenFeatures("show", "VBP")
            #tokTxt = token.get("text")
            #tokPOS = token.get("POS")
            #wordNetFeatures = []
            #wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            #self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            if self.styles["wordnet"]:
                tokTxt = token.get("text")
                tokPOS = token.get("POS")
                wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
                for wordNetFeature in wordNetFeatures:
                    #print wordNetFeature,
                    features[self.featureSet.getId("WN_"+wordNetFeature)] = 1
                #print
            
            if self.styles["giuliano"]:
                self.giulianoFeatureBuilder.setFeatureVector(features)
                self.giulianoFeatureBuilder.buildTriggerFeatures(token, sentenceGraph)
                self.giulianoFeatureBuilder.setFeatureVector(None)
                             
            extra = {"xtype":"token","t":token.get("id")}
            if self.styles["bb_features"]:
                extra["trigex"] = "bb" # Request trigger extension in ExampleWriter
            if self.styles["epi_merge_negated"]:
                extra["unmergeneg"] = "epi" # Request trigger type unmerging
            if entityIds != None:
                extra["goldIds"] = entityIds # The entities to which this example corresponds
            #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
            
            # chains
            self.buildChains(token, sentenceGraph, features)
            
            if self.styles["pos_pairs"]:
                self.buildPOSPairs(token, namedEntityHeadTokens, features)
            
            example = (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1
            self.exampleStats.endExample()
        #return examples
        return exampleIndex
Пример #23
0
 def stemTokens(self):
     """Compute and attach a Porter stem to every token of this sentence."""
     # Annotates the token objects in place; iteration order is irrelevant.
     for tok in self.tokensById.values():
         tok.stem = stemmer.stem(tok.text)
Пример #24
0
    def buildExamplesInner(self, sentenceGraph, goldGraph):
        """
        Build one example for each token of the sentence.

        Tokens of the predicted graph are aligned with the optional gold
        graph by position/character offset, and each example's class is
        taken from the gold entities whose head is that token (class 1 is
        the negative class). Returns a list of
        (exampleId, category, features, extra) tuples.
        """
        # Sentences on the skiplist produce no examples at all.
        if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
            print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get(
                "origId")
            return []

        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)

        # Shortest dependency paths are computed on an undirected view of the
        # dependency graph (NX10 is presumably a NetworkX 1.x wrapper -- TODO
        # confirm).
        undirected = self.nxMultiDiGraphToUndirected(
            sentenceGraph.dependencyGraph)
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        # Get argument order
        self.interactionLengths = self.getInteractionEdgeLengths(
            sentenceGraph, paths)
        self.interactionLengths = self.interactionLengths.values()
        self.interactionLengths.sort(compareInteractionPrecedence)
        # Map tokens to entities: predicted and gold tokens are matched by
        # position, and their ids/offsets are asserted to agree.
        tokenByOffset = {}
        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]
            if goldGraph != None:
                goldToken = goldGraph.tokens[i]
                assert token.get("id") == goldToken.get("id") and token.get(
                    "charOffset") == goldToken.get("charOffset")
            tokenByOffset[token.get("charOffset")] = token.get("id")
        # Map gold entities to their head offsets
        goldEntitiesByOffset = {}
        for token in sentenceGraph.tokens:
            goldEntitiesByOffset[token.get("charOffset")] = []
        entityToGold = {}
        for entity in sentenceGraph.entities:
            entityToGold[entity] = []
        if goldGraph != None:
            for entity in goldGraph.entities:
                offset = entity.get("headOffset")
                assert offset != None
                goldEntitiesByOffset[offset].append(entity)
            # Map predicted entities to gold entities (same head offset and
            # same type).
            for entity in sentenceGraph.entities:
                eType = entity.get("type")
                eOffset = entity.get("headOffset")
                for goldEntity in goldEntitiesByOffset[eOffset]:
                    if goldEntity.get("type") == eType:
                        entityToGold[entity].append(goldEntity)
        # Map entities to interactions
        #interactionsByEntityId = {}
        #for entity in sentenceGraph.entities:
        #    interactionsByEntityId[entity.get("id")] = []
        # Map tokens to interactions: each non-"neg" interaction is filed
        # under the head token of its e1 entity.
        interactionsByToken = {}
        for token in sentenceGraph.tokens:
            interactionsByToken[token] = []
        for interactionTuple in self.interactionLengths:
            interaction = interactionTuple[0]
            if interaction.get("type") == "neg":
                continue
            e1Id = interaction.get("e1")
            token = sentenceGraph.entityHeadTokenByEntity[
                sentenceGraph.entitiesById[e1Id]]
            interactionsByToken[token].append(interaction)

        examples = []
        exampleIndex = 0

        # Per-sentence cache used by getTokenFeatures.
        self.tokenFeatures = {}

        #namedEntityNorStrings = set()
        namedEntityHeadTokens = []
        if not "names" in self.styles:
            # Count the known named entities; the count itself is a feature.
            namedEntityCount = 0
            for entity in sentenceGraph.entities:
                if entity.get(
                        "isName"
                ) == "True":  # known data which can be used for features
                    namedEntityCount += 1
                    #namedEntityNorStrings.add( entity.get("text").replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower() )
            namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
            #if namedEntityCount == 0: # no names, no need for triggers
            #    return []

            if "pos_pairs" in self.styles:
                namedEntityHeadTokens = self.getNamedEntityHeadTokens(
                    sentenceGraph)

        #neFeatures = {} # F: 69.35 -> 69.14
        #for norString in namedEntityNorStrings:
        #    neFeatures[self.featureSet.getId("norNE_" + norString)] = 1

        # Sentence-level bag-of-words counts; named-entity tokens are also
        # counted separately under an "ne_" prefix.
        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
        bowFeatures = {}
        for k, v in bagOfWords.iteritems():
            bowFeatures[self.featureSet.getId(k)] = v

        # Precompute sorted in/out dependency edges for each token; edge data
        # elements are unpacked into (u, v, element) triples.
        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
            fixedInEdges = []
            for edge in inEdges:
                fixedInEdges.append((edge[0], edge[1], edge[2]["element"]))
            inEdges = fixedInEdges
            inEdges.sort(compareDependencyEdgesById)
            self.inEdgesByToken[token] = inEdges
            outEdges = sentenceGraph.dependencyGraph.out_edges(token,
                                                               data=True)
            fixedOutEdges = []
            for edge in outEdges:
                fixedOutEdges.append((edge[0], edge[1], edge[2]["element"]))
            outEdges = fixedOutEdges
            outEdges.sort(compareDependencyEdgesById)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)

        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]
            # Recognize only non-named entities (i.e. interaction words)
            if sentenceGraph.tokenIsName[
                    token] and not "names" in self.styles and not "all_tokens" in self.styles:
                continue

            # CLASS
            #if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
            #    category = self.classSet.getId(self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token]))
            #else:
            #    category = 1
            # The class comes from the gold entities headed at this token's
            # offset; 1 is the negative class.
            offset = token.get("charOffset")
            if len(goldEntitiesByOffset[offset]) > 0:
                category = self.classSet.getId(
                    self.getMergedEntityType(goldEntitiesByOffset[offset]))
            else:
                category = 1

            tokenText = token.get("text").lower()
            if "stem_gazetteer" in self.styles:
                tokenText = PorterStemmer.stem(tokenText)
            # Tokens outside the gazetteer can be short-circuited into
            # minimal "excluded" examples.
            if ("exclude_gazetteer" in self.styles
                ) and self.gazetteer and tokenText not in self.gazetteer:
                features = {}
                features[self.featureSet.getId("exclude_gazetteer")] = 1
                extra = {
                    "xtype": "token",
                    "t": token.get("id"),
                    "excluded": "True"
                }
                examples.append(
                    (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex),
                     category, features, extra))
                exampleIndex += 1
                continue

            # FEATURES
            features = {}
            self.features = features

            if not "names" in self.styles:
                features[self.featureSet.getId(namedEntityCountFeature)] = 1
            #for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # pre-calculate bow _features_
            features.update(bowFeatures)
            #features.update(neFeatures)

            #            for j in range(len(sentenceGraph.tokens)):
            #                text = "bow_" + sentenceGraph.tokens[j].get("text")
            #                if j < i:
            #                    features[self.featureSet.getId("bf_" + text)] = 1
            #                elif j > i:
            #                    features[self.featureSet.getId("af_" + text)] = 1

            # Main features: surface form, POS, stem and the suffix the
            # stemmer stripped off.
            text = token.get("text")
            features[self.featureSet.getId("txt_" + text)] = 1
            features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_" + stem)] = 1
            features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1

            # Normalized versions of the string (if same as non-normalized, overlap without effect)
            normalizedText = text.replace("-", "").replace("/", "").replace(
                ",", "").replace("\\", "").replace(" ", "").lower()
            if normalizedText == "bound":  # should be for all irregular verbs
                normalizedText = "bind"
            features[self.featureSet.getId("txt_" + normalizedText)] = 1
            norStem = PorterStemmer.stem(normalizedText)
            features[self.featureSet.getId("stem_" + norStem)] = 1
            features[self.featureSet.getId("nonstem_" +
                                           normalizedText[len(norStem):])] = 1

            if "gazetteer_features_maintoken" in self.styles:
                tokTxtLower = text.lower()
                if "stem_gazetteer" in self.styles:
                    tokTxtLower = PorterStemmer.stem(tokTxtLower)
                if self.gazetteer and tokTxtLower in self.gazetteer:
                    for label, weight in self.gazetteer[tokTxtLower].items():
                        features[self.featureSet.getId(
                            "gaz_knownLabel_" +
                            label)] = weight  # 1 performs slightly worse

            # Linear order features
            #for index in [-3,-2,-1,1,2,3,4,5]: # 69.35 -> 68.97
            for index in [-3, -2, -1, 1, 2, 3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index,
                                                  str(index), features)

            # Content: capitalization, digits, special characters and
            # character n-grams.
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j - 1] == "-":
                        features[self.featureSet.getId(
                            "has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_" +
                                                   text[j - 1:j +
                                                        1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_" +
                                                   text[j - 2:j +
                                                        1].lower())] = 1

            # Attached edges (Hanging in and out edges)
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
                features[self.featureSet.getId("t1HIn_" +
                                               edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                               edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                               tokenText)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
                features[self.featureSet.getId("t1HOut_" +
                                               edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                               edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                               tokenText)] = 1

            extra = {"xtype": "token", "t": token.get("id")}
            examples.append(
                (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex),
                 category, features, extra))
            exampleIndex += 1

            # chains
            self.buildChains(token, sentenceGraph, features)

            if "pos_pairs" in self.styles:
                self.buildPOSPairs(token, namedEntityHeadTokens, features)

            # NOTE(review): the chain/POS-pair/prediction builders above and
            # below mutate the same 'features' dict that was already appended
            # to 'examples', so their features still end up in the example.
            self.buildPredictionFeatures(sentenceGraph, paths, token,
                                         interactionsByToken[token])
        return examples
Пример #25
0
    30)  #Identifies the 30 most frequent tokens
for i in freq:
    print(i)

print("Average number of word tokens per document:", round(count / len(files)))

stop = datetime.now()  # Code Execution End Time
print("Time the program took to acquire the text characteristics:")
print(stop - start)

##----------------STEMMING----------------------##

k = 0
c = 0
pslist = []
ps = PorterStemmer()  # Uses PorterStemmer class defined in Stemming.py
for i in token:
    pslist.append(ps.stem(i, 0, len(i) - 1))
    c += 1
#print(set(pslist)) # Identifies the unique stems

print("##----------------STEMMING---------------------##")
print("Number of distinct stems in the Cranfield text collection:",
      len(set(pslist)))

w2 = Counter(pslist)  # Identifies the stems which occured only once
for i in pslist:
    if w2[i] == 1:
        once = 1
        k += 1
    else:
Пример #26
0
 def buildExamples(self, sentenceGraph):
     """
     Build one binary example per token: category 1 when the token is part
     of a known name, -1 otherwise. Features cover the token's surface
     form, POS tag, stem, linear context, character patterns and attached
     dependency edges.
     """
     examples = []
     getId = self.featureSet.getId
     tokenCount = len(sentenceGraph.tokens)
     for exampleIndex, token in enumerate(sentenceGraph.tokens):
         # CLASS: named-entity tokens form the positive class
         category = 1 if sentenceGraph.tokenIsName[token] else -1

         # FEATURES
         features = {}
         # Main features: surface form, POS tag, stem and stripped suffix
         text = token.attrib["text"]
         features[getId("txt_" + text)] = 1
         features[getId("POS_" + token.attrib["POS"])] = 1
         stem = PorterStemmer.stem(text)
         features[getId("stem_" + stem)] = 1
         features[getId("nonstem_" + text[len(stem):])] = 1
         # Linear order features for the neighbouring tokens
         for offset in (-1, -2, 1, 2):
             neighbourIndex = exampleIndex + offset
             if 0 <= neighbourIndex < tokenCount:
                 neighbour = sentenceGraph.tokens[neighbourIndex]
                 prefix = "linear_%+d_" % offset
                 features[getId(prefix + "txt_" + neighbour.attrib["text"])] = 1
                 features[getId(prefix + "POS_" + neighbour.attrib["POS"])] = 1
         # Content: capitalization, digits, special characters, n-grams
         if exampleIndex > 0 and text[0].isalpha() and text[0].isupper():
             features[getId("upper_case_start")] = 1
         for j, char in enumerate(text):
             if j > 0 and char.isalpha() and char.isupper():
                 features[getId("upper_case_middle")] = 1
             # numbers and special characters
             if char.isdigit():
                 features[getId("has_digits")] = 1
                 if j > 0 and text[j - 1] == "-":
                     features[getId("has_hyphenated_digit")] = 1
             elif char == "-":
                 features[getId("has_hyphen")] = 1
             elif char == "/":
                 features[getId("has_fslash")] = 1
             elif char == "\\":
                 features[getId("has_bslash")] = 1
             # character duplets and triplets
             if j > 0:
                 features[getId("dt_" + text[j - 1:j + 1].lower())] = 1
             if j > 1:
                 features[getId("tt_" + text[j - 2:j + 1].lower())] = 1
         # Dependency edges hanging in and out of this token
         for edge in sentenceGraph.dependencyGraph.in_edges(token):
             features[getId("t1HangingIn_" + edge[2].attrib["type"])] = 1
             features[getId("t1HangingIn_" + edge[0].attrib["POS"])] = 1
             features[getId("t1HangingIn_" + edge[0].attrib["text"])] = 1
         for edge in sentenceGraph.dependencyGraph.out_edges(token):
             features[getId("t1HangingOut_" + edge[2].attrib["type"])] = 1
             features[getId("t1HangingOut_" + edge[1].attrib["POS"])] = 1
             features[getId("t1HangingOut_" + edge[1].attrib["text"])] = 1

         extra = {"xtype": "token", "t": token}
         exampleId = sentenceGraph.getSentenceId() + ".x" + str(exampleIndex)
         examples.append((exampleId, category, features, extra))
     return examples
Пример #27
0
    def buildExamples(self, sentenceGraph):
        """
        Build one example for each token of the sentence.

        The category is the class id of the (merged) type of the entities
        headed by the token, or 1 (negative) for tokens heading no entity.
        Returns a list of (exampleId, category, features, extra) tuples.
        """
        examples = []
        exampleIndex = 0
        
        # Per-sentence cache used by getTokenFeatures.
        self.tokenFeatures = {}
        
        namedEntityHeadTokens = []
        if not "names" in self.styles:
            # Count the known named entities; the count itself is a feature.
            namedEntityCount = 0
            for entity in sentenceGraph.entities:
                if entity.get("isName") == "True": # known data which can be used for features
                    namedEntityCount += 1
            namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
            
            if "pos_pairs" in self.styles:
                namedEntityHeadTokens = self.getNamedEntityHeadTokens(sentenceGraph)
        
        # Sentence-level bag-of-words counts; named-entity tokens are also
        # counted separately under an "ne_" prefix.
        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
        bowFeatures = {}
        for k,v in bagOfWords.iteritems():
            bowFeatures[self.featureSet.getId(k)] = v
        
        # Precompute sorted in/out dependency edges for each token.
        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
            fixedInEdges = []
            for edge in inEdges:
                fixedInEdges.append( (edge[0], edge[1], edge[2]["element"]) )
            inEdges = fixedInEdges
            inEdges.sort(compareDependencyEdgesById)
            self.inEdgesByToken[token] = inEdges
            outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True)
            fixedOutEdges = []
            for edge in outEdges:
                fixedOutEdges.append( (edge[0], edge[1], edge[2]["element"]) )
            outEdges = fixedOutEdges
            outEdges.sort(compareDependencyEdgesById)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)
        
        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]
            # Recognize only non-named entities (i.e. interaction words)
            if sentenceGraph.tokenIsName[token] and not "names" in self.styles and not "all_tokens" in self.styles:
                continue

            # CLASS
            # BUGFIX: 'typeString' was previously undefined, which raised a
            # NameError in the "split_merged" branch below; it is now the
            # merged entity type string ("neg" for the negative class).
            if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
                typeString = self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token])
                category = self.classSet.getId(typeString)
            else:
                typeString = "neg"
                category = 1
            
            tokenText = token.get("text").lower()
            if "stem_gazetteer" in self.styles:
                tokenText = PorterStemmer.stem(tokenText)
            # Tokens outside the gazetteer get minimal "excluded" examples.
            if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer:
                features = {}
                features[self.featureSet.getId("exclude_gazetteer")] = 1
                extra = {"xtype":"token","t":token.get("id"),"excluded":"True"}
                examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
                exampleIndex += 1
                continue

            # Tokens with excluded POS tags also get minimal examples.
            if ("filter_pos" in self.styles) and token.get("POS") in self.excludedPOS:
                features = {}
                features[self.featureSet.getId("filter_pos")] = 1
                extra = {"xtype":"token","t":token.get("id"),"excluded":"True"}
                examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
                exampleIndex += 1
                continue
            
            # FEATURES
            features = {}
            
            if not "names" in self.styles:
                features[self.featureSet.getId(namedEntityCountFeature)] = 1
            # pre-calculate bow _features_
            features.update(bowFeatures)
        
            # Main features: surface form, POS, stem and stripped suffix
            text = token.get("text")
            features[self.featureSet.getId("txt_"+text)] = 1
            features[self.featureSet.getId("POS_"+token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_"+stem)] = 1
            features[self.featureSet.getId("nonstem_"+text[len(stem):])] = 1
            
            # Linear order features
            for index in [-3,-2,-1,1,2,3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)
            
            # Content: capitalization, digits, special characters, n-grams
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j-1] == "-":
                        features[self.featureSet.getId("has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_"+text[j-1:j+1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_"+text[j-2:j+1].lower())] = 1
            
            # Attached edges (Hanging in and out edges)
            if not "no_hanging" in self.styles:
                t1InEdges = self.inEdgesByToken[token]
                for edge in t1InEdges:
                    edgeType = edge[2].get("type")
                    features[self.featureSet.getId("t1HIn_"+edgeType)] = 1
                    features[self.featureSet.getId("t1HIn_"+edge[0].get("POS"))] = 1
                    features[self.featureSet.getId("t1HIn_"+edgeType+"_"+edge[0].get("POS"))] = 1
                    tokenText = sentenceGraph.getTokenText(edge[0])
                    features[self.featureSet.getId("t1HIn_"+tokenText)] = 1
                    features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenText)] = 1
                t1OutEdges = self.outEdgesByToken[token]
                for edge in t1OutEdges:
                    edgeType = edge[2].get("type")
                    features[self.featureSet.getId("t1HOut_"+edgeType)] = 1
                    features[self.featureSet.getId("t1HOut_"+edge[1].get("POS"))] = 1
                    features[self.featureSet.getId("t1HOut_"+edgeType+"_"+edge[1].get("POS"))] = 1
                    tokenText = sentenceGraph.getTokenText(edge[1])
                    features[self.featureSet.getId("t1HOut_"+tokenText)] = 1
                    features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenText)] = 1
            # chains
            self.buildChains(token, sentenceGraph, features, 4)
            
            if "pos_pairs" in self.styles:
                self.buildPOSPairs(token, namedEntityHeadTokens, features)

            extra = {"xtype":"token","t":token.get("id")}
            if "split_merged" in self.styles:
                # A merged type like "A---B" produces one example per
                # component type (except Gene_expression---Positive_regulation,
                # which is kept merged).
                if typeString.find("---") != -1 and typeString != "Gene_expression---Positive_regulation":
                    # BUGFIX: map each component type name to its class id so
                    # the categories match the integer ids of the else-branch.
                    categories = [self.classSet.getId(catName) for catName in typeString.split("---")]
                else:
                    categories = [category]
                for cat in categories:
                    examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),cat,features,extra) )
                    exampleIndex += 1
            else:
                examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
                exampleIndex += 1

        return examples
Пример #28
0
    def buildExamples(self, sentenceGraph):
        """
        Build one token-classification example for each non-name token of the
        sentence.

        Returns a list of (exampleId, classId, featureDict, extraDict) tuples.
        A token's class is the merged type of the entities it heads, or 1
        (negative) when it heads none.

        NOTE(review): this variant also runs buildChainsAlternative on a copy
        of the feature vector and prints a diff against buildChains, so it
        appears to be a debugging/validation version of the example builder.
        Feature ids are allocated by featureSet.getId in encounter order, so
        the statement order here must not be changed.
        """
        self.timerBuildExamples.start()
        examples = []
        exampleIndex = 0
        
        # Per-sentence cache used by getTokenFeatures.
        self.tokenFeatures = {}
        
        # Count named entities; the count becomes one sentence-level feature.
        namedEntityCount = 0
        for entity in sentenceGraph.entities:
            if entity.get("isName") == "True": # known data which can be used for features
                namedEntityCount += 1
        namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
        
        # Bag-of-words counts over the sentence; name-head tokens are counted
        # a second time under an "ne_"-prefixed key.
        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
        # Pre-resolve bag-of-words keys to feature ids once; shared by every
        # example of this sentence.
        bowFeatures = {}
        for k,v in bagOfWords.iteritems():
            bowFeatures[self.featureSet.getId(k)] = v
        
        # Precalculate per-token dependency edge lists (sorted for
        # deterministic feature-id allocation order).
        self.timerCrawl.start()
        self.timerCrawlPrecalc.start()
        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            inEdges = sentenceGraph.dependencyGraph.in_edges(token)
            inEdges.sort(compareDependencyEdgesById)
            self.inEdgesByToken[token] = inEdges
            outEdges = sentenceGraph.dependencyGraph.out_edges(token)
            outEdges.sort(compareDependencyEdgesById)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)
        self.timerCrawl.stop()
        self.timerCrawlPrecalc.stop()
        
        # Precalculate matrices for the alternative (matrix-based) chain
        # builder exercised below.
        self.timerMatrix.start()
        self.timerMatrixPrecalc.start()
        self._initMatrices(sentenceGraph)
        self.timerMatrix.stop()
        self.timerMatrixPrecalc.stop()
        
        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]
            # Recognize only non-named entities (i.e. interaction words)
            if sentenceGraph.tokenIsName[token]:
                continue
            
            # CLASS
            if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
                category = self.classSet.getId(self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token]))
            else:
                category = 1  # negative class
            
            # FEATURES
            features = {}
            
            features[self.featureSet.getId(namedEntityCountFeature)] = 1
            #for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # pre-calculate bow _features_
            features.update(bowFeatures)
            
#            for j in range(len(sentenceGraph.tokens)):
#                text = "bow_" + sentenceGraph.tokens[j].get("text")
#                if j < i:
#                    features[self.featureSet.getId("bf_" + text)] = 1
#                elif j > i:
#                    features[self.featureSet.getId("af_" + text)] = 1
        
            # Main features: surface form, POS tag, stem and stripped suffix.
            text = token.get("text")
            features[self.featureSet.getId("txt_"+text)] = 1
            features[self.featureSet.getId("POS_"+token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_"+stem)] = 1
            features[self.featureSet.getId("nonstem_"+text[len(stem):])] = 1
            
            # Linear order features: +-3 token context window.
            # NOTE(review): "> 0" (not ">= 0") skips the sentence-initial
            # token as a context neighbour — presumably intentional, as the
            # same test appears in the sibling builders.
            for index in [-3,-2,-1,1,2,3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)
            
            # Content: capitalization, digits, special characters and
            # lowercase character di-/tri-grams.
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j-1] == "-":
                        features[self.featureSet.getId("has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_"+text[j-1:j+1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_"+text[j-2:j+1].lower())] = 1
            
            # Attached edges (Hanging in and out edges)
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_"+edgeType)] = 1
                features[self.featureSet.getId("t1HIn_"+edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_"+tokenText)] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenText)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_"+edgeType)] = 1
                features[self.featureSet.getId("t1HOut_"+edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_"+tokenText)] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenText)] = 1
             
            extra = {"xtype":"token","t":token.get("id")}
            examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
            exampleIndex += 1
            
            # chains
            # Debug/validation: run both chain builders and diff their output.
            # The example appended above already contains the buildChains
            # features by reference, since 'features' is mutated in place.
            copyFeatures = copy.copy(features)
            self.timerCrawl.start()
            self.buildChains(token, sentenceGraph, features)
            self.timerCrawl.stop()
            self.timerMatrix.start()
            self.buildChainsAlternative(token, copyFeatures, sentenceGraph)
            self.timerMatrix.stop()
            # Report any feature-key disagreement between the two builders.
            diff1 = set(features.keys()) - set(copyFeatures.keys())
            diff2 = set(copyFeatures.keys()) - set(features.keys())
            if len(diff1) != 0 or len(diff2) != 0:
                print "Error for token", token.get("id"), token.get("text")
                intersection = set(features.keys()) & set(copyFeatures.keys())
                print "d1:",
                for key in sorted(diff1):
                    print self.featureSet.getName(key) + ",",
                print
                print "d2:",
                for key in sorted(diff2):
                    print self.featureSet.getName(key) + ",",
                print
                print "int:",
                intNames = []
                for key in sorted(intersection):
                    intNames.append(self.featureSet.getName(key))
                for name in sorted(intNames):
                    print name + ",",
                print
                #assert(len(diff1) == 0)
        self.timerBuildExamples.stop()
        return examples
Пример #29
0
    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
        """
        Build one task-3 (speculation/negation) example per non-name entity
        of the sentence and append each example to *outfile* via
        ExampleUtils.appendExamples.

        Parameters:
            sentenceGraph: predicted sentence graph to build examples from.
            outfile:       open output handle passed to ExampleUtils.
            goldGraph:     optional gold sentence graph; when given, example
                           classes are overridden from the mapped gold
                           entities' speculation/negation attributes.

        Returns the number of examples written (exampleIndex).

        NOTE(review): feature ids are allocated by featureSet.getId in
        encounter order, so statement order must be preserved exactly.
        """
        examples = []
        exampleIndex = 0

        # Per-sentence cache used by getTokenFeatures.
        self.tokenFeatures = {}

        # Map predicted entities to gold entities for class determination.
        if goldGraph != None:
            entityToGold = EvaluateInteractionXML.mapEntities(
                sentenceGraph.entities, goldGraph.entities)

        # Sentence-level counts of named entities and (trigger) entities;
        # each count becomes a single feature.
        namedEntityCount = 0
        entityCount = 0
        for entity in sentenceGraph.entities:
            if entity.get(
                    "isName"
            ) == "True":  # known data which can be used for features
                namedEntityCount += 1
            else:  # known data which can be used for features
                entityCount += 1
        namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
        entityCountFeature = "entityCount_" + str(entityCount)

        # Bag-of-words counts; name-head tokens counted again under "ne_".
        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
            # NOTE(review): this tests the whole tokenIsEntityHead mapping,
            # not tokenIsEntityHead[token], so the condition holds for every
            # token whenever the mapping is non-empty — looks like a bug,
            # but kept as-is (changing it would alter the feature set).
            if len(sentenceGraph.tokenIsEntityHead) > 0:
                text = "ge_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1

            # Speculation-word bag features (counts plus a sentence flag).
            text = token.get("text")
            if self.styles["speculation_words"] and text in self.specWords:
                if not bagOfWords.has_key("spec_bow_" + text):
                    bagOfWords["spec_bow_" + text] = 0
                bagOfWords["spec_bow_" + text] += 1
                bagOfWords["spec_sentence"] = 1

        # Pre-resolve bag-of-words keys to feature ids once per sentence.
        bowFeatures = {}
        for k, v in bagOfWords.iteritems():
            bowFeatures[self.featureSet.getId(k)] = v

        # Precalculate per-token dependency edge lists.
        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
            self.inEdgesByToken[token] = inEdges
            outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)

        for entity in sentenceGraph.entities:
            #token = sentenceGraph.tokens[i]
            token = sentenceGraph.entityHeadTokenByEntity[entity]
            # Recognize only non-named entities (i.e. interaction words)
            if entity.get("isName") == "True":
                continue

            # CLASS
            # NOTE(review): if self.styles["classification"] is none of
            # "multiclass"/"speculation"/"negation", task3Type, category and
            # categoryName stay undefined and the code below raises
            # NameError — verify the allowed style values at the call site.
            if self.styles["classification"] == "multiclass":
                task3Type = "multiclass"
                categoryName = ""
                if entity.get("negation") == "True":
                    categoryName += "negation"
                if entity.get("speculation") == "True":
                    if categoryName != "":
                        categoryName += "---"
                    categoryName += "speculation"
                if categoryName == "":
                    categoryName = "neg"
                category = self.classSet.getId(categoryName)
            elif self.styles["classification"] == "speculation":
                task3Type = "speculation"
                if entity.get("speculation") == "True":
                    category = self.classSet.getId("speculation")
                else:
                    category = 1
                # Gold class overrides the predicted one when available.
                if goldGraph != None:
                    if len(entityToGold[entity]) > 0 and entityToGold[entity][
                            0].get("speculation") == "True":
                        category = self.classSet.getId("speculation")
                    else:
                        category = 1
                categoryName = self.classSet.getName(category)
            elif self.styles["classification"] == "negation":
                task3Type = "negation"
                if entity.get("negation") == "True":
                    category = self.classSet.getId("negation")
                else:
                    category = 1
                # Gold class overrides the predicted one when available.
                if goldGraph != None:
                    if len(entityToGold[entity]) > 0 and entityToGold[entity][
                            0].get("negation") == "True":
                        category = self.classSet.getId("negation")
                    else:
                        category = 1
                categoryName = self.classSet.getName(category)
            self.exampleStats.beginExample(categoryName)

            # FEATURES
            features = {}

            # ENTITY TYPE
            #entityType = self.classSet.getId(self.getMergedEntityType(entity))
            #del self.classSet.Ids[self.getMergedEntityType(entity)]
            #IF LOCAL
            # There's a mistake here. The entityType should be the string, not
            # the id of the type. But there's also another issue. getMergedEntityType
            # expects a list, not an item. Therefore the type is always empty ->
            # types don't get used in classification. But this is the code used in
            # the publication, so it will now be published as is, and fixed in a later
            # release.
            #
            # Besides, using the classSet here generates an unneeded
            # additional class, that shows up in evaluations etc. However, to be
            # able to publish the exact models used for the publication experiments,
            # this can't be fixed so it breaks feature id consistency. Therefore I'll
            # now just remove the redundant class id from the classSet.
            #ENDIF
            #features[self.featureSet.getId(entityType)] = 1

            features[self.featureSet.getId(namedEntityCountFeature)] = 1
            features[self.featureSet.getId(entityCountFeature)] = 1
            #for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # pre-calculate bow _features_
            features.update(bowFeatures)

            #            for j in range(len(sentenceGraph.tokens)):
            #                text = "bow_" + sentenceGraph.tokens[j].get("text")
            #                if j < i:
            #                    features[self.featureSet.getId("bf_" + text)] = 1
            #                elif j > i:
            #                    features[self.featureSet.getId("af_" + text)] = 1

            # Main features: surface form, POS tag, stem and stripped suffix.
            text = token.get("text")
            features[self.featureSet.getId("txt_" + text)] = 1
            features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_" + stem)] = 1
            features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1

            # Speculation-word lexicon flags for the head token itself.
            if self.styles["speculation_words"]:
                if text in self.specWords:
                    features[self.featureSet.getId("ent_spec")] = 1
                if stem in self.specWordStems:
                    features[self.featureSet.getId("ent_spec_stem")] = 1

            # Linear order features: find the head token's index i, then add
            # a +-3 token context window.
            for i in range(len(sentenceGraph.tokens)):
                if token == sentenceGraph.tokens[i]:
                    break
            for index in [-3, -2, -1, 1, 2, 3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index,
                                                  str(index), features)

            # Content: capitalization, digits, special characters and
            # lowercase character di-/tri-grams.
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j - 1] == "-":
                        features[self.featureSet.getId(
                            "has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_" +
                                                   text[j - 1:j +
                                                        1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_" +
                                                   text[j - 2:j +
                                                        1].lower())] = 1

            # Attached edges (Hanging in and out edges)
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
                features[self.featureSet.getId("t1HIn_" +
                                               edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                               edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                               tokenText)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
                features[self.featureSet.getId("t1HOut_" +
                                               edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                               edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                               tokenText)] = 1

            # Dependency chain features.
            self.buildChains(token, sentenceGraph, features)

            extra = {
                "xtype": "task3",
                "t3type": task3Type,
                "t": token.get("id"),
                "entity": entity.get("id")
            }
            #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
            # Stream each example directly to the output file.
            example = (sentenceGraph.getSentenceId() + ".x" +
                       str(exampleIndex), category, features, extra)
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1
            self.exampleStats.endExample()
        #return examples
        return exampleIndex
Пример #30
0
    def buildFeatures(self, token, linear=True, chains=True):
        sentenceGraph = self.sentenceGraph
        tokenIndex = None
        for i in range(len(self.sentenceGraph.tokens)):
            if token == self.sentenceGraph.tokens[i]:
                tokenIndex = i
                break
        assert tokenIndex != None
        token = self.sentenceGraph.tokens[tokenIndex]
        
        #if not "names" in self.styles:
        self.setFeature(self.namedEntityCountFeature, 1)
        
        #self.features.update(self.bowFeatures) # Note! these do not get tagged
        
#            for j in range(len(sentenceGraph.tokens)):
#                text = "bow_" + sentenceGraph.tokens[j].get("text")
#                if j < i:
#                    features[self.featureSet.getId("bf_" + text)] = 1
#                elif j > i:
#                    features[self.featureSet.getId("af_" + text)] = 1
    
        # Main features
        text = token.get("text")
        self.setFeature("txt_"+text, 1)
        self.setFeature("POS_"+token.get("POS"), 1)
        stem = PorterStemmer.stem(text)
        self.setFeature("stem_"+stem, 1)
        self.setFeature("nonstem_"+text[len(stem):], 1)
        
        # Linear order features
        if linear:
            for index in [-3,-2,-1,1,2,3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index))
        
        # Content
        if i > 0 and text[0].isalpha() and text[0].isupper():
            self.setFeature("upper_case_start", 1)
        for j in range(len(text)):
            if j > 0 and text[j].isalpha() and text[j].isupper():
                self.setFeature("upper_case_middle", 1)
            # numbers and special characters
            if text[j].isdigit():
                self.setFeature("has_digits", 1)
                if j > 0 and text[j-1] == "-":
                    self.setFeature("has_hyphenated_digit", 1)
            elif text[j] == "-":
                self.setFeature("has_hyphen", 1)
            elif text[j] == "/":
                self.setFeature("has_fslash", 1)
            elif text[j] == "\\":
                self.setFeature("has_bslash", 1)
            # duplets
            if j > 0:
                self.setFeature("dt_"+text[j-1:j+1].lower(), 1)
            # triplets
            if j > 1:
                self.setFeature("tt_"+text[j-2:j+1].lower(), 1)
        
        # chains
        if chains:
            self.buildChains(token, sentenceGraph)
Пример #31
0
    def buildExamplesInner(self, sentenceGraph, goldGraph):
        """
        Build one example for each token of the sentence
        """
        if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
            print >>sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get("origId")
            return []

        self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
        self.triggerFeatureBuilder.initSentence(sentenceGraph)

        undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        # Get argument order
        self.interactionLengths = self.getInteractionEdgeLengths(sentenceGraph, paths)
        self.interactionLengths = self.interactionLengths.values()
        self.interactionLengths.sort(compareInteractionPrecedence)
        # Map tokens to entities
        tokenByOffset = {}
        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]
            if goldGraph != None:
                goldToken = goldGraph.tokens[i]
                assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
            tokenByOffset[token.get("charOffset")] = token.get("id")
        # Map gold entities to their head offsets
        goldEntitiesByOffset = {}
        for token in sentenceGraph.tokens:
            goldEntitiesByOffset[token.get("charOffset")] = []
        entityToGold = {}
        for entity in sentenceGraph.entities:
            entityToGold[entity] = []
        if goldGraph != None:
            for entity in goldGraph.entities:
                offset = entity.get("headOffset")
                assert offset != None
                goldEntitiesByOffset[offset].append(entity)
            # Map predicted entities to gold entities
            for entity in sentenceGraph.entities:
                eType = entity.get("type")
                eOffset = entity.get("headOffset")
                for goldEntity in goldEntitiesByOffset[eOffset]:
                    if goldEntity.get("type") == eType:
                        entityToGold[entity].append(goldEntity)
        # Map entities to interactions
        # interactionsByEntityId = {}
        # for entity in sentenceGraph.entities:
        #    interactionsByEntityId[entity.get("id")] = []
        # Map tokens to interactions
        interactionsByToken = {}
        for token in sentenceGraph.tokens:
            interactionsByToken[token] = []
        for interactionTuple in self.interactionLengths:
            interaction = interactionTuple[0]
            if interaction.get("type") == "neg":
                continue
            e1Id = interaction.get("e1")
            token = sentenceGraph.entityHeadTokenByEntity[sentenceGraph.entitiesById[e1Id]]
            interactionsByToken[token].append(interaction)

        examples = []
        exampleIndex = 0

        self.tokenFeatures = {}

        # namedEntityNorStrings = set()
        namedEntityHeadTokens = []
        if not "names" in self.styles:
            namedEntityCount = 0
            for entity in sentenceGraph.entities:
                if entity.get("isName") == "True":  # known data which can be used for features
                    namedEntityCount += 1
                    # namedEntityNorStrings.add( entity.get("text").replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower() )
            namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
            # if namedEntityCount == 0: # no names, no need for triggers
            #    return []

            if "pos_pairs" in self.styles:
                namedEntityHeadTokens = self.getNamedEntityHeadTokens(sentenceGraph)

        # neFeatures = {} # F: 69.35 -> 69.14
        # for norString in namedEntityNorStrings:
        #    neFeatures[self.featureSet.getId("norNE_" + norString)] = 1

        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
        bowFeatures = {}
        for k, v in bagOfWords.iteritems():
            bowFeatures[self.featureSet.getId(k)] = v

        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
            fixedInEdges = []
            for edge in inEdges:
                fixedInEdges.append((edge[0], edge[1], edge[2]["element"]))
            inEdges = fixedInEdges
            inEdges.sort(compareDependencyEdgesById)
            self.inEdgesByToken[token] = inEdges
            outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True)
            fixedOutEdges = []
            for edge in outEdges:
                fixedOutEdges.append((edge[0], edge[1], edge[2]["element"]))
            outEdges = fixedOutEdges
            outEdges.sort(compareDependencyEdgesById)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)

        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]
            # Recognize only non-named entities (i.e. interaction words)
            if sentenceGraph.tokenIsName[token] and not "names" in self.styles and not "all_tokens" in self.styles:
                continue

            # CLASS
            # if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
            #    category = self.classSet.getId(self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token]))
            # else:
            #    category = 1
            offset = token.get("charOffset")
            if len(goldEntitiesByOffset[offset]) > 0:
                category = self.classSet.getId(self.getMergedEntityType(goldEntitiesByOffset[offset]))
            else:
                category = 1

            tokenText = token.get("text").lower()
            if "stem_gazetteer" in self.styles:
                tokenText = PorterStemmer.stem(tokenText)
            if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer:
                features = {}
                features[self.featureSet.getId("exclude_gazetteer")] = 1
                extra = {"xtype": "token", "t": token.get("id"), "excluded": "True"}
                examples.append((sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra))
                exampleIndex += 1
                continue

            # FEATURES
            features = {}
            self.features = features

            if not "names" in self.styles:
                features[self.featureSet.getId(namedEntityCountFeature)] = 1
            # for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # pre-calculate bow _features_
            features.update(bowFeatures)
            # features.update(neFeatures)

            #            for j in range(len(sentenceGraph.tokens)):
            #                text = "bow_" + sentenceGraph.tokens[j].get("text")
            #                if j < i:
            #                    features[self.featureSet.getId("bf_" + text)] = 1
            #                elif j > i:
            #                    features[self.featureSet.getId("af_" + text)] = 1

            # Main features: surface form, POS tag, Porter stem, and the
            # residual (inflectional suffix) left after removing the stem.
            text = token.get("text")
            features[self.featureSet.getId("txt_" + text)] = 1
            features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_" + stem)] = 1
            features[self.featureSet.getId("nonstem_" + text[len(stem) :])] = 1

            # Normalized versions of the string (if same as non-normalized, overlap without effect)
            # Strips hyphen/slash/comma/backslash/space and lower-cases, then
            # re-derives stem + suffix features from the normalized form.
            normalizedText = (
                text.replace("-", "").replace("/", "").replace(",", "").replace("\\", "").replace(" ", "").lower()
            )
            if normalizedText == "bound":  # should be for all irregular verbs
                normalizedText = "bind"
            features[self.featureSet.getId("txt_" + normalizedText)] = 1
            norStem = PorterStemmer.stem(normalizedText)
            features[self.featureSet.getId("stem_" + norStem)] = 1
            features[self.featureSet.getId("nonstem_" + normalizedText[len(norStem) :])] = 1

            # Gazetteer lookup on the (optionally stemmed) lower-cased token:
            # each known label contributes a weighted feature.
            if "gazetteer_features_maintoken" in self.styles:
                tokTxtLower = text.lower()
                if "stem_gazetteer" in self.styles:
                    tokTxtLower = PorterStemmer.stem(tokTxtLower)
                if self.gazetteer and tokTxtLower in self.gazetteer:
                    for label, weight in self.gazetteer[tokTxtLower].items():
                        features[self.featureSet.getId("gaz_knownLabel_" + label)] = weight  # 1 performs slightly worse

            # Linear order features
            # for index in [-3,-2,-1,1,2,3,4,5]: # 69.35 -> 68.97
            # Context-window features for tokens up to 3 positions away.
            # NOTE(review): `i + index > 0` excludes position 0, so the
            # sentence-initial token is never used as a neighbour; `>= 0`
            # looks intended — confirm before changing (affects trained models).
            for index in [-3, -2, -1, 1, 2, 3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)

            # Content
            # Character-level orthographic features: capitalization, digits,
            # hyphen/slash characters, and lower-cased character bi-/tri-grams.
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j - 1] == "-":
                        features[self.featureSet.getId("has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_" + text[j - 1 : j + 1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_" + text[j - 2 : j + 1].lower())] = 1

            # Attached edges (Hanging in and out edges)
            # For each dependency edge touching this token, add features for
            # the edge type, the other endpoint's POS and text, and their
            # type+POS / type+text combinations.
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
                features[self.featureSet.getId("t1HIn_" + edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" + edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" + tokenText)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
                features[self.featureSet.getId("t1HOut_" + edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" + edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" + tokenText)] = 1

            # Emit the example: (id, class, feature dict, extra metadata).
            # NOTE: `features` is stored by reference, so the chain/POS-pair
            # features added below still end up in this example. Fragile but
            # currently load-bearing — do not "fix" by copying the dict.
            extra = {"xtype": "token", "t": token.get("id")}
            examples.append((sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra))
            exampleIndex += 1

            # chains
            self.buildChains(token, sentenceGraph, features)

            if "pos_pairs" in self.styles:
                self.buildPOSPairs(token, namedEntityHeadTokens, features)

            # NOTE(review): `features` is not passed here, unlike the calls
            # above — presumably buildPredictionFeatures caches state on self
            # or mutates elsewhere; verify this is intentional.
            self.buildPredictionFeatures(sentenceGraph, paths, token, interactionsByToken[token])
        return examples