def getTokenFeatures(self, token, sentenceGraph):
    """
    Return a feature dictionary for a single token, memoized per token.

    Features: token text, POS tag, optional speculation-word flags
    (when self.styles["speculation_words"] is truthy), a name flag, and
    the types of named entities headed by this token.

    token: token element supporting .get("POS")
    sentenceGraph: provides getTokenText(), tokenIsName, tokenIsEntityHead
    Returns: dict mapping feature-name string -> weight (shared cached
    object; callers must not mutate it).
    """
    # Cache hit: features were already built for this token.
    # FIX: dict.has_key() was removed in Python 3; "in" is equivalent in
    # Python 2 and works in both.
    if token in self.tokenFeatures:
        return self.tokenFeatures[token]
    tokTxt = sentenceGraph.getTokenText(token)
    features = {}
    features["_txt_" + tokTxt] = 1
    features["_POS_" + token.get("POS")] = 1
    if self.styles["speculation_words"]:
        # Flag tokens (and stems) found in the speculation-word lexicon.
        if tokTxt in self.specWords:
            features["_spec"] = 1
            features["_spec_" + tokTxt] = 1
        tokStem = PorterStemmer.stem(tokTxt)
        if tokStem in self.specWordStems:
            features["_spec_stem"] = 1
            features["_spec_stem_" + tokStem] = 1
    if sentenceGraph.tokenIsName[token]:
        features["_isName"] = 1
    # Record the types of named entities whose head token this is.
    for entity in sentenceGraph.tokenIsEntityHead[token]:
        if entity.get("isName") == "True":
            features["_annType_" + entity.get("type")] = 1
    if self.gazetteer and tokTxt.lower() in self.gazetteer:
        for label, weight in self.gazetteer[tokTxt.lower()].items():
            pass  # features["_knownLabel_"+label]=weight (deliberately disabled)
    self.tokenFeatures[token] = features
    return features
def getTokenFeatures(self, token, sentenceGraph):
    """
    Return a feature dictionary for a single token, memoized per token.

    Features: token text, POS tag, a name flag, named-entity head types,
    and optional gazetteer label weights (when "gazetteer_features" is in
    self.styles; the lookup key is lowercased and, with "stem_gazetteer",
    Porter-stemmed).

    Returns: dict mapping feature-name string -> weight (shared cached
    object; callers must not mutate it).
    """
    # Cache hit: features were already built for this token.
    # FIX: dict.has_key() was removed in Python 3; "in" is equivalent in
    # Python 2 and works in both.
    if token in self.tokenFeatures:
        return self.tokenFeatures[token]
    tokTxt = sentenceGraph.getTokenText(token)
    features = {}
    features["_txt_" + tokTxt] = 1
    features["_POS_" + token.get("POS")] = 1
    if sentenceGraph.tokenIsName[token]:
        features["_isName"] = 1
    # Record the types of named entities whose head token this is.
    for entity in sentenceGraph.tokenIsEntityHead[token]:
        if entity.get("isName") == "True":
            features["_annType_" + entity.get("type")] = 1
    # Filip's gazetteer based features (can be used separately from exclude_gazetteer)
    if "gazetteer_features" in self.styles:
        tokTxtLower = tokTxt.lower()
        if "stem_gazetteer" in self.styles:
            tokTxtLower = PorterStemmer.stem(tokTxtLower)
        if self.gazetteer and tokTxtLower in self.gazetteer:
            for label, weight in self.gazetteer[tokTxtLower].items():
                features["_knownLabel_" + label] = weight  # 1 performs slightly worse
    self.tokenFeatures[token] = features
    return features
def getGazetteerMatch(self, string):
    """
    Resolve a token string against the gazetteer, memoizing the result.

    Tries, in order: the (optionally Porter-stemmed) string itself, the
    string with hyphens removed, and the substring after the last hyphen.
    Returns the first gazetteer key that matches, or None; the outcome is
    cached in self.gazMatchCache under the original string.
    """
    if string in self.gazMatchCache:
        return self.gazMatchCache[string]
    cacheKey = string
    if "stem_gazetteer" in self.styles:
        string = PorterStemmer.stem(string)
    # Candidate forms in the original priority order; hyphen variants are
    # only considered when the string actually contains a hyphen.
    candidates = [string]
    if "-" in string:
        candidates.append(string.replace("-", ""))
        candidates.append(string.rsplit("-", 1)[-1])
    match = None
    for candidate in candidates:
        if candidate in self.gazetteer:
            match = candidate
            break
    self.gazMatchCache[cacheKey] = match
    return match
def getTokenFeatures(self, token, sentenceGraph):
    """
    Return a feature dictionary for a single token, memoized per token.

    Features: token text, POS tag, a name flag, named-entity head types,
    and optional gazetteer label weights (when "gazetteer_features" is in
    self.styles; the lookup key is lowercased and, with "stem_gazetteer",
    Porter-stemmed).

    Returns: dict mapping feature-name string -> weight (shared cached
    object; callers must not mutate it).
    """
    # Cache hit: features were already built for this token.
    # FIX: dict.has_key() was removed in Python 3; "in" is equivalent in
    # Python 2 and works in both.
    if token in self.tokenFeatures:
        return self.tokenFeatures[token]
    tokTxt = sentenceGraph.getTokenText(token)
    features = {}
    features["_txt_" + tokTxt] = 1
    # Normalized-text variants were tried and hurt performance
    # (F 69.35 -> 68.22), so they remain disabled:
    #normalizedText = tokTxt.replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower()
    #features["_norTxt_"+normalizedText]=1
    #features["_norStem_" + PorterStemmer.stem(normalizedText)]=1
    features["_POS_" + token.get("POS")] = 1
    if sentenceGraph.tokenIsName[token]:
        features["_isName"] = 1
    # Record the types of named entities whose head token this is.
    for entity in sentenceGraph.tokenIsEntityHead[token]:
        if entity.get("isName") == "True":
            features["_annType_" + entity.get("type")] = 1
    # Filip's gazetteer based features (can be used separately from exclude_gazetteer)
    if "gazetteer_features" in self.styles:
        tokTxtLower = tokTxt.lower()
        if "stem_gazetteer" in self.styles:
            tokTxtLower = PorterStemmer.stem(tokTxtLower)
        if self.gazetteer and tokTxtLower in self.gazetteer:
            for label, weight in self.gazetteer[tokTxtLower].items():
                features["_knownLabel_" + label] = weight  # 1 performs slightly worse
    self.tokenFeatures[token] = features
    return features
def getGazetteerMatch(self, string):
    """
    Look up a token string in the gazetteer, with hyphen fallbacks.

    Order of attempts on the (optionally stemmed) string: exact match,
    hyphens stripped, then the part after the last hyphen. The first hit
    (or None) is cached in self.gazMatchCache keyed by the original,
    unstemmed string.
    """
    if string in self.gazMatchCache:
        return self.gazMatchCache[string]

    def _remember(value):
        # Record the outcome under the caller-supplied key and pass it on.
        self.gazMatchCache[origString] = value
        return value

    origString = string
    if "stem_gazetteer" in self.styles:
        string = PorterStemmer.stem(string)
    if string in self.gazetteer:
        return _remember(string)
    if "-" not in string:
        return _remember(None)
    dehyphenated = string.replace("-", "")
    if dehyphenated in self.gazetteer:
        return _remember(dehyphenated)
    lastPart = string.rsplit("-", 1)[-1]
    if lastPart in self.gazetteer:
        return _remember(lastPart)
    return _remember(None)
def getTokenFeatures(self, token, sentenceGraph):
    """
    Return a feature dictionary for a single token, memoized per token.

    Features: token text, POS tag, optional speculation-word flags
    (when self.styles["speculation_words"] is truthy), a name flag, and
    the types of named entities headed by this token.

    Returns: dict mapping feature-name string -> weight (shared cached
    object; callers must not mutate it).
    """
    # Cache hit: features were already built for this token.
    # FIX: dict.has_key() was removed in Python 3; "in" is equivalent in
    # Python 2 and works in both.
    if token in self.tokenFeatures:
        return self.tokenFeatures[token]
    tokTxt = sentenceGraph.getTokenText(token)
    features = {}
    features["_txt_" + tokTxt] = 1
    features["_POS_" + token.get("POS")] = 1
    if self.styles["speculation_words"]:
        # Flag tokens (and stems) found in the speculation-word lexicon.
        if tokTxt in self.specWords:
            features["_spec"] = 1
            features["_spec_" + tokTxt] = 1
        tokStem = PorterStemmer.stem(tokTxt)
        if tokStem in self.specWordStems:
            features["_spec_stem"] = 1
            features["_spec_stem_" + tokStem] = 1
    if sentenceGraph.tokenIsName[token]:
        features["_isName"] = 1
    # Record the types of named entities whose head token this is.
    for entity in sentenceGraph.tokenIsEntityHead[token]:
        if entity.get("isName") == "True":
            features["_annType_" + entity.get("type")] = 1
    if self.gazetteer and tokTxt.lower() in self.gazetteer:
        for label, weight in self.gazetteer[tokTxt.lower()].items():
            pass  # features["_knownLabel_"+label]=weight (deliberately disabled)
    self.tokenFeatures[token] = features
    return features
def printStats(self):
    """
    Report which gold events produced no examples and summarize example
    generation counts.

    Writes a per-type listing of missed events to the file "missed-events"
    in the working directory, then prints to stderr: per-type missed-event
    counts split into (other, intersentence, non-gazetteer), and per-type
    built/skipped example totals with skip reasons.
    """
    # Count gold events by type.
    eventsByType = {}
    for event in self.eventsByOrigId.values():
        eventsByType[event.get("type")] = eventsByType.get(event.get("type"),0) + 1
    f = open("missed-events", "wt")
    # Group original ids of events that yielded zero examples by event type.
    missedEvents = {}
    for key in self.examplesByEventOrigId.keys():
        if self.examplesByEventOrigId[key] == 0:
            if not missedEvents.has_key(self.eventsByOrigId[key].get("type")):
                missedEvents[self.eventsByOrigId[key].get("type")] = []
            missedEvents[self.eventsByOrigId[key].get("type")].append(key)
    # Write the detailed per-type listing, annotating each missed event
    # with why it was likely missed.
    for key in sorted(missedEvents.keys()):
        f.write(key + "\n")
        for id in sorted(missedEvents[key]):
            f.write(" " + id + " ")
            if id in self.interSentenceEvents:
                f.write("intersentence ")
            text = self.headTokensByOrigId[id].get("text").lower()
            if not self.isInGazetteer(text):
                text = self.headTokensByOrigId[id].get("text").lower()
                # NOTE(review): 'stemmed' is only assigned when
                # "stem_gazetteer" is in self.styles, but the write below
                # uses it unconditionally -- a NameError waits here if the
                # style is absent. Confirm "stem_gazetteer" is always set
                # when this method runs.
                if "stem_gazetteer" in self.styles:
                    stemmed = PorterStemmer.stem(text)
                f.write("not-in-gazetteer (" + text + " / " + stemmed +")" )
            f.write("\n")
    f.close()
    # Console summary: per-type missed counts (Python 2 print statement).
    print >> sys.stderr, "Example selection missed events (other, intersentence, non-gazetteer)"
    for key in sorted(eventsByType.keys()):
        inter = 0
        other = 0
        nongaz = 0
        if missedEvents.has_key(key):
            for id in missedEvents[key]:
                tokText = self.headTokensByOrigId[id].get("text").lower()
                if id in self.interSentenceEvents:
                    inter += 1
                elif not self.isInGazetteer(tokText):
                    nongaz += 1
                else:
                    other += 1
        if inter == other == nongaz == 0:
            print >> sys.stderr, " " + key + " (" + str(eventsByType[key]) + "): missed none"
        else:
            print >> sys.stderr, " " + key + " (" + str(eventsByType[key]) + "): " + str(other) + ", " + str(inter) + ", " + str(nongaz)
    # Console summary: examples built/skipped per type, with skip reasons.
    print >> sys.stderr, "Example generation (total, built/skipped)"
    for key in sorted(list(set(self.skippedByType.keys() + self.builtByType.keys()))):
        string = " " + key + ": (" + str(self.builtByType.get(key,0)+self.skippedByType.get(key,0)) + ", " + str(self.builtByType.get(key,0)) + "/" + str(self.skippedByType.get(key,0)) + ") ["
        for key2 in sorted(self.skippedByTypeAndReason[key].keys()):
            string += key2 + ":" + str(self.skippedByTypeAndReason[key][key2]) + " "
        string += "]"
        print >> sys.stderr, string
def mapSplits(splits, string, stringOffset):
    """
    Locate each substring of *splits* inside *string* and stem it.

    Substrings are searched left to right, each search starting after the
    previous match, so repeated substrings map to successive occurrences.
    stringOffset is accepted for interface compatibility but not used.

    Returns a list of (split, stem, (offset, length)) tuples; asserts if
    a split cannot be found.
    """
    searchFrom = 0
    mapped = []
    for part in splits:
        position = string.find(part, searchFrom)
        assert position != -1
        mapped.append((part, PorterStemmer.stem(part), (position, len(part))))
        # Continue the next search after this occurrence.
        searchFrom = position + len(part)
    return mapped
def mapSplits(splits, string, stringOffset):
    """
    Map each substring in *splits* onto *string*, pairing it with its
    Porter stem and an (offset, length) span.

    Searches proceed left to right; each lookup begins where the previous
    match ended, so duplicates resolve to distinct occurrences. The
    stringOffset parameter is kept for interface compatibility (unused).
    """
    result = []
    cursor = 0
    for piece in splits:
        at = string.find(piece, cursor)
        assert at != -1
        stemmed = PorterStemmer.stem(piece)
        result.append((piece, stemmed, (at, len(piece))))
        cursor = at + len(piece)
    return result
def readWords(words):
    """
    Build a word set and the corresponding Porter-stem set.

    words: either a filename (string) of a file with one word per line,
           or an iterable of words.
    Returns: (wordSet, stemSet) -- two sets of strings.
    """
    if type(words) in types.StringTypes:  # Python 2 check: str or unicode
        # BUG FIX: the original opened the undefined name 'filename';
        # the parameter 'words' is the filename in this branch.
        wordSet = set()
        f = open(words)
        for line in f:  # iterate the file directly; no need for readlines()
            wordSet.add(line.strip())
        f.close()
    else:  # assume it's a list (or other iterable) of words
        wordSet = set(words)
    # Stem every word once; duplicates collapse in the set.
    stemSet = set()
    for word in wordSet:
        stemSet.add(PorterStemmer.stem(word))
    return wordSet, stemSet
def getTriggers(corpus):
    """
    Returns a dictionary of "entity type"->"stemmed entity text"->"count".

    Named entities (isName == "True") are skipped; only trigger-like
    entities are counted. Entity texts are Porter-stemmed before counting.

    corpus: anything accepted by ETUtils.ETFromObj (ElementTree or path).
    """
    corpus = ETUtils.ETFromObj(corpus)
    trigDict = {}
    for entity in corpus.getroot().getiterator("entity"):
        if entity.get("isName") == "True":
            continue
        eType = entity.get("type")
        # FIX: dict.has_key() was removed in Python 3; "not in" is
        # equivalent in Python 2 and works in both.
        if eType not in trigDict:
            trigDict[eType] = {}
        eText = entity.get("text")
        eText = PorterStemmer.stem(eText)
        if eText not in trigDict[eType]:
            trigDict[eType][eText] = 0
        trigDict[eType][eText] += 1
    return trigDict
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
    """ Build one example for each token of the sentence """
    # Examples are streamed to 'outfile' via ExampleUtils.appendExamples;
    # the local 'examples' list is vestigial and stays empty.
    examples = []
    exampleIndex = 0
    # Per-sentence token feature cache (used by getTokenFeatures).
    self.tokenFeatures = {}
    # Map predicted entities to gold entities when a gold graph is given,
    # so task-3 labels can be taken from the gold annotation.
    if goldGraph != None:
        entityToGold = EvaluateInteractionXML.mapEntities(sentenceGraph.entities, goldGraph.entities)
    # Sentence-level counts of named vs. other entities, encoded as
    # single indicator features below.
    namedEntityCount = 0
    entityCount = 0
    for entity in sentenceGraph.entities:
        if entity.get("isName") == "True": # known data which can be used for features
            namedEntityCount += 1
        else: # known data which can be used for features
            entityCount += 1
    namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
    entityCountFeature = "entityCount_" + str(entityCount)
    # Bag-of-words over the sentence; name tokens additionally get "ne_"
    # (and, when any entity heads exist, "ge_") prefixed variants.
    # NOTE(review): the "ge_" variant chains onto the possibly already
    # "ne_"-prefixed text -- confirm that stacking is intentional.
    bagOfWords = {}
    for token in sentenceGraph.tokens:
        text = "bow_" + token.get("text")
        if not bagOfWords.has_key(text):
            bagOfWords[text] = 0
        bagOfWords[text] += 1
        if sentenceGraph.tokenIsName[token]:
            text = "ne_" + text
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
        if len(sentenceGraph.tokenIsEntityHead) > 0:
            text = "ge_" + text
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
        text = token.get("text")
        if self.styles["speculation_words"] and text in self.specWords:
            if not bagOfWords.has_key("spec_bow_"+text):
                bagOfWords["spec_bow_"+text] = 0
            bagOfWords["spec_bow_"+text] += 1
            bagOfWords["spec_sentence"] = 1
    # Translate the bag of words to feature ids once; shared by every
    # example in this sentence.
    bowFeatures = {}
    for k,v in bagOfWords.iteritems():
        bowFeatures[self.featureSet.getId(k)] = v
    # Pre-index dependency edges per token for the hanging-edge features.
    self.inEdgesByToken = {}
    self.outEdgesByToken = {}
    self.edgeSetByToken = {}
    for token in sentenceGraph.tokens:
        inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
        self.inEdgesByToken[token] = inEdges
        outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
        self.outEdgesByToken[token] = outEdges
        self.edgeSetByToken[token] = set(inEdges + outEdges)
    # One example per (non-name) entity, anchored at its head token.
    for entity in sentenceGraph.entities:
        #token = sentenceGraph.tokens[i]
        token = sentenceGraph.entityHeadTokenByEntity[entity]
        # Recognize only non-named entities (i.e. interaction words)
        if entity.get("isName") == "True":
            continue
        # CLASS
        # Determine the task-3 label according to the classification style.
        # NOTE(review): if self.styles["classification"] is none of the
        # three handled values, task3Type/categoryName stay unbound and the
        # beginExample call below raises NameError -- confirm the style is
        # always one of multiclass/speculation/negation.
        if self.styles["classification"] == "multiclass":
            task3Type = "multiclass"
            categoryName = ""
            if entity.get("negation") == "True":
                categoryName += "negation"
            if entity.get("speculation") == "True":
                if categoryName != "":
                    categoryName += "---"
                categoryName += "speculation"
            if categoryName == "":
                categoryName = "neg"
            category = self.classSet.getId(categoryName)
        elif self.styles["classification"] == "speculation":
            task3Type = "speculation"
            if entity.get("speculation") == "True":
                category = self.classSet.getId("speculation")
            else:
                category = 1
            # Gold annotation, when available, overrides the predicted label.
            if goldGraph != None:
                if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("speculation") == "True":
                    category = self.classSet.getId("speculation")
                else:
                    category = 1
            categoryName = self.classSet.getName(category)
        elif self.styles["classification"] == "negation":
            task3Type = "negation"
            if entity.get("negation") == "True":
                category = self.classSet.getId("negation")
            else:
                category = 1
            # Gold annotation, when available, overrides the predicted label.
            if goldGraph != None:
                if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("negation") == "True":
                    category = self.classSet.getId("negation")
                else:
                    category = 1
            categoryName = self.classSet.getName(category)
        self.exampleStats.beginExample(categoryName)
        # FEATURES
        features = {}
        # ENTITY TYPE
        #entityType = self.classSet.getId(self.getMergedEntityType(entity))
        #del self.classSet.Ids[self.getMergedEntityType(entity)]
        #IF LOCAL
        # There's a mistake here. The entityType should be the string, not
        # the id of the type. But there's also another issue. getMergedEntityType
        # expects a list, not an item. Therefore the type is always empty ->
        # types don't get used in classification. But this is the code used in
        # the publication, so it will now be published as is, and fixed in a later
        # release.
        #
        # Besides, using the classSet here generates an unneeded
        # additional class, that shows up in evaluations etc. However, to be
        # able to publish the exact models used for the publication experiments,
        # this can't be fixed so it breaks feature id consistency. Therefore I'll
        # now just remove the redundant class id from the classSet.
        #ENDIF
        #features[self.featureSet.getId(entityType)] = 1
        features[self.featureSet.getId(namedEntityCountFeature)] = 1
        features[self.featureSet.getId(entityCountFeature)] = 1
        #for k,v in bagOfWords.iteritems():
        #    features[self.featureSet.getId(k)] = v
        # pre-calculate bow _features_
        features.update(bowFeatures)
        # for j in range(len(sentenceGraph.tokens)):
        #     text = "bow_" + sentenceGraph.tokens[j].get("text")
        #     if j < i:
        #         features[self.featureSet.getId("bf_" + text)] = 1
        #     elif j > i:
        #         features[self.featureSet.getId("af_" + text)] = 1
        # Main features: surface form, POS, stem and the stripped suffix.
        text = token.get("text")
        features[self.featureSet.getId("txt_"+text)] = 1
        features[self.featureSet.getId("POS_"+token.get("POS"))] = 1
        stem = PorterStemmer.stem(text)
        features[self.featureSet.getId("stem_"+stem)] = 1
        features[self.featureSet.getId("nonstem_"+text[len(stem):])] = 1
        # Speculation-lexicon flags for the head token.
        if self.styles["speculation_words"]:
            if text in self.specWords:
                features[self.featureSet.getId("ent_spec")] = 1
            if stem in self.specWordStems:
                features[self.featureSet.getId("ent_spec_stem")] = 1
        # Linear order features
        # Find the head token's index i by linear scan, then add features
        # for tokens within a +/-3 window.
        for i in range(len(sentenceGraph.tokens)):
            if token == sentenceGraph.tokens[i]:
                break
        for index in [-3,-2,-1,1,2,3]:
            if i + index > 0 and i + index < len(sentenceGraph.tokens):
                self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)
        # Content
        # Orthographic features over the head token's characters.
        if i > 0 and text[0].isalpha() and text[0].isupper():
            features[self.featureSet.getId("upper_case_start")] = 1
        for j in range(len(text)):
            if j > 0 and text[j].isalpha() and text[j].isupper():
                features[self.featureSet.getId("upper_case_middle")] = 1
            # numbers and special characters
            if text[j].isdigit():
                features[self.featureSet.getId("has_digits")] = 1
                if j > 0 and text[j-1] == "-":
                    features[self.featureSet.getId("has_hyphenated_digit")] = 1
            elif text[j] == "-":
                features[self.featureSet.getId("has_hyphen")] = 1
            elif text[j] == "/":
                features[self.featureSet.getId("has_fslash")] = 1
            elif text[j] == "\\":
                features[self.featureSet.getId("has_bslash")] = 1
            # duplets
            if j > 0:
                features[self.featureSet.getId("dt_"+text[j-1:j+1].lower())] = 1
            # triplets
            if j > 1:
                features[self.featureSet.getId("tt_"+text[j-2:j+1].lower())] = 1
        # Attached edges (Hanging in and out edges)
        # Dependency edges touching the head token: edge type, neighbor POS
        # and text, and their combinations.
        t1InEdges = self.inEdgesByToken[token]
        for edge in t1InEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HIn_"+edgeType)] = 1
            features[self.featureSet.getId("t1HIn_"+edge[0].get("POS"))] = 1
            features[self.featureSet.getId("t1HIn_"+edgeType+"_"+edge[0].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[0])
            features[self.featureSet.getId("t1HIn_"+tokenText)] = 1
            features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenText)] = 1
        t1OutEdges = self.outEdgesByToken[token]
        for edge in t1OutEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HOut_"+edgeType)] = 1
            features[self.featureSet.getId("t1HOut_"+edge[1].get("POS"))] = 1
            features[self.featureSet.getId("t1HOut_"+edgeType+"_"+edge[1].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[1])
            features[self.featureSet.getId("t1HOut_"+tokenText)] = 1
            features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenText)] = 1
        # Dependency-chain features rooted at the head token.
        self.buildChains(token, sentenceGraph, features)
        extra = {"xtype":"task3","t3type":task3Type,"t":token.get("id"),"entity":entity.get("id")}
        #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
        # Stream the finished example straight to the output file.
        example = (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)
        ExampleUtils.appendExamples([example], outfile)
        exampleIndex += 1
        self.exampleStats.endExample()
    #return examples
    # Returns the number of examples written, not the examples themselves.
    return exampleIndex
res = re.sub(r'[^A-Za-z]', " ", txt1) # Removes numbers and special characters res1 = res.lower() # Lower-case the tokens tokens = res1.split() doclen = len(tokens) doclst.append((index, doclen)) # tokens = word_tokenize(res1) token.extend(tokens) # if (len(tokens) > 0): # print(tokens) # folder = os.listdir(path) f_token = [w for w in tokens if w not in stopwords.words('english')] #---------STEMS---------# ps = PorterStemmer() stem = [] for ft in f_token: stem.append(ps.stem(ft, 0, len(ft) - 1)) stemc = collections.Counter(stem) max_tfstem = stemc.most_common(1) for key, value in max_tfstem: max_tfs = value for t, tf in stemc.items(): dst = [index, tf, max_tfs, doclen] slist = [] if t in ps_stem: ps_stem[t].append(dst) else:
def printStats(self):
    """
    Report which gold events produced no examples and summarize example
    generation counts.

    Writes a per-type listing of missed events to the file "missed-events"
    in the working directory, then prints to stderr: per-type missed-event
    counts split into (other, intersentence, non-gazetteer), and per-type
    built/skipped example totals with skip reasons.
    """
    # Count gold events by type.
    eventsByType = {}
    for event in self.eventsByOrigId.values():
        eventsByType[event.get("type")] = eventsByType.get(
            event.get("type"), 0) + 1
    f = open("missed-events", "wt")
    # Group original ids of events that yielded zero examples by type.
    missedEvents = {}
    for key in self.examplesByEventOrigId.keys():
        if self.examplesByEventOrigId[key] == 0:
            if not missedEvents.has_key(
                    self.eventsByOrigId[key].get("type")):
                missedEvents[self.eventsByOrigId[key].get("type")] = []
            missedEvents[self.eventsByOrigId[key].get("type")].append(key)
    # Write the detailed per-type listing, annotating each missed event
    # with the likely reason it was missed.
    for key in sorted(missedEvents.keys()):
        f.write(key + "\n")
        for id in sorted(missedEvents[key]):
            f.write(" " + id + " ")
            if id in self.interSentenceEvents:
                f.write("intersentence ")
            text = self.headTokensByOrigId[id].get("text").lower()
            if not self.isInGazetteer(text):
                text = self.headTokensByOrigId[id].get("text").lower()
                # NOTE(review): 'stemmed' is only assigned when
                # "stem_gazetteer" is in self.styles, but it is used
                # unconditionally below -- NameError if the style is
                # absent. Confirm the style is always set here.
                if "stem_gazetteer" in self.styles:
                    stemmed = PorterStemmer.stem(text)
                f.write("not-in-gazetteer (" + text + " / " + stemmed + ")")
            f.write("\n")
    f.close()
    # Console summary: per-type missed counts (Python 2 print statement).
    print >> sys.stderr, "Example selection missed events (other, intersentence, non-gazetteer)"
    for key in sorted(eventsByType.keys()):
        inter = 0
        other = 0
        nongaz = 0
        if missedEvents.has_key(key):
            for id in missedEvents[key]:
                tokText = self.headTokensByOrigId[id].get("text").lower()
                if id in self.interSentenceEvents:
                    inter += 1
                elif not self.isInGazetteer(tokText):
                    nongaz += 1
                else:
                    other += 1
        if inter == other == nongaz == 0:
            print >> sys.stderr, " " + key + " (" + str(
                eventsByType[key]) + "): missed none"
        else:
            print >> sys.stderr, " " + key + " (" + str(
                eventsByType[key]) + "): " + str(other) + ", " + str(
                    inter) + ", " + str(nongaz)
    # Console summary: examples built/skipped per type, with skip reasons.
    print >> sys.stderr, "Example generation (total, built/skipped)"
    for key in sorted(
            list(set(self.skippedByType.keys() + self.builtByType.keys()))):
        string = " " + key + ": (" + str(
            self.builtByType.get(key, 0) +
            self.skippedByType.get(key, 0)) + ", " + str(
                self.builtByType.get(key, 0)) + "/" + str(
                    self.skippedByType.get(key, 0)) + ") ["
        for key2 in sorted(self.skippedByTypeAndReason[key].keys()):
            string += key2 + ":" + str(
                self.skippedByTypeAndReason[key][key2]) + " "
        string += "]"
        print >> sys.stderr, string
def buildExamples(self, sentenceGraph):
    """
    Build one binary classification example per token of the sentence.

    Class: +1 when the token is part of a name (sentenceGraph.tokenIsName),
    -1 otherwise. Features: surface form, POS, Porter stem and suffix,
    neighboring tokens within a +/-2 window, orthographic character
    features, and dependency edges attached to the token.

    Returns a list of (exampleId, category, features, extra) tuples.
    """
    examples = []
    exampleIndex = 0
    for i in range(len(sentenceGraph.tokens)):
        token = sentenceGraph.tokens[i]
        # CLASS
        if sentenceGraph.tokenIsName[token]:
            category = 1
        else:
            category = -1
        # FEATURES
        features = {}
        # Main features: surface form, POS, stem and stripped suffix.
        text = token.attrib["text"]
        features[self.featureSet.getId("txt_" + text)] = 1
        features[self.featureSet.getId("POS_" + token.attrib["POS"])] = 1
        stem = PorterStemmer.stem(text)
        features[self.featureSet.getId("stem_" + stem)] = 1
        features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1
        # Linear order features: text and POS of tokens up to two
        # positions before and after the current token.
        if i > 0:
            features[self.featureSet.getId(
                "linear_-1_txt_" +
                sentenceGraph.tokens[i - 1].attrib["text"])] = 1
            features[self.featureSet.getId(
                "linear_-1_POS_" +
                sentenceGraph.tokens[i - 1].attrib["POS"])] = 1
        if i > 1:
            features[self.featureSet.getId(
                "linear_-2_txt_" +
                sentenceGraph.tokens[i - 2].attrib["text"])] = 1
            features[self.featureSet.getId(
                "linear_-2_POS_" +
                sentenceGraph.tokens[i - 2].attrib["POS"])] = 1
        if i < len(sentenceGraph.tokens) - 1:
            features[self.featureSet.getId(
                "linear_+1_txt_" +
                sentenceGraph.tokens[i + 1].attrib["text"])] = 1
            features[self.featureSet.getId(
                "linear_+1_POS_" +
                sentenceGraph.tokens[i + 1].attrib["POS"])] = 1
        if i < len(sentenceGraph.tokens) - 2:
            features[self.featureSet.getId(
                "linear_+2_txt_" +
                sentenceGraph.tokens[i + 2].attrib["text"])] = 1
            features[self.featureSet.getId(
                "linear_+2_POS_" +
                sentenceGraph.tokens[i + 2].attrib["POS"])] = 1
        # Content: orthographic features over the token's characters.
        if i > 0 and text[0].isalpha() and text[0].isupper():
            features[self.featureSet.getId("upper_case_start")] = 1
        for j in range(len(text)):
            if j > 0 and text[j].isalpha() and text[j].isupper():
                features[self.featureSet.getId("upper_case_middle")] = 1
            # numbers and special characters
            if text[j].isdigit():
                features[self.featureSet.getId("has_digits")] = 1
                if j > 0 and text[j - 1] == "-":
                    features[self.featureSet.getId(
                        "has_hyphenated_digit")] = 1
            elif text[j] == "-":
                features[self.featureSet.getId("has_hyphen")] = 1
            elif text[j] == "/":
                features[self.featureSet.getId("has_fslash")] = 1
            elif text[j] == "\\":
                features[self.featureSet.getId("has_bslash")] = 1
            # duplets
            if j > 0:
                features[self.featureSet.getId("dt_" +
                                               text[j - 1:j + 1].lower())] = 1
            # triplets
            if j > 1:
                features[self.featureSet.getId("tt_" +
                                               text[j - 2:j + 1].lower())] = 1
        # Attached edges: type, neighbor POS and neighbor text of each
        # dependency edge touching this token.
        t1InEdges = sentenceGraph.dependencyGraph.in_edges(token)
        for edge in t1InEdges:
            features[self.featureSet.getId("t1HangingIn_" +
                                           edge[2].attrib["type"])] = 1
            features[self.featureSet.getId("t1HangingIn_" +
                                           edge[0].attrib["POS"])] = 1
            features[self.featureSet.getId("t1HangingIn_" +
                                           edge[0].attrib["text"])] = 1
        t1OutEdges = sentenceGraph.dependencyGraph.out_edges(token)
        for edge in t1OutEdges:
            features[self.featureSet.getId("t1HangingOut_" +
                                           edge[2].attrib["type"])] = 1
            features[self.featureSet.getId("t1HangingOut_" +
                                           edge[1].attrib["POS"])] = 1
            features[self.featureSet.getId("t1HangingOut_" +
                                           edge[1].attrib["text"])] = 1
        extra = {"xtype": "token", "t": token}
        examples.append(
            (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex),
             category, features, extra))
        exampleIndex += 1
    return examples
def tokTxt(b, e, sNode, stem=False):
    """
    Return the inclusive character span [b, e] of sNode's "text"
    attribute, Porter-stemmed when *stem* is true.
    """
    span = sNode.get("text")[b:e + 1]
    return PorterStemmer.stem(span) if stem else span
def buildExamples(self, sentenceGraph, exampleIndex = 0): examples = [] #exampleIndex = 0 namedEntityCount = 0 for i in range(len(sentenceGraph.tokens)): token = sentenceGraph.tokens[i] if sentenceGraph.tokenIsName[token]: namedEntityCount += 1 for i in range(len(sentenceGraph.tokens)): token = sentenceGraph.tokens[i] # Recognize only non-named entities (i.e. interaction words) if sentenceGraph.tokenIsName[token]: continue if sentenceGraph.tokenIsEntityHead[token] != None: # CLASS category = 1 else: category = -1 # FEATURES features = {} # Main features textUpper = token.get("text") text = textUpper.lower() features[self.featureSet.getId("txt_"+text)] = 1 features[self.featureSet.getId("POS_"+token.get("POS"))] = 1 stem = PorterStemmer.stem(text) features[self.featureSet.getId("stem_"+stem)] = 1 features[self.featureSet.getId("nonstem_"+text[len(stem):])] = 1 # Dictionary features if text in intWords: features[self.featureSet.getId("dict")] = 1 features[self.featureSet.getId("dict_def_"+wordDict[text])]=1 # Named entity count features[self.featureSet.getId("neCount")] = namedEntityCount # Linear order features self.entityFeatureBuilder.setFeatureVector(features) self.entityFeatureBuilder.buildLinearOrderFeatures(i, sentenceGraph, 3, 3 ) # Content self.entityFeatureBuilder.buildContentFeatures(i, textUpper, duplets=True, triplets=True) self.entityFeatureBuilder.setFeatureVector(None) # Attached edges self.edgeFeatureBuilder.setFeatureVector(features) t1InEdges = sentenceGraph.dependencyGraph.in_edges(token) for edge in t1InEdges: self.edgeFeatureBuilder.buildEdgeFeatures(edge, sentenceGraph, "in_", text=True, POS=True, annType=False, maskNames=True) # l2Edges = sentenceGraph.dependencyGraph.in_edges(edge[0]) # for e2 in l2Edges: # self.featureBuilder.buildEdgeFeatures(edge, sentenceGraph, "in2_", text=True, POS=True, annType=False, maskNames=True) # l2Edges = sentenceGraph.dependencyGraph.out_edges(edge[0]) # for e2 in l2Edges: # 
self.featureBuilder.buildEdgeFeatures(edge, sentenceGraph, "in2_", text=True, POS=True, annType=False, maskNames=True) #self.featureBuilder.buildAttachedEdgeFeatures(edge, sentenceGraph, "in_att_", text=True, POS=True, annType=False, maskNames=True) #self.featureBuilder.buildLinearOrderFeatures(edge) t1OutEdges = sentenceGraph.dependencyGraph.out_edges(token) for edge in t1OutEdges: self.edgeFeatureBuilder.buildEdgeFeatures(edge, sentenceGraph, "out_", text=True, POS=True, annType=False, maskNames=True) # l2Edges = sentenceGraph.dependencyGraph.in_edges(edge[1]) # for e2 in l2Edges: # self.featureBuilder.buildEdgeFeatures(edge, sentenceGraph, "out2_", text=True, POS=True, annType=False, maskNames=True) # l2Edges = sentenceGraph.dependencyGraph.out_edges(edge[1]) # for e2 in l2Edges: # self.featureBuilder.buildEdgeFeatures(edge, sentenceGraph, "out2_", text=True, POS=True, annType=False, maskNames=True) #self.featureBuilder.buildAttachedEdgeFeatures(edge, sentenceGraph, "out_att_", text=True, POS=True, annType=False, maskNames=True) #self.featureBuilder.buildLinearOrderFeatures(edge) self.edgeFeatureBuilder.setFeatureVector(None) extra = {"xtype":"token","t":token} examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) ) exampleIndex += 1 return examples
def tokTxt(b, e, sNode, stem=False):
    """
    Extract the inclusive slice [b, e] from the node's "text" attribute.

    When *stem* is true, the extracted segment is Porter-stemmed before
    being returned.
    """
    segment = sNode.get("text")[b : e + 1]
    if not stem:
        return segment
    return PorterStemmer.stem(segment)
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None): """ Build one example for each token of the sentence """ if sentenceGraph.sentenceElement.get("origId") in self.skiplist: print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get( "origId") return 0 #[] #examples = [] exampleIndex = 0 self.tokenFeatures = {} self.tokenFeatureWeights = {} namedEntityHeadTokens = [] if not self.styles["names"]: namedEntityCount = 0 for entity in sentenceGraph.entities: if entity.get( "isName" ) == "True": # known data which can be used for features namedEntityCount += 1 namedEntityCountFeature = "nameCount_" + str(namedEntityCount) # NOTE!!! This will change the number of examples and omit # all triggers (positive and negative) from sentences which # have no NE:s, possibly giving a too-optimistic performance # value. Such sentences can still have triggers from intersentence # interactions, but as such events cannot be recovered anyway, # looking for these triggers would be pointless. 
if namedEntityCount == 0 and not self.styles[ "build_for_nameless"]: # no names, no need for triggers return 0 #[] if self.styles["pos_pairs"]: namedEntityHeadTokens = self.getNamedEntityHeadTokens( sentenceGraph) bagOfWords = {} for token in sentenceGraph.tokens: text = "bow_" + token.get("text") if not bagOfWords.has_key(text): bagOfWords[text] = 0 bagOfWords[text] += 1 if sentenceGraph.tokenIsName[token]: text = "ne_" + text if not bagOfWords.has_key(text): bagOfWords[text] = 0 bagOfWords[text] += 1 bowFeatures = {} for k in sorted(bagOfWords.keys()): bowFeatures[self.featureSet.getId(k)] = bagOfWords[k] self.inEdgesByToken = {} self.outEdgesByToken = {} self.edgeSetByToken = {} for token in sentenceGraph.tokens: #inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True) #fixedInEdges = [] #for edge in inEdges: # fixedInEdges.append( (edge[0], edge[1], edge[2]["element"]) ) #inEdges = fixedInEdges inEdges = sentenceGraph.dependencyGraph.getInEdges(token) #inEdges.sort(compareDependencyEdgesById) self.inEdgesByToken[token] = inEdges #outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True) #fixedOutEdges = [] #for edge in outEdges: # fixedOutEdges.append( (edge[0], edge[1], edge[2]["element"]) ) #outEdges = fixedOutEdges outEdges = sentenceGraph.dependencyGraph.getOutEdges(token) #outEdges.sort(compareDependencyEdgesById) self.outEdgesByToken[token] = outEdges self.edgeSetByToken[token] = set(inEdges + outEdges) for i in range(len(sentenceGraph.tokens)): token = sentenceGraph.tokens[i] # CLASS if len(sentenceGraph.tokenIsEntityHead[token]) > 0: categoryName, entityIds = self.getMergedEntityType( sentenceGraph.tokenIsEntityHead[token]) else: categoryName, entityIds = "neg", None self.exampleStats.beginExample(categoryName) # Recognize only non-named entities (i.e. 
interaction words) if sentenceGraph.tokenIsName[token] and not self.styles[ "names"] and not self.styles["all_tokens"]: self.exampleStats.filter("name") self.exampleStats.endExample() continue # if "selftrain_limits" in self.styles: # # any predicted entity not part of the self-training set causes example to be rejected # filtered = False # for entity in sentenceGraph.tokenIsEntityHead[token]: # if entity.get("selftrain") == "False": # self.exampleStats.filter("selftrain_limits") # self.exampleStats.endExample() # filtered = True # break # if filtered: # continue # if "selftrain_group" in self.styles: # # any predicted entity not part of the self-training set causes example to be rejected # filtered = False # for entity in sentenceGraph.tokenIsEntityHead[token]: # if entity.get("selftraingroup") not in self.selfTrainGroups: # self.exampleStats.filter("selftrain_group") # self.exampleStats.endExample() # filtered = True # break # if filtered: # continue if self.styles["pos_only"] and categoryName == "neg": self.exampleStats.filter("pos_only") self.exampleStats.endExample() continue category = self.classSet.getId(categoryName) tokenText = token.get("text").lower() # if "stem_gazetteer" in self.styles: # tokenText = PorterStemmer.stem(tokenText) # if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer: # features = {} # features[self.featureSet.getId("exclude_gazetteer")] = 1 # extra = {"xtype":"token","t":token.get("id"),"excluded":"True"} # if entityIds != None: # extra["goldIds"] = entityIds # #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) ) # ExampleUtils.appendExamples([(sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)], outfile) # exampleIndex += 1 # continue # FEATURES features = {} if not self.styles["names"]: features[self.featureSet.getId(namedEntityCountFeature)] = 1 #for k,v in bagOfWords.iteritems(): # features[self.featureSet.getId(k)] = 
v # pre-calculate bow _features_ features.update(bowFeatures) # for j in range(len(sentenceGraph.tokens)): # text = "bow_" + sentenceGraph.tokens[j].get("text") # if j < i: # features[self.featureSet.getId("bf_" + text)] = 1 # elif j > i: # features[self.featureSet.getId("af_" + text)] = 1 # Main features text = token.get("text") features[self.featureSet.getId("txt_" + text)] = 1 features[self.featureSet.getId("POS_" + token.get("POS"))] = 1 stem = PorterStemmer.stem(text) features[self.featureSet.getId("stem_" + stem)] = 1 features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1 # Normalized versions of the string (if same as non-normalized, overlap without effect) normalizedText = text.replace("-", "").replace("/", "").replace( ",", "").replace("\\", "").replace(" ", "").lower() if normalizedText == "bound": # should be for all irregular verbs normalizedText = "bind" features[self.featureSet.getId("txt_" + normalizedText)] = 1 norStem = PorterStemmer.stem(normalizedText) features[self.featureSet.getId("stem_" + norStem)] = 1 features[self.featureSet.getId("nonstem_" + normalizedText[len(norStem):])] = 1 ## Subspan features #textLower = text.lower() #for i in range(1, len(textLower)): # features[self.featureSet.getId("subspanbegin"+str(i)+"_"+textLower[0:i])] = 1 # features[self.featureSet.getId("subspanend"+str(i)+"_"+textLower[-i:])] = 1 # Substring features for string in text.split("-"): stringLower = string.lower() features[self.featureSet.getId("substring_" + stringLower)] = 1 features[self.featureSet.getId( "substringstem_" + PorterStemmer.stem(stringLower))] = 1 # Linear order features for index in [-3, -2, -1, 1, 2, 3]: if i + index > 0 and i + index < len(sentenceGraph.tokens): self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features) # Linear n-grams if self.styles["linear_ngrams"]: self.buildLinearNGram(max(0, i - 1), i, sentenceGraph, features) self.buildLinearNGram(max(0, i - 2), i, sentenceGraph, features) if 
self.styles["phospho"]: if text.find("hospho") != -1: features[self.featureSet.getId("phospho_found")] = 1 features[self.featureSet.getId("begin_" + text[0:2].lower())] = 1 features[self.featureSet.getId("begin_" + text[0:3].lower())] = 1 if self.styles["bb_features"]: if text.lower() in self.bacteriaTokens: features[self.featureSet.getId("lpsnBacToken")] = 1 # Content if i > 0 and text[0].isalpha() and text[0].isupper(): features[self.featureSet.getId("upper_case_start")] = 1 for j in range(len(text)): if j > 0 and text[j].isalpha() and text[j].isupper(): features[self.featureSet.getId("upper_case_middle")] = 1 # numbers and special characters if text[j].isdigit(): features[self.featureSet.getId("has_digits")] = 1 if j > 0 and text[j - 1] == "-": features[self.featureSet.getId( "has_hyphenated_digit")] = 1 elif text[j] == "-": features[self.featureSet.getId("has_hyphen")] = 1 elif text[j] == "/": features[self.featureSet.getId("has_fslash")] = 1 elif text[j] == "\\": features[self.featureSet.getId("has_bslash")] = 1 # duplets if j > 0: features[self.featureSet.getId("dt_" + text[j - 1:j + 1].lower())] = 1 # triplets if j > 1: features[self.featureSet.getId("tt_" + text[j - 2:j + 1].lower())] = 1 # quadruplets (don't work, slight decrease (0.5 pp) on f-score #if j > 2: # features[self.featureSet.getId("qt_"+text[j-3:j+1].lower())] = 1 # Attached edges (Hanging in and out edges) t1InEdges = self.inEdgesByToken[token] for edge in t1InEdges: edgeType = edge[2].get("type") features[self.featureSet.getId("t1HIn_" + edgeType)] = 1 features[self.featureSet.getId("t1HIn_" + edge[0].get("POS"))] = 1 features[self.featureSet.getId("t1HIn_" + edgeType + "_" + edge[0].get("POS"))] = 1 tokenText = sentenceGraph.getTokenText(edge[0]) features[self.featureSet.getId("t1HIn_" + tokenText)] = 1 features[self.featureSet.getId("t1HIn_" + edgeType + "_" + tokenText)] = 1 tokenStem = PorterStemmer.stem(tokenText) features[self.featureSet.getId("t1HIn_" + tokenStem)] = 1 
features[self.featureSet.getId("t1HIn_" + edgeType + "_" + tokenStem)] = 1 features[self.featureSet.getId("t1HIn_" + norStem + "_" + edgeType + "_" + tokenStem)] = 1 t1OutEdges = self.outEdgesByToken[token] for edge in t1OutEdges: edgeType = edge[2].get("type") features[self.featureSet.getId("t1HOut_" + edgeType)] = 1 features[self.featureSet.getId("t1HOut_" + edge[1].get("POS"))] = 1 features[self.featureSet.getId("t1HOut_" + edgeType + "_" + edge[1].get("POS"))] = 1 tokenText = sentenceGraph.getTokenText(edge[1]) features[self.featureSet.getId("t1HOut_" + tokenText)] = 1 features[self.featureSet.getId("t1HOut_" + edgeType + "_" + tokenText)] = 1 tokenStem = PorterStemmer.stem(tokenText) features[self.featureSet.getId("t1HOut_" + tokenStem)] = 1 features[self.featureSet.getId("t1HOut_" + edgeType + "_" + tokenStem)] = 1 features[self.featureSet.getId("t1HOut_" + norStem + "_" + edgeType + "_" + tokenStem)] = 1 # REL features if self.styles["rel_features"]: self.relFeatureBuilder.setFeatureVector(features) self.relFeatureBuilder.buildAllFeatures( sentenceGraph.tokens, i) self.relFeatureBuilder.setFeatureVector(None) #self.wordNetFeatureBuilder.getTokenFeatures("show", "VBP") #tokTxt = token.get("text") #tokPOS = token.get("POS") #wordNetFeatures = [] #wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS) #self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS) if self.styles["wordnet"]: tokTxt = token.get("text") tokPOS = token.get("POS") wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures( tokTxt, tokPOS) for wordNetFeature in wordNetFeatures: #print wordNetFeature, features[self.featureSet.getId("WN_" + wordNetFeature)] = 1 #print if self.styles["giuliano"]: self.giulianoFeatureBuilder.setFeatureVector(features) self.giulianoFeatureBuilder.buildTriggerFeatures( token, sentenceGraph) self.giulianoFeatureBuilder.setFeatureVector(None) extra = {"xtype": "token", "t": token.get("id")} if self.styles["bb_features"]: extra[ 
"trigex"] = "bb" # Request trigger extension in ExampleWriter if self.styles["epi_merge_negated"]: extra["unmergeneg"] = "epi" # Request trigger type unmerging if entityIds != None: extra[ "goldIds"] = entityIds # The entities to which this example corresponds #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) ) # chains self.buildChains(token, sentenceGraph, features) if self.styles["pos_pairs"]: self.buildPOSPairs(token, namedEntityHeadTokens, features) example = (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra) ExampleUtils.appendExamples([example], outfile) exampleIndex += 1 self.exampleStats.endExample() #return examples return exampleIndex
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
    """
    Build one trigger-detection example for each token of the sentence.

    Examples are streamed to *outfile* via ExampleUtils.appendExamples as
    (exampleId, category, featureDict, extraDict) tuples rather than being
    collected in a list (the commented-out ``examples`` code is the old
    collecting variant).

    Parameters:
        sentenceGraph: the parsed sentence (tokens, entities, dependency
            graph and token->entity mappings).
        outfile: open file handle that appendExamples writes to.
        goldGraph: unused here; kept for interface compatibility with the
            other builders in this file.

    Returns:
        The number of examples written (int); 0 when the sentence is
        skipped.
    """
    # Skiplist short-circuit: some sentences are excluded by origId.
    if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
        print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get("origId")
        return 0 #[]

    #examples = []
    exampleIndex = 0

    # Per-sentence caches (also used by helper feature builders).
    self.tokenFeatures = {}
    self.tokenFeatureWeights = {}

    # If known named entities are available, count them; the count becomes
    # a sentence-level feature shared by every example of this sentence.
    namedEntityHeadTokens = []
    if not self.styles["names"]:
        namedEntityCount = 0
        for entity in sentenceGraph.entities:
            if entity.get("isName") == "True": # known data which can be used for features
                namedEntityCount += 1
        namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
        # NOTE!!! This will change the number of examples and omit
        # all triggers (positive and negative) from sentences which
        # have no NE:s, possibly giving a too-optimistic performance
        # value. Such sentences can still have triggers from intersentence
        # interactions, but as such events cannot be recovered anyway,
        # looking for these triggers would be pointless.
        if namedEntityCount == 0 and not self.styles["build_for_nameless"]: # no names, no need for triggers
            return 0 #[]

        if self.styles["pos_pairs"]:
            namedEntityHeadTokens = self.getNamedEntityHeadTokens(sentenceGraph)

    # Bag-of-words counts over the whole sentence; named-entity tokens are
    # counted a second time under an "ne_" prefix.
    bagOfWords = {}
    for token in sentenceGraph.tokens:
        text = "bow_" + token.get("text")
        if not bagOfWords.has_key(text):
            bagOfWords[text] = 0
        bagOfWords[text] += 1
        if sentenceGraph.tokenIsName[token]:
            text = "ne_" + text
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
    bowFeatures = {}
    # Sorted iteration keeps featureSet id assignment deterministic.
    for k in sorted(bagOfWords.keys()):
        bowFeatures[self.featureSet.getId(k)] = bagOfWords[k]

    # Pre-compute the dependency edges attached to each token.
    self.inEdgesByToken = {}
    self.outEdgesByToken = {}
    self.edgeSetByToken = {}
    for token in sentenceGraph.tokens:
        #inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
        #fixedInEdges = []
        #for edge in inEdges:
        #    fixedInEdges.append( (edge[0], edge[1], edge[2]["element"]) )
        #inEdges = fixedInEdges
        inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
        #inEdges.sort(compareDependencyEdgesById)
        self.inEdgesByToken[token] = inEdges
        #outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True)
        #fixedOutEdges = []
        #for edge in outEdges:
        #    fixedOutEdges.append( (edge[0], edge[1], edge[2]["element"]) )
        #outEdges = fixedOutEdges
        outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
        #outEdges.sort(compareDependencyEdgesById)
        self.outEdgesByToken[token] = outEdges
        self.edgeSetByToken[token] = set(inEdges + outEdges)

    for i in range(len(sentenceGraph.tokens)):
        token = sentenceGraph.tokens[i]

        # CLASS: tokens heading one or more entities get the merged entity
        # type as their category, everything else is "neg".
        if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
            categoryName, entityIds = self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token])
        else:
            categoryName, entityIds = "neg", None
        self.exampleStats.beginExample(categoryName)

        # Recognize only non-named entities (i.e. interaction words)
        if sentenceGraph.tokenIsName[token] and not self.styles["names"] and not self.styles["all_tokens"]:
            self.exampleStats.filter("name")
            self.exampleStats.endExample()
            continue
#            if "selftrain_limits" in self.styles:
#                # any predicted entity not part of the self-training set causes example to be rejected
#                filtered = False
#                for entity in sentenceGraph.tokenIsEntityHead[token]:
#                    if entity.get("selftrain") == "False":
#                        self.exampleStats.filter("selftrain_limits")
#                        self.exampleStats.endExample()
#                        filtered = True
#                        break
#                if filtered:
#                    continue
#            if "selftrain_group" in self.styles:
#                # any predicted entity not part of the self-training set causes example to be rejected
#                filtered = False
#                for entity in sentenceGraph.tokenIsEntityHead[token]:
#                    if entity.get("selftraingroup") not in self.selfTrainGroups:
#                        self.exampleStats.filter("selftrain_group")
#                        self.exampleStats.endExample()
#                        filtered = True
#                        break
#                if filtered:
#                    continue
        if self.styles["pos_only"] and categoryName == "neg":
            self.exampleStats.filter("pos_only")
            self.exampleStats.endExample()
            continue

        category = self.classSet.getId(categoryName)

        # NOTE(review): tokenText is only consumed by the commented-out
        # gazetteer block below, and the name is later reused inside the
        # edge-feature loops.
        tokenText = token.get("text").lower()
#        if "stem_gazetteer" in self.styles:
#            tokenText = PorterStemmer.stem(tokenText)
#        if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer:
#            features = {}
#            features[self.featureSet.getId("exclude_gazetteer")] = 1
#            extra = {"xtype":"token","t":token.get("id"),"excluded":"True"}
#            if entityIds != None:
#                extra["goldIds"] = entityIds
#            #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
#            ExampleUtils.appendExamples([(sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)], outfile)
#            exampleIndex += 1
#            continue

        # FEATURES
        features = {}

        if not self.styles["names"]:
            features[self.featureSet.getId(namedEntityCountFeature)] = 1
        #for k,v in bagOfWords.iteritems():
        #    features[self.featureSet.getId(k)] = v
        # pre-calculate bow _features_
        features.update(bowFeatures)

#        for j in range(len(sentenceGraph.tokens)):
#            text = "bow_" + sentenceGraph.tokens[j].get("text")
#            if j < i:
#                features[self.featureSet.getId("bf_" + text)] = 1
#            elif j > i:
#                features[self.featureSet.getId("af_" + text)] = 1

        # Main features: surface form, POS, Porter stem and the suffix the
        # stemmer removed.
        text = token.get("text")
        features[self.featureSet.getId("txt_"+text)] = 1
        features[self.featureSet.getId("POS_"+token.get("POS"))] = 1
        stem = PorterStemmer.stem(text)
        features[self.featureSet.getId("stem_"+stem)] = 1
        features[self.featureSet.getId("nonstem_"+text[len(stem):])] = 1

        # Normalized versions of the string (if same as non-normalized, overlap without effect)
        normalizedText = text.replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower()
        if normalizedText == "bound": # should be for all irregular verbs
            normalizedText = "bind"
        features[self.featureSet.getId("txt_"+normalizedText)] = 1
        norStem = PorterStemmer.stem(normalizedText)
        features[self.featureSet.getId("stem_"+norStem)] = 1
        features[self.featureSet.getId("nonstem_"+normalizedText[len(norStem):])] = 1

        ## Subspan features
        #textLower = text.lower()
        #for i in range(1, len(textLower)):
        #    features[self.featureSet.getId("subspanbegin"+str(i)+"_"+textLower[0:i])] = 1
        #    features[self.featureSet.getId("subspanend"+str(i)+"_"+textLower[-i:])] = 1

        # Substring features: one feature per hyphen-separated part.
        for string in text.split("-"):
            stringLower = string.lower()
            features[self.featureSet.getId("substring_"+stringLower)] = 1
            features[self.featureSet.getId("substringstem_"+PorterStemmer.stem(stringLower))] = 1

        # Linear order features for a +/-3 token window.
        # NOTE(review): the "i + index > 0" bound excludes position 0 for
        # negative offsets — confirm whether ">= 0" was intended.
        for index in [-3,-2,-1,1,2,3]:
            if i + index > 0 and i + index < len(sentenceGraph.tokens):
                self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)

        # Linear n-grams
        if self.styles["linear_ngrams"]:
            self.buildLinearNGram(max(0, i-1), i, sentenceGraph, features)
            self.buildLinearNGram(max(0, i-2), i, sentenceGraph, features)

        if self.styles["phospho"]:
            # "hospho" matches both "phospho..." and "Phospho...".
            if text.find("hospho") != -1:
                features[self.featureSet.getId("phospho_found")] = 1
            features[self.featureSet.getId("begin_"+text[0:2].lower())] = 1
            features[self.featureSet.getId("begin_"+text[0:3].lower())] = 1

        if self.styles["bb_features"]:
            if text.lower() in self.bacteriaTokens:
                features[self.featureSet.getId("lpsnBacToken")] = 1

        # Content: orthographic features (capitalization, digits, special
        # characters, character bi-/trigrams).
        if i > 0 and text[0].isalpha() and text[0].isupper():
            features[self.featureSet.getId("upper_case_start")] = 1
        for j in range(len(text)):
            if j > 0 and text[j].isalpha() and text[j].isupper():
                features[self.featureSet.getId("upper_case_middle")] = 1
            # numbers and special characters
            if text[j].isdigit():
                features[self.featureSet.getId("has_digits")] = 1
                if j > 0 and text[j-1] == "-":
                    features[self.featureSet.getId("has_hyphenated_digit")] = 1
            elif text[j] == "-":
                features[self.featureSet.getId("has_hyphen")] = 1
            elif text[j] == "/":
                features[self.featureSet.getId("has_fslash")] = 1
            elif text[j] == "\\":
                features[self.featureSet.getId("has_bslash")] = 1
            # duplets
            if j > 0:
                features[self.featureSet.getId("dt_"+text[j-1:j+1].lower())] = 1
            # triplets
            if j > 1:
                features[self.featureSet.getId("tt_"+text[j-2:j+1].lower())] = 1
            # quadruplets (don't work, slight decrease (0.5 pp) on f-score
            #if j > 2:
            #    features[self.featureSet.getId("qt_"+text[j-3:j+1].lower())] = 1

        # Attached edges (Hanging in and out edges): dependency type, POS,
        # text and stem of the token at the far end of each edge.
        t1InEdges = self.inEdgesByToken[token]
        for edge in t1InEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HIn_"+edgeType)] = 1
            features[self.featureSet.getId("t1HIn_"+edge[0].get("POS"))] = 1
            features[self.featureSet.getId("t1HIn_"+edgeType+"_"+edge[0].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[0])
            features[self.featureSet.getId("t1HIn_"+tokenText)] = 1
            features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenText)] = 1
            tokenStem = PorterStemmer.stem(tokenText)
            features[self.featureSet.getId("t1HIn_"+tokenStem)] = 1
            features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenStem)] = 1
            features[self.featureSet.getId("t1HIn_"+norStem+"_"+edgeType+"_"+tokenStem)] = 1
        t1OutEdges = self.outEdgesByToken[token]
        for edge in t1OutEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HOut_"+edgeType)] = 1
            features[self.featureSet.getId("t1HOut_"+edge[1].get("POS"))] = 1
            features[self.featureSet.getId("t1HOut_"+edgeType+"_"+edge[1].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[1])
            features[self.featureSet.getId("t1HOut_"+tokenText)] = 1
            features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenText)] = 1
            tokenStem = PorterStemmer.stem(tokenText)
            features[self.featureSet.getId("t1HOut_"+tokenStem)] = 1
            features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenStem)] = 1
            features[self.featureSet.getId("t1HOut_"+norStem+"_"+edgeType+"_"+tokenStem)] = 1

        # REL features
        if self.styles["rel_features"]:
            self.relFeatureBuilder.setFeatureVector(features)
            self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, i)
            self.relFeatureBuilder.setFeatureVector(None)

        #self.wordNetFeatureBuilder.getTokenFeatures("show", "VBP")
        #tokTxt = token.get("text")
        #tokPOS = token.get("POS")
        #wordNetFeatures = []
        #wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
        #self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
        if self.styles["wordnet"]:
            tokTxt = token.get("text")
            tokPOS = token.get("POS")
            wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            for wordNetFeature in wordNetFeatures:
                #print wordNetFeature,
                features[self.featureSet.getId("WN_"+wordNetFeature)] = 1
            #print

        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder.setFeatureVector(features)
            self.giulianoFeatureBuilder.buildTriggerFeatures(token, sentenceGraph)
            self.giulianoFeatureBuilder.setFeatureVector(None)

        # Extra example attributes consumed downstream (e.g. ExampleWriter).
        extra = {"xtype":"token","t":token.get("id")}
        if self.styles["bb_features"]:
            extra["trigex"] = "bb" # Request trigger extension in ExampleWriter
        if self.styles["epi_merge_negated"]:
            extra["unmergeneg"] = "epi" # Request trigger type unmerging
        if entityIds != None:
            extra["goldIds"] = entityIds # The entities to which this example corresponds
        #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )

        # chains
        self.buildChains(token, sentenceGraph, features)

        if self.styles["pos_pairs"]:
            self.buildPOSPairs(token, namedEntityHeadTokens, features)

        example = (sentenceGraph.getSentenceId()+".x"+str(exampleIndex), category, features, extra)
        ExampleUtils.appendExamples([example], outfile)
        exampleIndex += 1

        self.exampleStats.endExample()
    #return examples
    return exampleIndex
def stemTokens(self):
    """Attach a Porter stem to every token of this document.

    Stores the stem of each token's text in the token's ``stem``
    attribute so later processing can reuse the cached value instead of
    re-running the stemmer. Uses the module-level ``stemmer`` instance.
    """
    for tokenId in self.tokensById:
        tok = self.tokensById[tokenId]
        tok.stem = stemmer.stem(tok.text)
def buildExamplesInner(self, sentenceGraph, goldGraph):
    """
    Build one example for each token of the sentence.

    Unlike buildExamplesFromGraph, this variant (a) classifies tokens
    against gold-standard entities mapped by head offset, (b) collects
    examples in a list instead of streaming them to a file, and (c)
    additionally builds prediction features from the interactions whose
    e1 entity is headed by the token.

    Parameters:
        sentenceGraph: the (possibly predicted) parsed sentence.
        goldGraph: gold-standard graph aligned token-by-token with
            sentenceGraph, or None.

    Returns:
        A list of (exampleId, category, featureDict, extraDict) tuples;
        empty list when the sentence is in the skiplist.
    """
    if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
        print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get("origId")
        return []

    # Reset the shared feature builders for this sentence.
    self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
    self.triggerFeatureBuilder.initSentence(sentenceGraph)

    # Shortest paths over the undirected dependency graph (cutoff is
    # effectively unbounded for sentence-sized graphs).
    undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
    paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)

    # Get argument order
    self.interactionLengths = self.getInteractionEdgeLengths(sentenceGraph, paths)
    self.interactionLengths = self.interactionLengths.values()
    self.interactionLengths.sort(compareInteractionPrecedence)

    # Map tokens to entities
    tokenByOffset = {}
    for i in range(len(sentenceGraph.tokens)):
        token = sentenceGraph.tokens[i]
        if goldGraph != None:
            # The gold graph must align with the predicted one token-by-token.
            goldToken = goldGraph.tokens[i]
            assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
        tokenByOffset[token.get("charOffset")] = token.get("id")

    # Map gold entities to their head offsets
    goldEntitiesByOffset = {}
    for token in sentenceGraph.tokens:
        goldEntitiesByOffset[token.get("charOffset")] = []
    entityToGold = {}
    for entity in sentenceGraph.entities:
        entityToGold[entity] = []
    if goldGraph != None:
        for entity in goldGraph.entities:
            offset = entity.get("headOffset")
            assert offset != None
            goldEntitiesByOffset[offset].append(entity)
    # Map predicted entities to gold entities
    for entity in sentenceGraph.entities:
        eType = entity.get("type")
        eOffset = entity.get("headOffset")
        for goldEntity in goldEntitiesByOffset[eOffset]:
            if goldEntity.get("type") == eType:
                entityToGold[entity].append(goldEntity)

    # Map entities to interactions
    #interactionsByEntityId = {}
    #for entity in sentenceGraph.entities:
    #    interactionsByEntityId[entity.get("id")] = []
    # Map tokens to interactions: each non-"neg" interaction is attached
    # to the head token of its e1 entity, in precedence order.
    interactionsByToken = {}
    for token in sentenceGraph.tokens:
        interactionsByToken[token] = []
    for interactionTuple in self.interactionLengths:
        interaction = interactionTuple[0]
        if interaction.get("type") == "neg":
            continue
        e1Id = interaction.get("e1")
        token = sentenceGraph.entityHeadTokenByEntity[sentenceGraph.entitiesById[e1Id]]
        interactionsByToken[token].append(interaction)

    examples = []
    exampleIndex = 0

    self.tokenFeatures = {}

    #namedEntityNorStrings = set()
    namedEntityHeadTokens = []
    if not "names" in self.styles:
        namedEntityCount = 0
        for entity in sentenceGraph.entities:
            if entity.get("isName") == "True": # known data which can be used for features
                namedEntityCount += 1
                #namedEntityNorStrings.add( entity.get("text").replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower() )
        namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
        #if namedEntityCount == 0: # no names, no need for triggers
        #    return []

        if "pos_pairs" in self.styles:
            namedEntityHeadTokens = self.getNamedEntityHeadTokens(sentenceGraph)

    #neFeatures = {} # F: 69.35 -> 69.14
    #for norString in namedEntityNorStrings:
    #    neFeatures[self.featureSet.getId("norNE_" + norString)] = 1

    # Bag-of-words counts; named-entity tokens are counted a second time
    # under an "ne_" prefix.
    bagOfWords = {}
    for token in sentenceGraph.tokens:
        text = "bow_" + token.get("text")
        if not bagOfWords.has_key(text):
            bagOfWords[text] = 0
        bagOfWords[text] += 1
        if sentenceGraph.tokenIsName[token]:
            text = "ne_" + text
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
    bowFeatures = {}
    for k, v in bagOfWords.iteritems():
        bowFeatures[self.featureSet.getId(k)] = v

    # Pre-compute the dependency edges attached to each token, converted
    # to (src, dst, element) triples and sorted for determinism.
    self.inEdgesByToken = {}
    self.outEdgesByToken = {}
    self.edgeSetByToken = {}
    for token in sentenceGraph.tokens:
        inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
        fixedInEdges = []
        for edge in inEdges:
            fixedInEdges.append((edge[0], edge[1], edge[2]["element"]))
        inEdges = fixedInEdges
        inEdges.sort(compareDependencyEdgesById)
        self.inEdgesByToken[token] = inEdges
        outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True)
        fixedOutEdges = []
        for edge in outEdges:
            fixedOutEdges.append((edge[0], edge[1], edge[2]["element"]))
        outEdges = fixedOutEdges
        outEdges.sort(compareDependencyEdgesById)
        self.outEdgesByToken[token] = outEdges
        self.edgeSetByToken[token] = set(inEdges + outEdges)

    for i in range(len(sentenceGraph.tokens)):
        token = sentenceGraph.tokens[i]

        # Recognize only non-named entities (i.e. interaction words)
        if sentenceGraph.tokenIsName[token] and not "names" in self.styles and not "all_tokens" in self.styles:
            continue

        # CLASS: category comes from gold entities at the token's offset;
        # 1 is the negative/background class id here.
        #if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
        #    category = self.classSet.getId(self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token]))
        #else:
        #    category = 1
        offset = token.get("charOffset")
        if len(goldEntitiesByOffset[offset]) > 0:
            category = self.classSet.getId(self.getMergedEntityType(goldEntitiesByOffset[offset]))
        else:
            category = 1

        tokenText = token.get("text").lower()
        if "stem_gazetteer" in self.styles:
            tokenText = PorterStemmer.stem(tokenText)
        # Gazetteer-filtered tokens still produce a (stub) example so that
        # example indices and counts stay consistent.
        if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer:
            features = {}
            features[self.featureSet.getId("exclude_gazetteer")] = 1
            extra = {"xtype":"token","t":token.get("id"),"excluded":"True"}
            examples.append((sentenceGraph.getSentenceId()+".x"+str(exampleIndex), category, features, extra))
            exampleIndex += 1
            continue

        # FEATURES
        features = {}
        self.features = features  # shared with helper builders below

        if not "names" in self.styles:
            features[self.featureSet.getId(namedEntityCountFeature)] = 1
        #for k,v in bagOfWords.iteritems():
        #    features[self.featureSet.getId(k)] = v
        # pre-calculate bow _features_
        features.update(bowFeatures)
        #features.update(neFeatures)

#        for j in range(len(sentenceGraph.tokens)):
#            text = "bow_" + sentenceGraph.tokens[j].get("text")
#            if j < i:
#                features[self.featureSet.getId("bf_" + text)] = 1
#            elif j > i:
#                features[self.featureSet.getId("af_" + text)] = 1

        # Main features: surface form, POS, Porter stem and removed suffix.
        text = token.get("text")
        features[self.featureSet.getId("txt_" + text)] = 1
        features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
        stem = PorterStemmer.stem(text)
        features[self.featureSet.getId("stem_" + stem)] = 1
        features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1

        # Normalized versions of the string (if same as non-normalized, overlap without effect)
        normalizedText = text.replace("-", "").replace("/", "").replace(",", "").replace("\\", "").replace(" ", "").lower()
        if normalizedText == "bound": # should be for all irregular verbs
            normalizedText = "bind"
        features[self.featureSet.getId("txt_" + normalizedText)] = 1
        norStem = PorterStemmer.stem(normalizedText)
        features[self.featureSet.getId("stem_" + norStem)] = 1
        features[self.featureSet.getId("nonstem_" + normalizedText[len(norStem):])] = 1

        if "gazetteer_features_maintoken" in self.styles:
            tokTxtLower = text.lower()
            if "stem_gazetteer" in self.styles:
                tokTxtLower = PorterStemmer.stem(tokTxtLower)
            if self.gazetteer and tokTxtLower in self.gazetteer:
                for label, weight in self.gazetteer[tokTxtLower].items():
                    features[self.featureSet.getId("gaz_knownLabel_" + label)] = weight # 1 performs slightly worse

        # Linear order features
        #for index in [-3,-2,-1,1,2,3,4,5]: # 69.35 -> 68.97
        for index in [-3, -2, -1, 1, 2, 3]:
            if i + index > 0 and i + index < len(sentenceGraph.tokens):
                self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)

        # Content: orthographic features (capitalization, digits, special
        # characters, character bi-/trigrams).
        if i > 0 and text[0].isalpha() and text[0].isupper():
            features[self.featureSet.getId("upper_case_start")] = 1
        for j in range(len(text)):
            if j > 0 and text[j].isalpha() and text[j].isupper():
                features[self.featureSet.getId("upper_case_middle")] = 1
            # numbers and special characters
            if text[j].isdigit():
                features[self.featureSet.getId("has_digits")] = 1
                if j > 0 and text[j - 1] == "-":
                    features[self.featureSet.getId("has_hyphenated_digit")] = 1
            elif text[j] == "-":
                features[self.featureSet.getId("has_hyphen")] = 1
            elif text[j] == "/":
                features[self.featureSet.getId("has_fslash")] = 1
            elif text[j] == "\\":
                features[self.featureSet.getId("has_bslash")] = 1
            # duplets
            if j > 0:
                features[self.featureSet.getId("dt_" + text[j - 1:j + 1].lower())] = 1
            # triplets
            if j > 1:
                features[self.featureSet.getId("tt_" + text[j - 2:j + 1].lower())] = 1

        # Attached edges (Hanging in and out edges)
        t1InEdges = self.inEdgesByToken[token]
        for edge in t1InEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
            features[self.featureSet.getId("t1HIn_" + edge[0].get("POS"))] = 1
            features[self.featureSet.getId("t1HIn_" + edgeType + "_" + edge[0].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[0])
            features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
            features[self.featureSet.getId("t1HIn_" + edgeType + "_" + tokenText)] = 1
        t1OutEdges = self.outEdgesByToken[token]
        for edge in t1OutEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
            features[self.featureSet.getId("t1HOut_" + edge[1].get("POS"))] = 1
            features[self.featureSet.getId("t1HOut_" + edgeType + "_" + edge[1].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[1])
            features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
            features[self.featureSet.getId("t1HOut_" + edgeType + "_" + tokenText)] = 1

        extra = {"xtype": "token", "t": token.get("id")}
        examples.append((sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra))
        exampleIndex += 1

        # chains
        self.buildChains(token, sentenceGraph, features)

        if "pos_pairs" in self.styles:
            self.buildPOSPairs(token, namedEntityHeadTokens, features)

        # NOTE(review): called after the example tuple is appended, but
        # 'features' is the same dict object, so these additions are still
        # visible in the appended example.
        self.buildPredictionFeatures(sentenceGraph, paths, token, interactionsByToken[token])
    return examples
30) #Identifies the 30 most frequent tokens for i in freq: print(i) print("Average number of word tokens per document:", round(count / len(files))) stop = datetime.now() # Code Execution End Time print("Time the program took to acquire the text characteristics:") print(stop - start) ##----------------STEMMING----------------------## k = 0 c = 0 pslist = [] ps = PorterStemmer() # Uses PorterStemmer class defined in Stemming.py for i in token: pslist.append(ps.stem(i, 0, len(i) - 1)) c += 1 #print(set(pslist)) # Identifies the unique stems print("##----------------STEMMING---------------------##") print("Number of distinct stems in the Cranfield text collection:", len(set(pslist))) w2 = Counter(pslist) # Identifies the stems which occured only once for i in pslist: if w2[i] == 1: once = 1 k += 1 else:
def buildExamples(self, sentenceGraph):
    """Build one binary classification example per sentence token.

    A token that heads a named entity gets category 1, every other token
    category -1. Feature names are mapped to integer ids through
    self.featureSet.getId; the call order matches the original
    implementation exactly so that id assignment is unchanged.

    Returns a list of (exampleId, category, featureDict, extraDict)
    tuples, one per token, where extraDict carries the token object.
    """
    examples = []
    allTokens = sentenceGraph.tokens
    tokenCount = len(allTokens)
    getId = self.featureSet.getId  # hoisted: called many times per token
    for idx, tok in enumerate(allTokens):
        # CLASS: named-entity head tokens form the positive class
        category = 1 if sentenceGraph.tokenIsName[tok] else -1

        feats = {}
        # Main token-level features: surface form, POS, Porter stem and
        # the suffix removed by the stemmer
        text = tok.attrib["text"]
        feats[getId("txt_" + text)] = 1
        feats[getId("POS_" + tok.attrib["POS"])] = 1
        stem = PorterStemmer.stem(text)
        feats[getId("stem_" + stem)] = 1
        feats[getId("nonstem_" + text[len(stem):])] = 1

        # Linear order features: text/POS of the two preceding and two
        # following tokens (offset order -1, -2, +1, +2 preserved)
        for off in (-1, -2, 1, 2):
            pos = idx + off
            if 0 <= pos < tokenCount:
                neighbour = allTokens[pos]
                prefix = "linear_%+d_" % off
                feats[getId(prefix + "txt_" + neighbour.attrib["text"])] = 1
                feats[getId(prefix + "POS_" + neighbour.attrib["POS"])] = 1

        # Orthographic content features
        if idx > 0 and text[0].isalpha() and text[0].isupper():
            feats[getId("upper_case_start")] = 1
        for j, ch in enumerate(text):
            if j > 0 and ch.isalpha() and ch.isupper():
                feats[getId("upper_case_middle")] = 1
            # digits and special characters
            if ch.isdigit():
                feats[getId("has_digits")] = 1
                if j > 0 and text[j - 1] == "-":
                    feats[getId("has_hyphenated_digit")] = 1
            elif ch == "-":
                feats[getId("has_hyphen")] = 1
            elif ch == "/":
                feats[getId("has_fslash")] = 1
            elif ch == "\\":
                feats[getId("has_bslash")] = 1
            if j > 0:  # character duplet
                feats[getId("dt_" + text[j - 1:j + 1].lower())] = 1
            if j > 1:  # character triplet
                feats[getId("tt_" + text[j - 2:j + 1].lower())] = 1

        # Dependency edges hanging off this token: for incoming edges the
        # far end is edge[0], for outgoing edges it is edge[1]
        for prefix, edges, farEnd in (
                ("t1HangingIn_", sentenceGraph.dependencyGraph.in_edges(tok), 0),
                ("t1HangingOut_", sentenceGraph.dependencyGraph.out_edges(tok), 1)):
            for edge in edges:
                feats[getId(prefix + edge[2].attrib["type"])] = 1
                feats[getId(prefix + edge[farEnd].attrib["POS"])] = 1
                feats[getId(prefix + edge[farEnd].attrib["text"])] = 1

        # Every token yields exactly one example, so the running example
        # index equals the token index
        extra = {"xtype": "token", "t": tok}
        exampleId = sentenceGraph.getSentenceId() + ".x" + str(idx)
        examples.append((exampleId, category, feats, extra))
    return examples
def buildExamples(self, sentenceGraph):
    """
    Build one example for each (by default, non-name) token of the sentence.

    sentenceGraph -- sentence with tokens, entities and a dependency graph
    Returns a list of (exampleId, class, features, extra) tuples.

    Behaviour is controlled by self.styles: "names", "all_tokens",
    "pos_pairs", "stem_gazetteer", "exclude_gazetteer", "filter_pos",
    "no_hanging" and "split_merged".
    """
    examples = []
    exampleIndex = 0
    self.tokenFeatures = {}

    # Sentence-level count of known named entities, usable as a feature
    # unless names themselves are the classification targets
    namedEntityHeadTokens = []
    if not "names" in self.styles:
        namedEntityCount = 0
        for entity in sentenceGraph.entities:
            if entity.get("isName") == "True": # known data which can be used for features
                namedEntityCount += 1
        namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
    if "pos_pairs" in self.styles:
        namedEntityHeadTokens = self.getNamedEntityHeadTokens(sentenceGraph)

    # Bag-of-words over the sentence; named-entity tokens are counted a
    # second time under an "ne_" prefix
    bagOfWords = {}
    for token in sentenceGraph.tokens:
        text = "bow_" + token.get("text")
        if not bagOfWords.has_key(text):
            bagOfWords[text] = 0
        bagOfWords[text] += 1
        if sentenceGraph.tokenIsName[token]:
            text = "ne_" + text
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
    bowFeatures = {}
    for k,v in bagOfWords.iteritems():
        bowFeatures[self.featureSet.getId(k)] = v

    # Pre-calculate, for every token, its dependency edges sorted by id
    self.inEdgesByToken = {}
    self.outEdgesByToken = {}
    self.edgeSetByToken = {}
    for token in sentenceGraph.tokens:
        inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
        fixedInEdges = []
        for edge in inEdges:
            fixedInEdges.append( (edge[0], edge[1], edge[2]["element"]) )
        inEdges = fixedInEdges
        inEdges.sort(compareDependencyEdgesById)
        self.inEdgesByToken[token] = inEdges
        outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True)
        fixedOutEdges = []
        for edge in outEdges:
            fixedOutEdges.append( (edge[0], edge[1], edge[2]["element"]) )
        outEdges = fixedOutEdges
        outEdges.sort(compareDependencyEdgesById)
        self.outEdgesByToken[token] = outEdges
        self.edgeSetByToken[token] = set(inEdges + outEdges)

    for i in range(len(sentenceGraph.tokens)):
        token = sentenceGraph.tokens[i]
        # Recognize only non-named entities (i.e. interaction words)
        if sentenceGraph.tokenIsName[token] and not "names" in self.styles and not "all_tokens" in self.styles:
            continue

        # CLASS
        if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
            # BUGFIX: keep the merged type string so the "split_merged"
            # branch below can use it; it previously read an undefined
            # variable (typeString) and raised a NameError.
            typeString = self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token])
            category = self.classSet.getId(typeString)
        else:
            typeString = "neg"
            category = 1

        # Emit reduced "excluded" examples for tokens filtered out by the
        # gazetteer or by POS tag, then skip normal feature generation
        tokenText = token.get("text").lower()
        if "stem_gazetteer" in self.styles:
            tokenText = PorterStemmer.stem(tokenText)
        if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer:
            features = {}
            features[self.featureSet.getId("exclude_gazetteer")] = 1
            extra = {"xtype":"token","t":token.get("id"),"excluded":"True"}
            examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
            exampleIndex += 1
            continue
        if ("filter_pos" in self.styles) and token.get("POS") in self.excludedPOS:
            features = {}
            features[self.featureSet.getId("filter_pos")] = 1
            extra = {"xtype":"token","t":token.get("id"),"excluded":"True"}
            examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
            exampleIndex += 1
            continue

        # FEATURES
        features = {}
        if not "names" in self.styles:
            features[self.featureSet.getId(namedEntityCountFeature)] = 1
        # pre-calculated bag-of-words features
        features.update(bowFeatures)
        # Main features: surface form, POS tag, stem and stem suffix
        text = token.get("text")
        features[self.featureSet.getId("txt_"+text)] = 1
        features[self.featureSet.getId("POS_"+token.get("POS"))] = 1
        stem = PorterStemmer.stem(text)
        features[self.featureSet.getId("stem_"+stem)] = 1
        features[self.featureSet.getId("nonstem_"+text[len(stem):])] = 1
        # Linear order features
        # NOTE(review): "> 0" skips a neighbour at sentence position 0;
        # kept as-is for feature compatibility, ">= 0" may be intended.
        for index in [-3,-2,-1,1,2,3]:
            if i + index > 0 and i + index < len(sentenceGraph.tokens):
                self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)
        # Content: capitalization patterns
        if i > 0 and text[0].isalpha() and text[0].isupper():
            features[self.featureSet.getId("upper_case_start")] = 1
        for j in range(len(text)):
            if j > 0 and text[j].isalpha() and text[j].isupper():
                features[self.featureSet.getId("upper_case_middle")] = 1
            # numbers and special characters
            if text[j].isdigit():
                features[self.featureSet.getId("has_digits")] = 1
                if j > 0 and text[j-1] == "-":
                    features[self.featureSet.getId("has_hyphenated_digit")] = 1
            elif text[j] == "-":
                features[self.featureSet.getId("has_hyphen")] = 1
            elif text[j] == "/":
                features[self.featureSet.getId("has_fslash")] = 1
            elif text[j] == "\\":
                features[self.featureSet.getId("has_bslash")] = 1
            # duplets
            if j > 0:
                features[self.featureSet.getId("dt_"+text[j-1:j+1].lower())] = 1
            # triplets
            if j > 1:
                features[self.featureSet.getId("tt_"+text[j-2:j+1].lower())] = 1
        # Attached edges (Hanging in and out edges)
        if not "no_hanging" in self.styles:
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_"+edgeType)] = 1
                features[self.featureSet.getId("t1HIn_"+edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_"+tokenText)] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenText)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_"+edgeType)] = 1
                features[self.featureSet.getId("t1HOut_"+edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_"+tokenText)] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenText)] = 1
        # chains
        self.buildChains(token, sentenceGraph, features, 4)
        if "pos_pairs" in self.styles:
            self.buildPOSPairs(token, namedEntityHeadTokens, features)

        extra = {"xtype":"token","t":token.get("id")}
        if "split_merged" in self.styles:
            # A merged class such as "A---B" produces one example per
            # component class, all sharing the same feature vector.
            # Component names are mapped to class ids for consistency
            # with the unsplit branch.
            if typeString.find("---") != -1 and typeString != "Gene_expression---Positive_regulation":
                categories = [self.classSet.getId(x) for x in typeString.split("---")]
            else:
                categories = [category]
            for cat in categories:
                examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),cat,features,extra) )
                exampleIndex += 1
        else:
            examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
            exampleIndex += 1
    return examples
def buildExamples(self, sentenceGraph):
    """
    Build one example for each non-name token of the sentence.

    Debug/timing variant: times the major phases and cross-checks two
    chain-feature builders (buildChains vs. buildChainsAlternative),
    printing any feature-set differences to stdout.

    sentenceGraph -- sentence with tokens, entities and a dependency graph
    Returns a list of (exampleId, class, features, extra) tuples.
    """
    self.timerBuildExamples.start()
    examples = []
    exampleIndex = 0
    self.tokenFeatures = {}
    # Sentence-level count of known named entities, used as a feature
    namedEntityCount = 0
    for entity in sentenceGraph.entities:
        if entity.get("isName") == "True": # known data which can be used for features
            namedEntityCount += 1
    namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
    # Bag-of-words over the sentence; named-entity tokens are counted a
    # second time under an "ne_" prefix
    bagOfWords = {}
    for token in sentenceGraph.tokens:
        text = "bow_" + token.get("text")
        if not bagOfWords.has_key(text):
            bagOfWords[text] = 0
        bagOfWords[text] += 1
        if sentenceGraph.tokenIsName[token]:
            text = "ne_" + text
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
    bowFeatures = {}
    for k,v in bagOfWords.iteritems():
        bowFeatures[self.featureSet.getId(k)] = v
    # Pre-calculate, for every token, its dependency edges sorted by id
    self.timerCrawl.start()
    self.timerCrawlPrecalc.start()
    self.inEdgesByToken = {}
    self.outEdgesByToken = {}
    self.edgeSetByToken = {}
    for token in sentenceGraph.tokens:
        inEdges = sentenceGraph.dependencyGraph.in_edges(token)
        inEdges.sort(compareDependencyEdgesById)
        self.inEdgesByToken[token] = inEdges
        outEdges = sentenceGraph.dependencyGraph.out_edges(token)
        outEdges.sort(compareDependencyEdgesById)
        self.outEdgesByToken[token] = outEdges
        self.edgeSetByToken[token] = set(inEdges + outEdges)
    self.timerCrawl.stop()
    self.timerCrawlPrecalc.stop()
    # Pre-calculate the matrices used by the alternative chain builder
    self.timerMatrix.start()
    self.timerMatrixPrecalc.start()
    self._initMatrices(sentenceGraph)
    self.timerMatrix.stop()
    self.timerMatrixPrecalc.stop()
    for i in range(len(sentenceGraph.tokens)):
        token = sentenceGraph.tokens[i]
        # Recognize only non-named entities (i.e. interaction words)
        if sentenceGraph.tokenIsName[token]:
            continue
        # CLASS: merged entity type of the heads at this token, or 1 (negative)
        if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
            category = self.classSet.getId(self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token]))
        else:
            category = 1
        # FEATURES
        features = {}
        features[self.featureSet.getId(namedEntityCountFeature)] = 1
        #for k,v in bagOfWords.iteritems():
        #    features[self.featureSet.getId(k)] = v
        # pre-calculate bow _features_
        features.update(bowFeatures)
        #        for j in range(len(sentenceGraph.tokens)):
        #            text = "bow_" + sentenceGraph.tokens[j].get("text")
        #            if j < i:
        #                features[self.featureSet.getId("bf_" + text)] = 1
        #            elif j > i:
        #                features[self.featureSet.getId("af_" + text)] = 1
        # Main features: surface form, POS tag, stem and stem suffix
        text = token.get("text")
        features[self.featureSet.getId("txt_"+text)] = 1
        features[self.featureSet.getId("POS_"+token.get("POS"))] = 1
        stem = PorterStemmer.stem(text)
        features[self.featureSet.getId("stem_"+stem)] = 1
        features[self.featureSet.getId("nonstem_"+text[len(stem):])] = 1
        # Linear order features for neighbouring tokens (offsets -3..+3)
        for index in [-3,-2,-1,1,2,3]:
            if i + index > 0 and i + index < len(sentenceGraph.tokens):
                self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)
        # Content: capitalization patterns
        if i > 0 and text[0].isalpha() and text[0].isupper():
            features[self.featureSet.getId("upper_case_start")] = 1
        for j in range(len(text)):
            if j > 0 and text[j].isalpha() and text[j].isupper():
                features[self.featureSet.getId("upper_case_middle")] = 1
            # numbers and special characters
            if text[j].isdigit():
                features[self.featureSet.getId("has_digits")] = 1
                if j > 0 and text[j-1] == "-":
                    features[self.featureSet.getId("has_hyphenated_digit")] = 1
            elif text[j] == "-":
                features[self.featureSet.getId("has_hyphen")] = 1
            elif text[j] == "/":
                features[self.featureSet.getId("has_fslash")] = 1
            elif text[j] == "\\":
                features[self.featureSet.getId("has_bslash")] = 1
            # duplets
            if j > 0:
                features[self.featureSet.getId("dt_"+text[j-1:j+1].lower())] = 1
            # triplets
            if j > 1:
                features[self.featureSet.getId("tt_"+text[j-2:j+1].lower())] = 1
        # Attached edges (Hanging in and out edges)
        t1InEdges = self.inEdgesByToken[token]
        for edge in t1InEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HIn_"+edgeType)] = 1
            features[self.featureSet.getId("t1HIn_"+edge[0].get("POS"))] = 1
            features[self.featureSet.getId("t1HIn_"+edgeType+"_"+edge[0].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[0])
            features[self.featureSet.getId("t1HIn_"+tokenText)] = 1
            features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenText)] = 1
        t1OutEdges = self.outEdgesByToken[token]
        for edge in t1OutEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HOut_"+edgeType)] = 1
            features[self.featureSet.getId("t1HOut_"+edge[1].get("POS"))] = 1
            features[self.featureSet.getId("t1HOut_"+edgeType+"_"+edge[1].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[1])
            features[self.featureSet.getId("t1HOut_"+tokenText)] = 1
            features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenText)] = 1
        extra = {"xtype":"token","t":token.get("id")}
        # The tuple holds a reference to the `features` dict, so chain
        # features added below still become part of this example.
        examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
        exampleIndex += 1
        # chains: run both chain builders and compare their output
        copyFeatures = copy.copy(features)
        self.timerCrawl.start()
        self.buildChains(token, sentenceGraph, features)
        self.timerCrawl.stop()
        self.timerMatrix.start()
        self.buildChainsAlternative(token, copyFeatures, sentenceGraph)
        self.timerMatrix.stop()
        # Debug check: report any features produced by only one builder
        diff1 = set(features.keys()) - set(copyFeatures.keys())
        diff2 = set(copyFeatures.keys()) - set(features.keys())
        if len(diff1) != 0 or len(diff2) != 0:
            print "Error for token", token.get("id"), token.get("text")
            intersection = set(features.keys()) & set(copyFeatures.keys())
            print "d1:",
            for key in sorted(diff1):
                print self.featureSet.getName(key) + ",",
            print
            print "d2:",
            for key in sorted(diff2):
                print self.featureSet.getName(key) + ",",
            print
            print "int:",
            intNames = []
            for key in sorted(intersection):
                intNames.append(self.featureSet.getName(key))
            for name in sorted(intNames):
                print name + ",",
            print
        #assert(len(diff1) == 0)
    self.timerBuildExamples.stop()
    return examples
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
    """
    Build one task-3 (speculation/negation) example for each non-name
    entity of the sentence and append them directly to outfile.

    sentenceGraph -- sentence with tokens, entities and a dependency graph
    outfile       -- open output target passed to ExampleUtils.appendExamples
    goldGraph     -- optional gold-annotated graph; when given, classes for
                     the "speculation"/"negation" modes are taken from the
                     mapped gold entities instead of the predicted ones
    Returns the number of examples written (not the examples themselves).
    """
    examples = []
    exampleIndex = 0
    self.tokenFeatures = {}
    # Map predicted entities to gold entities when a gold graph is given
    if goldGraph != None:
        entityToGold = EvaluateInteractionXML.mapEntities(
            sentenceGraph.entities, goldGraph.entities)
    # Sentence-level counts of named entities and other entities
    namedEntityCount = 0
    entityCount = 0
    for entity in sentenceGraph.entities:
        if entity.get(
                "isName") == "True":  # known data which can be used for features
            namedEntityCount += 1
        else:  # known data which can be used for features
            entityCount += 1
    namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
    entityCountFeature = "entityCount_" + str(entityCount)
    # Bag-of-words; "ne_"/"ge_" prefixed copies mark name/entity-head
    # tokens, "spec_bow_" marks known speculation words
    bagOfWords = {}
    for token in sentenceGraph.tokens:
        text = "bow_" + token.get("text")
        if not bagOfWords.has_key(text):
            bagOfWords[text] = 0
        bagOfWords[text] += 1
        if sentenceGraph.tokenIsName[token]:
            text = "ne_" + text
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
        # NOTE(review): this tests the size of the whole tokenIsEntityHead
        # mapping, not this token's entry — verify whether a per-token
        # check was intended
        if len(sentenceGraph.tokenIsEntityHead) > 0:
            text = "ge_" + text
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
        text = token.get("text")
        if self.styles["speculation_words"] and text in self.specWords:
            if not bagOfWords.has_key("spec_bow_" + text):
                bagOfWords["spec_bow_" + text] = 0
            bagOfWords["spec_bow_" + text] += 1
            bagOfWords["spec_sentence"] = 1
    bowFeatures = {}
    for k, v in bagOfWords.iteritems():
        bowFeatures[self.featureSet.getId(k)] = v
    # Pre-calculate, for every token, its dependency edges
    self.inEdgesByToken = {}
    self.outEdgesByToken = {}
    self.edgeSetByToken = {}
    for token in sentenceGraph.tokens:
        inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
        self.inEdgesByToken[token] = inEdges
        outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
        self.outEdgesByToken[token] = outEdges
        self.edgeSetByToken[token] = set(inEdges + outEdges)
    for entity in sentenceGraph.entities:
        #token = sentenceGraph.tokens[i]
        token = sentenceGraph.entityHeadTokenByEntity[entity]
        # Recognize only non-named entities (i.e. interaction words)
        if entity.get("isName") == "True":
            continue
        # CLASS: depends on the classification mode in self.styles
        if self.styles["classification"] == "multiclass":
            task3Type = "multiclass"
            # combined class name: "negation", "speculation",
            # "negation---speculation" or "neg"
            categoryName = ""
            if entity.get("negation") == "True":
                categoryName += "negation"
            if entity.get("speculation") == "True":
                if categoryName != "":
                    categoryName += "---"
                categoryName += "speculation"
            if categoryName == "":
                categoryName = "neg"
            category = self.classSet.getId(categoryName)
        elif self.styles["classification"] == "speculation":
            task3Type = "speculation"
            if entity.get("speculation") == "True":
                category = self.classSet.getId("speculation")
            else:
                category = 1
            # gold annotation overrides the predicted class when available
            if goldGraph != None:
                if len(entityToGold[entity]) > 0 and entityToGold[entity][
                        0].get("speculation") == "True":
                    category = self.classSet.getId("speculation")
                else:
                    category = 1
            categoryName = self.classSet.getName(category)
        elif self.styles["classification"] == "negation":
            task3Type = "negation"
            if entity.get("negation") == "True":
                category = self.classSet.getId("negation")
            else:
                category = 1
            # gold annotation overrides the predicted class when available
            if goldGraph != None:
                if len(entityToGold[entity]) > 0 and entityToGold[entity][
                        0].get("negation") == "True":
                    category = self.classSet.getId("negation")
                else:
                    category = 1
            categoryName = self.classSet.getName(category)
        self.exampleStats.beginExample(categoryName)
        # FEATURES
        features = {}
        # ENTITY TYPE
        #entityType = self.classSet.getId(self.getMergedEntityType(entity))
        #del self.classSet.Ids[self.getMergedEntityType(entity)]
        #IF LOCAL
        # There's a mistake here. The entityType should be the string, not
        # the id of the type. But there's also another issue. getMergedEntityType
        # expects a list, not an item. Therefore the type is always empty ->
        # types don't get used in classification. But this is the code used in
        # the publication, so it will now be published as is, and fixed in a later
        # release.
        #
        # Besides, using the classSet here generates an unneeded
        # additional class, that shows up in evaluations etc. However, to be
        # able to publish the exact models used for the publication experiments,
        # this can't be fixed so it breaks feature id consistency. Therefore I'll
        # now just remove the redundant class id from the classSet.
        #ENDIF
        #features[self.featureSet.getId(entityType)] = 1
        features[self.featureSet.getId(namedEntityCountFeature)] = 1
        features[self.featureSet.getId(entityCountFeature)] = 1
        #for k,v in bagOfWords.iteritems():
        #    features[self.featureSet.getId(k)] = v
        # pre-calculate bow _features_
        features.update(bowFeatures)
        #        for j in range(len(sentenceGraph.tokens)):
        #            text = "bow_" + sentenceGraph.tokens[j].get("text")
        #            if j < i:
        #                features[self.featureSet.getId("bf_" + text)] = 1
        #            elif j > i:
        #                features[self.featureSet.getId("af_" + text)] = 1
        # Main features: surface form, POS tag, stem and stem suffix
        text = token.get("text")
        features[self.featureSet.getId("txt_" + text)] = 1
        features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
        stem = PorterStemmer.stem(text)
        features[self.featureSet.getId("stem_" + stem)] = 1
        features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1
        # Known speculation words and stems
        if self.styles["speculation_words"]:
            if text in self.specWords:
                features[self.featureSet.getId("ent_spec")] = 1
            if stem in self.specWordStems:
                features[self.featureSet.getId("ent_spec_stem")] = 1
        # Linear order features: locate the head token's position first
        for i in range(len(sentenceGraph.tokens)):
            if token == sentenceGraph.tokens[i]:
                break
        for index in [-3, -2, -1, 1, 2, 3]:
            if i + index > 0 and i + index < len(sentenceGraph.tokens):
                self.buildLinearOrderFeatures(sentenceGraph, i + index,
                                              str(index), features)
        # Content: capitalization patterns
        if i > 0 and text[0].isalpha() and text[0].isupper():
            features[self.featureSet.getId("upper_case_start")] = 1
        for j in range(len(text)):
            if j > 0 and text[j].isalpha() and text[j].isupper():
                features[self.featureSet.getId("upper_case_middle")] = 1
            # numbers and special characters
            if text[j].isdigit():
                features[self.featureSet.getId("has_digits")] = 1
                if j > 0 and text[j - 1] == "-":
                    features[self.featureSet.getId(
                        "has_hyphenated_digit")] = 1
            elif text[j] == "-":
                features[self.featureSet.getId("has_hyphen")] = 1
            elif text[j] == "/":
                features[self.featureSet.getId("has_fslash")] = 1
            elif text[j] == "\\":
                features[self.featureSet.getId("has_bslash")] = 1
            # duplets
            if j > 0:
                features[self.featureSet.getId("dt_" +
                                               text[j - 1:j + 1].lower())] = 1
            # triplets
            if j > 1:
                features[self.featureSet.getId("tt_" +
                                               text[j - 2:j + 1].lower())] = 1
        # Attached edges (Hanging in and out edges)
        t1InEdges = self.inEdgesByToken[token]
        for edge in t1InEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
            features[self.featureSet.getId("t1HIn_" +
                                           edge[0].get("POS"))] = 1
            features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                           edge[0].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[0])
            features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
            features[self.featureSet.getId("t1HIn_" + edgeType + "_" +
                                           tokenText)] = 1
        t1OutEdges = self.outEdgesByToken[token]
        for edge in t1OutEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
            features[self.featureSet.getId("t1HOut_" +
                                           edge[1].get("POS"))] = 1
            features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                           edge[1].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[1])
            features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
            features[self.featureSet.getId("t1HOut_" + edgeType + "_" +
                                           tokenText)] = 1
        # dependency chain features
        self.buildChains(token, sentenceGraph, features)
        extra = {
            "xtype": "task3",
            "t3type": task3Type,
            "t": token.get("id"),
            "entity": entity.get("id")
        }
        #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
        # Examples are streamed to outfile instead of collected in memory
        example = (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex),
                   category, features, extra)
        ExampleUtils.appendExamples([example], outfile)
        exampleIndex += 1
        self.exampleStats.endExample()
    #return examples
    return exampleIndex
def buildFeatures(self, token, linear=True, chains=True):
    """
    Register the features of a single token via self.setFeature.

    token  -- a token element; must be one of self.sentenceGraph.tokens
              (asserted below)
    linear -- if True, also build linear-order features for the tokens at
              offsets -3..+3
    chains -- if True, also build dependency chain features

    Feature groups: named-entity count, surface form / POS / stem /
    stem suffix, capitalization and character patterns, and the optional
    linear-order and chain features.
    """
    sentenceGraph = self.sentenceGraph
    # Locate the token's linear position in the sentence. The index is
    # kept in tokenIndex and used consistently below (the previous code
    # relied on the leaked loop variable after break).
    tokenIndex = None
    for candidateIndex, candidate in enumerate(sentenceGraph.tokens):
        if token == candidate:
            tokenIndex = candidateIndex
            break
    assert tokenIndex != None
    token = sentenceGraph.tokens[tokenIndex]

    #if not "names" in self.styles:
    self.setFeature(self.namedEntityCountFeature, 1)
    #self.features.update(self.bowFeatures) # Note! these do not get tagged
    #        for j in range(len(sentenceGraph.tokens)):
    #            text = "bow_" + sentenceGraph.tokens[j].get("text")
    #            if j < i:
    #                features[self.featureSet.getId("bf_" + text)] = 1
    #            elif j > i:
    #                features[self.featureSet.getId("af_" + text)] = 1

    # Main features: surface form, POS tag, stem and stem suffix
    text = token.get("text")
    self.setFeature("txt_"+text, 1)
    self.setFeature("POS_"+token.get("POS"), 1)
    stem = PorterStemmer.stem(text)
    self.setFeature("stem_"+stem, 1)
    self.setFeature("nonstem_"+text[len(stem):], 1)

    # Linear order features for neighbouring tokens
    if linear:
        # NOTE(review): "> 0" skips a neighbour at sentence position 0;
        # kept as-is for feature compatibility, ">= 0" may be intended.
        for offset in [-3,-2,-1,1,2,3]:
            if tokenIndex + offset > 0 and tokenIndex + offset < len(sentenceGraph.tokens):
                self.buildLinearOrderFeatures(sentenceGraph, tokenIndex + offset, str(offset))

    # Content: capitalization patterns
    if tokenIndex > 0 and text[0].isalpha() and text[0].isupper():
        self.setFeature("upper_case_start", 1)
    for j in range(len(text)):
        if j > 0 and text[j].isalpha() and text[j].isupper():
            self.setFeature("upper_case_middle", 1)
        # numbers and special characters
        if text[j].isdigit():
            self.setFeature("has_digits", 1)
            if j > 0 and text[j-1] == "-":
                self.setFeature("has_hyphenated_digit", 1)
        elif text[j] == "-":
            self.setFeature("has_hyphen", 1)
        elif text[j] == "/":
            self.setFeature("has_fslash", 1)
        elif text[j] == "\\":
            self.setFeature("has_bslash", 1)
        # duplets
        if j > 0:
            self.setFeature("dt_"+text[j-1:j+1].lower(), 1)
        # triplets
        if j > 1:
            self.setFeature("tt_"+text[j-2:j+1].lower(), 1)

    # dependency chain features
    if chains:
        self.buildChains(token, sentenceGraph)
def buildExamplesInner(self, sentenceGraph, goldGraph):
    """
    Build one example for each (by default, non-name) token of the
    sentence, taking classes from gold entities mapped by head offset.

    sentenceGraph -- predicted sentence graph
    goldGraph     -- optional gold graph aligned token-by-token with
                     sentenceGraph (asserted below); provides the classes
    Returns a list of (exampleId, class, features, extra) tuples, or []
    for sentences on self.skiplist.
    """
    if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
        print >>sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get("origId")
        return []

    self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
    self.triggerFeatureBuilder.initSentence(sentenceGraph)

    # Shortest paths over the undirected dependency graph, used by the
    # prediction features below
    undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
    paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
    # Get argument order
    self.interactionLengths = self.getInteractionEdgeLengths(sentenceGraph, paths)
    self.interactionLengths = self.interactionLengths.values()
    self.interactionLengths.sort(compareInteractionPrecedence)
    # Map tokens to entities (and check gold/predicted token alignment)
    tokenByOffset = {}
    for i in range(len(sentenceGraph.tokens)):
        token = sentenceGraph.tokens[i]
        if goldGraph != None:
            goldToken = goldGraph.tokens[i]
            assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
        tokenByOffset[token.get("charOffset")] = token.get("id")
    # Map gold entities to their head offsets
    goldEntitiesByOffset = {}
    for token in sentenceGraph.tokens:
        goldEntitiesByOffset[token.get("charOffset")] = []
    entityToGold = {}
    for entity in sentenceGraph.entities:
        entityToGold[entity] = []
    if goldGraph != None:
        for entity in goldGraph.entities:
            offset = entity.get("headOffset")
            assert offset != None
            goldEntitiesByOffset[offset].append(entity)
    # Map predicted entities to gold entities (same head offset and type)
    for entity in sentenceGraph.entities:
        eType = entity.get("type")
        eOffset = entity.get("headOffset")
        for goldEntity in goldEntitiesByOffset[eOffset]:
            if goldEntity.get("type") == eType:
                entityToGold[entity].append(goldEntity)
    # Map entities to interactions
    #    interactionsByEntityId = {}
    #    for entity in sentenceGraph.entities:
    #        interactionsByEntityId[entity.get("id")] = []
    # Map tokens to the interactions whose e1 entity they head
    interactionsByToken = {}
    for token in sentenceGraph.tokens:
        interactionsByToken[token] = []
    for interactionTuple in self.interactionLengths:
        interaction = interactionTuple[0]
        if interaction.get("type") == "neg":
            continue
        e1Id = interaction.get("e1")
        token = sentenceGraph.entityHeadTokenByEntity[sentenceGraph.entitiesById[e1Id]]
        interactionsByToken[token].append(interaction)

    examples = []
    exampleIndex = 0

    self.tokenFeatures = {}

    # Sentence-level count of known named entities
    #namedEntityNorStrings = set()
    namedEntityHeadTokens = []
    if not "names" in self.styles:
        namedEntityCount = 0
        for entity in sentenceGraph.entities:
            if entity.get("isName") == "True": # known data which can be used for features
                namedEntityCount += 1
                #namedEntityNorStrings.add( entity.get("text").replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower() )
        namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
        #if namedEntityCount == 0: # no names, no need for triggers
        #    return []
        if "pos_pairs" in self.styles:
            namedEntityHeadTokens = self.getNamedEntityHeadTokens(sentenceGraph)
    #neFeatures = {} # F: 69.35 -> 69.14
    #for norString in namedEntityNorStrings:
    #    neFeatures[self.featureSet.getId("norNE_" + norString)] = 1

    # Bag-of-words over the sentence; named-entity tokens are counted a
    # second time under an "ne_" prefix
    bagOfWords = {}
    for token in sentenceGraph.tokens:
        text = "bow_" + token.get("text")
        if not bagOfWords.has_key(text):
            bagOfWords[text] = 0
        bagOfWords[text] += 1
        if sentenceGraph.tokenIsName[token]:
            text = "ne_" + text
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
    bowFeatures = {}
    for k, v in bagOfWords.iteritems():
        bowFeatures[self.featureSet.getId(k)] = v

    # Pre-calculate, for every token, its dependency edges sorted by id
    self.inEdgesByToken = {}
    self.outEdgesByToken = {}
    self.edgeSetByToken = {}
    for token in sentenceGraph.tokens:
        inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
        fixedInEdges = []
        for edge in inEdges:
            fixedInEdges.append((edge[0], edge[1], edge[2]["element"]))
        inEdges = fixedInEdges
        inEdges.sort(compareDependencyEdgesById)
        self.inEdgesByToken[token] = inEdges
        outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True)
        fixedOutEdges = []
        for edge in outEdges:
            fixedOutEdges.append((edge[0], edge[1], edge[2]["element"]))
        outEdges = fixedOutEdges
        outEdges.sort(compareDependencyEdgesById)
        self.outEdgesByToken[token] = outEdges
        self.edgeSetByToken[token] = set(inEdges + outEdges)

    for i in range(len(sentenceGraph.tokens)):
        token = sentenceGraph.tokens[i]
        # Recognize only non-named entities (i.e. interaction words)
        if sentenceGraph.tokenIsName[token] and not "names" in self.styles and not "all_tokens" in self.styles:
            continue
        # CLASS: taken from the gold entities headed at this offset
        #        if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
        #            category = self.classSet.getId(self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token]))
        #        else:
        #            category = 1
        offset = token.get("charOffset")
        if len(goldEntitiesByOffset[offset]) > 0:
            category = self.classSet.getId(self.getMergedEntityType(goldEntitiesByOffset[offset]))
        else:
            category = 1

        # Emit reduced "excluded" examples for tokens the gazetteer rules out
        tokenText = token.get("text").lower()
        if "stem_gazetteer" in self.styles:
            tokenText = PorterStemmer.stem(tokenText)
        if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer:
            features = {}
            features[self.featureSet.getId("exclude_gazetteer")] = 1
            extra = {"xtype": "token", "t": token.get("id"), "excluded": "True"}
            examples.append((sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra))
            exampleIndex += 1
            continue

        # FEATURES
        features = {}
        self.features = features

        if not "names" in self.styles:
            features[self.featureSet.getId(namedEntityCountFeature)] = 1
        # for k,v in bagOfWords.iteritems():
        #    features[self.featureSet.getId(k)] = v
        # pre-calculate bow _features_
        features.update(bowFeatures)
        # features.update(neFeatures)

        #        for j in range(len(sentenceGraph.tokens)):
        #            text = "bow_" + sentenceGraph.tokens[j].get("text")
        #            if j < i:
        #                features[self.featureSet.getId("bf_" + text)] = 1
        #            elif j > i:
        #                features[self.featureSet.getId("af_" + text)] = 1

        # Main features: surface form, POS tag, stem and stem suffix
        text = token.get("text")
        features[self.featureSet.getId("txt_" + text)] = 1
        features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
        stem = PorterStemmer.stem(text)
        features[self.featureSet.getId("stem_" + stem)] = 1
        features[self.featureSet.getId("nonstem_" + text[len(stem) :])] = 1

        # Normalized versions of the string (if same as non-normalized, overlap without effect)
        normalizedText = (
            text.replace("-", "").replace("/", "").replace(",", "").replace("\\", "").replace(" ", "").lower()
        )
        if normalizedText == "bound":  # should be for all irregular verbs
            normalizedText = "bind"
        features[self.featureSet.getId("txt_" + normalizedText)] = 1
        norStem = PorterStemmer.stem(normalizedText)
        features[self.featureSet.getId("stem_" + norStem)] = 1
        features[self.featureSet.getId("nonstem_" + normalizedText[len(norStem) :])] = 1

        # Gazetteer label weights for the main token
        if "gazetteer_features_maintoken" in self.styles:
            tokTxtLower = text.lower()
            if "stem_gazetteer" in self.styles:
                tokTxtLower = PorterStemmer.stem(tokTxtLower)
            if self.gazetteer and tokTxtLower in self.gazetteer:
                for label, weight in self.gazetteer[tokTxtLower].items():
                    features[self.featureSet.getId("gaz_knownLabel_" + label)] = weight  # 1 performs slightly worse

        # Linear order features
        # for index in [-3,-2,-1,1,2,3,4,5]: # 69.35 -> 68.97
        for index in [-3, -2, -1, 1, 2, 3]:
            if i + index > 0 and i + index < len(sentenceGraph.tokens):
                self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)

        # Content: capitalization patterns
        if i > 0 and text[0].isalpha() and text[0].isupper():
            features[self.featureSet.getId("upper_case_start")] = 1
        for j in range(len(text)):
            if j > 0 and text[j].isalpha() and text[j].isupper():
                features[self.featureSet.getId("upper_case_middle")] = 1
            # numbers and special characters
            if text[j].isdigit():
                features[self.featureSet.getId("has_digits")] = 1
                if j > 0 and text[j - 1] == "-":
                    features[self.featureSet.getId("has_hyphenated_digit")] = 1
            elif text[j] == "-":
                features[self.featureSet.getId("has_hyphen")] = 1
            elif text[j] == "/":
                features[self.featureSet.getId("has_fslash")] = 1
            elif text[j] == "\\":
                features[self.featureSet.getId("has_bslash")] = 1
            # duplets
            if j > 0:
                features[self.featureSet.getId("dt_" + text[j - 1 : j + 1].lower())] = 1
            # triplets
            if j > 1:
                features[self.featureSet.getId("tt_" + text[j - 2 : j + 1].lower())] = 1

        # Attached edges (Hanging in and out edges)
        t1InEdges = self.inEdgesByToken[token]
        for edge in t1InEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
            features[self.featureSet.getId("t1HIn_" + edge[0].get("POS"))] = 1
            features[self.featureSet.getId("t1HIn_" + edgeType + "_" + edge[0].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[0])
            features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
            features[self.featureSet.getId("t1HIn_" + edgeType + "_" + tokenText)] = 1
        t1OutEdges = self.outEdgesByToken[token]
        for edge in t1OutEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
            features[self.featureSet.getId("t1HOut_" + edge[1].get("POS"))] = 1
            features[self.featureSet.getId("t1HOut_" + edgeType + "_" + edge[1].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[1])
            features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
            features[self.featureSet.getId("t1HOut_" + edgeType + "_" + tokenText)] = 1

        extra = {"xtype": "token", "t": token.get("id")}
        # The tuple holds a reference to the `features` dict, so the chain,
        # POS-pair and prediction features added below are included too.
        examples.append((sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra))
        exampleIndex += 1

        # chains
        self.buildChains(token, sentenceGraph, features)

        if "pos_pairs" in self.styles:
            self.buildPOSPairs(token, namedEntityHeadTokens, features)

        self.buildPredictionFeatures(sentenceGraph, paths, token, interactionsByToken[token])
    return examples