def buildExamplesForDocuments(self, documentSentences, output, idFileTag=None):
    """Build examples for every document and append them to the *output* file.

    documentSentences: iterable of documents, each a list whose first item
                       exposes a .sentence element (id used for progress).
    output:            path of the example file to (over)write.
    idFileTag:         if given, class and feature name mappings are saved
                       to <idFileTag>.class_names / <idFileTag>.feature_names.
    """
    progress = ProgressCounter(len(documentSentences), "Build examples")
    #calculatePredictedRange(self, sentences)
    builtTotal = 0
    outStream = open(output, "wt")
    for document in documentSentences:
        progress.update(1, "Building examples (" + document[0].sentence.get("id") + "): ")
        built = self.buildExamples(document)
        builtTotal += len(built)
        #built = self.preProcessExamples(built)
        ExampleUtils.appendExamples(built, outStream)
    outStream.close()
    print >> sys.stderr, "Examples built:", builtTotal
    print >> sys.stderr, "Features:", len(self.featureSet.getNames())
    #IF LOCAL
    if self.exampleStats.getExampleCount() > 0:
        self.exampleStats.printStats()
    #ENDIF
    # Persist the id mappings only when a tag was requested
    if idFileTag != None:
        print >> sys.stderr, "Saving class names to", idFileTag + ".class_names"
        self.classSet.write(idFileTag + ".class_names")
        print >> sys.stderr, "Saving feature names to", idFileTag + ".feature_names"
        self.featureSet.write(idFileTag + ".feature_names")
def buildExamplesForSentences(self, sentences, goldSentences, output, idFileTag=None, append=False):
    """Build examples for each sentence (optionally paired with a gold
    sentence) and write them to *output*.

    sentences:     list of sentence wrappers; element [0] is the sentence object.
    goldSentences: parallel list of gold sentences, or None when unavailable.
    output:        example file path; appended to when *append* is True.
    idFileTag:     if given, class/feature name mappings are saved alongside.
    """
    progress = ProgressCounter(len(sentences), "Build examples")
    # Either extend an existing example file or start a fresh one
    outStream = open(output, "at") if append else open(output, "wt")
    totalBuilt = 0
    for position, sentence in enumerate(sentences):
        gold = goldSentences[position] if goldSentences != None else [None]
        progress.update(1, "Building examples (" + sentence[0].getSentenceId() + "): ")
        built = self.buildExamples(sentence[0], gold[0], append=append)
        totalBuilt += len(built)
        built = self.preProcessExamples(built)
        ExampleUtils.appendExamples(built, outStream)
    outStream.close()
    print >> sys.stderr, "Examples built:", totalBuilt
    print >> sys.stderr, "Features:", len(self.featureSet.getNames())
    #IF LOCAL
    if self.exampleStats.getExampleCount() > 0:
        self.exampleStats.printStats()
    #ENDIF
    # Save Ids
    if idFileTag != None:
        print >> sys.stderr, "Saving class names to", idFileTag + ".class_names"
        self.classSet.write(idFileTag + ".class_names")
        print >> sys.stderr, "Saving feature names to", idFileTag + ".feature_names"
        self.featureSet.write(idFileTag + ".feature_names")
def buildExamplesForSentences(self, sentences, goldSentences, output, idFileTag=None, append=False):
    """Generate classifier examples for *sentences* and write them to *output*.

    When *goldSentences* is not None it must be parallel to *sentences*;
    otherwise a None placeholder is passed to the per-sentence builder.
    Setting *idFileTag* additionally saves the class and feature name files.
    """
    counter = ProgressCounter(len(sentences), "Build examples")
    if append:
        mode = "at"
    else:
        mode = "wt"
    out = open(output, mode)
    exampleTotal = 0
    for sentenceIndex in range(len(sentences)):
        sentence = sentences[sentenceIndex]
        if goldSentences != None:
            goldSentence = goldSentences[sentenceIndex]
        else:
            goldSentence = [None]
        counter.update(1, "Building examples (" + sentence[0].getSentenceId() + "): ")
        sentenceExamples = self.buildExamples(sentence[0], goldSentence[0], append=append)
        exampleTotal += len(sentenceExamples)
        sentenceExamples = self.preProcessExamples(sentenceExamples)
        ExampleUtils.appendExamples(sentenceExamples, out)
    out.close()
    print >> sys.stderr, "Examples built:", exampleTotal
    print >> sys.stderr, "Features:", len(self.featureSet.getNames())
    #IF LOCAL
    if self.exampleStats.getExampleCount() > 0:
        self.exampleStats.printStats()
    #ENDIF
    # Save Ids
    if idFileTag != None:
        print >> sys.stderr, "Saving class names to", idFileTag + ".class_names"
        self.classSet.write(idFileTag + ".class_names")
        print >> sys.stderr, "Saving feature names to", idFileTag + ".feature_names"
        self.featureSet.write(idFileTag + ".feature_names")
def polynomizeExamples(exampleFile, outFile, weightFeatures, idSet): outFile = open(outFile, "wt") addCount = 0 f = open(exampleFile) numExamples = sum([1 for line in f]) f.close() counter = ProgressCounter(numExamples, "Polynomize examples", step=0) weightFeatureIds = {} for weightFeature in weightFeatures: wId = idSet.getId(weightFeature, False) if wId == None: sys.exit("Weight vector feature", weightFeature, "not in id file") weightFeatureIds[weightFeature] = wId print "Polynomizing", exampleFile exampleCache = [] for example in ExampleUtils.readExamples(exampleFile): counter.update(1, "Processing example (" + example[0] + "): ") features = example[2] for i in range(len(weightFeatures) - 1): wI = weightFeatures[i] wIid = weightFeatureIds[wI] if not features.has_key(wIid): continue for j in range(i + 1, len(weightFeatures)): wJ = weightFeatures[j] wJid = weightFeatureIds[wJ] if not features.has_key(wJid): continue # Make polynomial feature features[idSet.getId(wI + "_AND_" + wJ)] = 1 addCount += 1 exampleCache.append(example) if len(exampleCache) > 50: ExampleUtils.appendExamples(exampleCache, outFile) exampleCache = [] ExampleUtils.appendExamples(exampleCache, outFile) outFile.close() print "Added", addCount, "polynomial features"
def polynomizeExamples(exampleFile, outFile, weightFeatures, idSet): outFile = open(outFile, "wt") addCount = 0 f = open(exampleFile) numExamples = sum([1 for line in f]) f.close() counter = ProgressCounter(numExamples, "Polynomize examples", step=0) weightFeatureIds = {} for weightFeature in weightFeatures: wId = idSet.getId(weightFeature, False) if wId == None: sys.exit("Weight vector feature", weightFeature, "not in id file") weightFeatureIds[weightFeature] = wId print "Polynomizing", exampleFile exampleCache = [] for example in ExampleUtils.readExamples(exampleFile): counter.update(1, "Processing example ("+example[0]+"): ") features = example[2] for i in range(len(weightFeatures)-1): wI = weightFeatures[i] wIid = weightFeatureIds[wI] if not features.has_key(wIid): continue for j in range(i + 1, len(weightFeatures)): wJ = weightFeatures[j] wJid = weightFeatureIds[wJ] if not features.has_key(wJid): continue # Make polynomial feature features[idSet.getId(wI + "_AND_" + wJ)] = 1 addCount += 1 exampleCache.append(example) if len(exampleCache) > 50: ExampleUtils.appendExamples(exampleCache, outFile) exampleCache = [] ExampleUtils.appendExamples(exampleCache, outFile) outFile.close() print "Added", addCount, "polynomial features"
def buildExamples(exampleBuilder, sentences, outfilename):
    """Run *exampleBuilder* over every sentence and write the examples
    to *outfilename*, reporting counts and elapsed time to stderr."""
    timer = Timer()
    # The graph-kernel style uses a zero-step progress counter
    if "graph_kernel" not in exampleBuilder.styles:
        progress = ProgressCounter(len(sentences), "Build examples")
    else:
        progress = ProgressCounter(len(sentences), "Build examples", 0)
    calculatePredictedRange(exampleBuilder, sentences)
    out = open(outfilename, "wt")
    total = 0
    for sentence in sentences:
        progress.update(1, "Building examples (" + sentence[0].getSentenceId() + "): ")
        built = exampleBuilder.buildExamples(sentence[0])
        total += len(built)
        built = exampleBuilder.preProcessExamples(built)
        Example.appendExamples(built, out)
    out.close()
    print >> sys.stderr, "Examples built:", str(total)
    print >> sys.stderr, "Features:", len(exampleBuilder.featureSet.getNames())
    print >> sys.stderr, "Elapsed", timer.toString()
def buildExamples(exampleBuilder, sentences, outfilename):
    """Build examples for *sentences* with *exampleBuilder*, appending each
    sentence's (pre-processed) examples to *outfilename*."""
    timer = Timer()
    counter = (ProgressCounter(len(sentences), "Build examples", 0)
               if "graph_kernel" in exampleBuilder.styles
               else ProgressCounter(len(sentences), "Build examples"))
    calculatePredictedRange(exampleBuilder, sentences)
    outputFile = open(outfilename, "wt")
    exampleTotal = 0
    for sentence in sentences:
        counter.update(1, "Building examples (" + sentence[0].getSentenceId() + "): ")
        sentenceExamples = exampleBuilder.buildExamples(sentence[0])
        exampleTotal += len(sentenceExamples)
        sentenceExamples = exampleBuilder.preProcessExamples(sentenceExamples)
        Example.appendExamples(sentenceExamples, outputFile)
    outputFile.close()
    print >> sys.stderr, "Examples built:", str(exampleTotal)
    print >> sys.stderr, "Features:", len(exampleBuilder.featureSet.getNames())
    print >> sys.stderr, "Elapsed", timer.toString()
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
    """
    Build one example for each token of the sentence.

    sentenceGraph:     the sentence graph providing tokens, entities and
                       dependency edges.
    outfile:           open example file; examples are appended one at a time.
    goldGraph:         unused here (kept for interface parity with the other
                       builders) — NOTE(review): confirm against callers.
    structureAnalyzer: optional; used to decide whether sentences without
                       given entities still get trigger examples.

    Returns the number of examples written.
    """
    if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
        print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get("origId")
        return 0 #[]

    #examples = []
    exampleIndex = 0
    # Per-sentence caches (reset for every call)
    self.tokenFeatures = {}
    self.tokenFeatureWeights = {}

    # determine (manually or automatically) the setting for whether sentences with no given entities should be skipped
    buildForNameless = False
    if structureAnalyzer and not structureAnalyzer.hasGroupClass("GIVEN", "ENTITY"): # no given entities points to no separate NER program being used
        buildForNameless = True
    if self.styles["build_for_nameless"]: # manually force the setting
        buildForNameless = True
    if self.styles["skip_for_nameless"]: # manually force the setting
        buildForNameless = False

    # determine whether sentences with no given entities should be skipped
    namedEntityHeadTokens = []
    if not self.styles["names"]:
        namedEntityCount = 0
        for entity in sentenceGraph.entities:
            if entity.get("given") == "True": # known data which can be used for features
                namedEntityCount += 1
        namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
        # NOTE!!! This will change the number of examples and omit
        # all triggers (positive and negative) from sentences which
        # have no NE:s, possibly giving a too-optimistic performance
        # value. Such sentences can still have triggers from intersentence
        # interactions, but as such events cannot be recovered anyway,
        # looking for these triggers would be pointless.
        if namedEntityCount == 0 and not buildForNameless: # no names, no need for triggers
            return 0 #[]
        if self.styles["pos_pairs"]:
            namedEntityHeadTokens = self.getNamedEntityHeadTokens(sentenceGraph)
    else:
        # "names" style: treat no token as a name
        for key in sentenceGraph.tokenIsName.keys():
            sentenceGraph.tokenIsName[key] = False

    # Bag-of-words counts over the whole sentence; name tokens are counted
    # a second time under an "ne_" prefix
    bagOfWords = {}
    for token in sentenceGraph.tokens:
        text = "bow_" + token.get("text")
        if not bagOfWords.has_key(text):
            bagOfWords[text] = 0
        bagOfWords[text] += 1
        if sentenceGraph.tokenIsName[token]:
            text = "ne_" + text
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
    bowFeatures = {}
    for k in sorted(bagOfWords.keys()):
        bowFeatures[self.featureSet.getId(k)] = bagOfWords[k]

    # Pre-index the dependency edges attached to each token
    self.inEdgesByToken = {}
    self.outEdgesByToken = {}
    self.edgeSetByToken = {}
    for token in sentenceGraph.tokens:
        #inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
        #fixedInEdges = []
        #for edge in inEdges:
        #    fixedInEdges.append( (edge[0], edge[1], edge[2]["element"]) )
        #inEdges = fixedInEdges
        inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
        #inEdges.sort(compareDependencyEdgesById)
        self.inEdgesByToken[token] = inEdges
        #outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True)
        #fixedOutEdges = []
        #for edge in outEdges:
        #    fixedOutEdges.append( (edge[0], edge[1], edge[2]["element"]) )
        #outEdges = fixedOutEdges
        outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
        #outEdges.sort(compareDependencyEdgesById)
        self.outEdgesByToken[token] = outEdges
        self.edgeSetByToken[token] = set(inEdges + outEdges)

    # One example per token
    for i in range(len(sentenceGraph.tokens)):
        token = sentenceGraph.tokens[i]

        # CLASS: merged entity type of the entities headed by this token,
        # or "neg" when the token heads no entity
        if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
            categoryName, entityIds = self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token])
        else:
            categoryName, entityIds = "neg", None
        self.exampleStats.beginExample(categoryName)

        # Recognize only non-named entities (i.e. interaction words)
        if sentenceGraph.tokenIsName[token] and not self.styles["names"] and not self.styles["all_tokens"]:
            self.exampleStats.filter("name")
            self.exampleStats.endExample()
            continue
        # if "selftrain_limits" in self.styles:
        #     # any predicted entity not part of the self-training set causes example to be rejected
        #     filtered = False
        #     for entity in sentenceGraph.tokenIsEntityHead[token]:
        #         if entity.get("selftrain") == "False":
        #             self.exampleStats.filter("selftrain_limits")
        #             self.exampleStats.endExample()
        #             filtered = True
        #             break
        #     if filtered:
        #         continue
        # if "selftrain_group" in self.styles:
        #     # any predicted entity not part of the self-training set causes example to be rejected
        #     filtered = False
        #     for entity in sentenceGraph.tokenIsEntityHead[token]:
        #         if entity.get("selftraingroup") not in self.selfTrainGroups:
        #             self.exampleStats.filter("selftrain_group")
        #             self.exampleStats.endExample()
        #             filtered = True
        #             break
        #     if filtered:
        #         continue
        if self.styles["pos_only"] and categoryName == "neg":
            self.exampleStats.filter("pos_only")
            self.exampleStats.endExample()
            continue

        category = self.classSet.getId(categoryName)
        if category == None:
            self.exampleStats.filter("undefined_class")
            self.exampleStats.endExample()
            continue

        tokenText = token.get("text").lower()
        # if "stem_gazetteer" in self.styles:
        #     tokenText = PorterStemmer.stem(tokenText)
        # if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer:
        #     features = {}
        #     features[self.featureSet.getId("exclude_gazetteer")] = 1
        #     extra = {"xtype":"token","t":token.get("id"),"excluded":"True"}
        #     if entityIds != None:
        #         extra["goldIds"] = entityIds
        #     #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
        #     ExampleUtils.appendExamples([(sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)], outfile)
        #     exampleIndex += 1
        #     continue

        # FEATURES
        features = {}

        if not self.styles["names"]:
            # namedEntityCountFeature is always bound on this path: the
            # "names" style was also False when it was assigned above
            features[self.featureSet.getId(namedEntityCountFeature)] = 1
        #for k,v in bagOfWords.iteritems():
        #    features[self.featureSet.getId(k)] = v

        # pre-calculate bow _features_
        features.update(bowFeatures)

        # for j in range(len(sentenceGraph.tokens)):
        #     text = "bow_" + sentenceGraph.tokens[j].get("text")
        #     if j < i:
        #         features[self.featureSet.getId("bf_" + text)] = 1
        #     elif j > i:
        #         features[self.featureSet.getId("af_" + text)] = 1

        # Main features
        text = token.get("text")
        features[self.featureSet.getId("txt_" + text)] = 1
        features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
        stem = PorterStemmer.stem(text)
        features[self.featureSet.getId("stem_" + stem)] = 1
        features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1
        # Normalized versions of the string (if same as non-normalized, overlap without effect)
        normalizedText = text.replace("-", "").replace("/", "").replace(",", "").replace("\\", "").replace(" ", "").lower()
        if normalizedText == "bound": # should be for all irregular verbs
            normalizedText = "bind"
        features[self.featureSet.getId("txt_" + normalizedText)] = 1
        norStem = PorterStemmer.stem(normalizedText)
        features[self.featureSet.getId("stem_" + norStem)] = 1
        features[self.featureSet.getId("nonstem_" + normalizedText[len(norStem):])] = 1

        ## Subspan features
        #textLower = text.lower()
        #for i in range(1, len(textLower)):
        #    features[self.featureSet.getId("subspanbegin"+str(i)+"_"+textLower[0:i])] = 1
        #    features[self.featureSet.getId("subspanend"+str(i)+"_"+textLower[-i:])] = 1

        # Substring features: hyphen-separated parts and their stems
        for string in text.split("-"):
            stringLower = string.lower()
            features[self.featureSet.getId("substring_" + stringLower)] = 1
            features[self.featureSet.getId("substringstem_" + PorterStemmer.stem(stringLower))] = 1

        if not self.styles["no_context"]:
            # Linear order features
            for index in [-3, -2, -1, 1, 2, 3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)
            # Linear n-grams
            if self.styles["linear_ngrams"]:
                self.buildLinearNGram(max(0, i - 1), i, sentenceGraph, features)
                self.buildLinearNGram(max(0, i - 2), i, sentenceGraph, features)

        if self.styles["phospho"]:
            if text.find("hospho") != -1:
                features[self.featureSet.getId("phospho_found")] = 1
            features[self.featureSet.getId("begin_" + text[0:2].lower())] = 1
            features[self.featureSet.getId("begin_" + text[0:3].lower())] = 1

        if self.styles["bb_features"]:
            if text.lower() in self.bacteriaTokens:
                features[self.featureSet.getId("lpsnBacToken")] = 1

        # Content: capitalization, digits and punctuation inside the token
        if i > 0 and text[0].isalpha() and text[0].isupper():
            features[self.featureSet.getId("upper_case_start")] = 1
        for j in range(len(text)):
            if j > 0 and text[j].isalpha() and text[j].isupper():
                features[self.featureSet.getId("upper_case_middle")] = 1
            # numbers and special characters
            if text[j].isdigit():
                features[self.featureSet.getId("has_digits")] = 1
                if j > 0 and text[j - 1] == "-":
                    features[self.featureSet.getId("has_hyphenated_digit")] = 1
            elif text[j] == "-":
                features[self.featureSet.getId("has_hyphen")] = 1
            elif text[j] == "/":
                features[self.featureSet.getId("has_fslash")] = 1
            elif text[j] == "\\":
                features[self.featureSet.getId("has_bslash")] = 1
            # duplets
            if j > 0:
                features[self.featureSet.getId("dt_" + text[j - 1:j + 1].lower())] = 1
            # triplets
            if j > 1:
                features[self.featureSet.getId("tt_" + text[j - 2:j + 1].lower())] = 1
            # quadruplets (don't work, slight decrease (0.5 pp) on f-score
            #if j > 2:
            #    features[self.featureSet.getId("qt_"+text[j-3:j+1].lower())] = 1

        # Attached edges (Hanging in and out edges)
        if not self.styles["no_context"]:
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
                features[self.featureSet.getId("t1HIn_" + edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" + edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" + tokenText)] = 1
                tokenStem = PorterStemmer.stem(tokenText)
                features[self.featureSet.getId("t1HIn_" + tokenStem)] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" + tokenStem)] = 1
                features[self.featureSet.getId("t1HIn_" + norStem + "_" + edgeType + "_" + tokenStem)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
                features[self.featureSet.getId("t1HOut_" + edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" + edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" + tokenText)] = 1
                tokenStem = PorterStemmer.stem(tokenText)
                features[self.featureSet.getId("t1HOut_" + tokenStem)] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" + tokenStem)] = 1
                features[self.featureSet.getId("t1HOut_" + norStem + "_" + edgeType + "_" + tokenStem)] = 1

        # REL features
        if self.styles["rel_features"]:
            self.relFeatureBuilder.setFeatureVector(features)
            self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, i)
            self.relFeatureBuilder.setFeatureVector(None)
        # DDI13 features: all prefixes and suffixes of the normalized text
        if self.styles["ddi13_features"]:
            for index in range(len(normalizedText)):
                features[self.featureSet.getId("ddi13_fromstart" + str(index) + "_" + normalizedText[:index + 1])] = 1
                features[self.featureSet.getId("ddi13_fromend" + str(index) + "_" + normalizedText[index:])] = 1
        if self.styles["drugbank_features"]:
            self.drugFeatureBuilder.setFeatureVector(features)
            self.drugFeatureBuilder.tag = "ddi_"
            self.drugFeatureBuilder.buildDrugFeatures(token)
            self.drugFeatureBuilder.setFeatureVector(None)
        #self.wordNetFeatureBuilder.getTokenFeatures("show", "VBP")
        #tokTxt = token.get("text")
        #tokPOS = token.get("POS")
        #wordNetFeatures = []
        #wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
        #self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
        if self.styles["wordnet"]:
            tokTxt = token.get("text")
            tokPOS = token.get("POS")
            wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            for wordNetFeature in wordNetFeatures:
                #print wordNetFeature,
                features[self.featureSet.getId("WN_" + wordNetFeature)] = 1
            #print
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder.setFeatureVector(features)
            self.giulianoFeatureBuilder.buildTriggerFeatures(token, sentenceGraph)
            self.giulianoFeatureBuilder.setFeatureVector(None)
        if self.styles["ontobiotope_features"]:
            self.ontobiotopeFeatureBuilder.setFeatureVector(features)
            self.ontobiotopeFeatureBuilder.buildOBOFeaturesForToken(token)
            self.ontobiotopeFeatureBuilder.setFeatureVector(None)

        extra = {"xtype": "token", "t": token.get("id")}
        if self.styles["bb_features"]:
            extra["trigex"] = "bb" # Request trigger extension in ExampleWriter
        if self.styles["epi_merge_negated"]:
            extra["unmergeneg"] = "epi" # Request trigger type unmerging
        if entityIds != None:
            extra["goldIds"] = entityIds # The entities to which this example corresponds
        #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )

        if self.styles["bb_spans"]:
            # Match annotated spans whose head offset equals this token's offset
            for span in sentenceGraph.sentenceElement.iter("span"):
                if span.get("headOffset") != token.get("charOffset"):
                    continue
                #if span.get("source") != "spec":
                #    continue
                #print span.get("headOffset"), token.get("charOffset"), span.get("source"), token.get("id")
                features[self.featureSet.getId("span_found")] = 1
                features[self.featureSet.getId("span_count")] = 1 + features.get(self.featureSet.getId("span_count"), 0)
                features[self.featureSet.getId("span_identifier" + span.get("identifier"))] = 1
                features[self.featureSet.getId("span_type" + span.get("type"))] = 1
                features[self.featureSet.getId("span_category" + span.get("category"))] = 1
                features[self.featureSet.getId("span_source" + span.get("source"))] = 1

                # Keep the widest matching span's offset in "define_offset"
                if "define_offset" in extra:
                    prevOffset = [int(x) for x in extra["define_offset"].split("-")]
                    assert len(prevOffset) == 2
                    newOffset = [int(x) for x in span.get("charOffset").split("-")]
                    assert len(newOffset) == 2
                    prevOffsetRange = abs(prevOffset[0] - prevOffset[1])
                    newOffsetRange = abs(newOffset[0] - newOffset[1])
                    if newOffsetRange > prevOffsetRange:
                        extra["define_offset"] = span.get("charOffset")
                else:
                    extra["define_offset"] = span.get("charOffset")
            features[self.featureSet.getId("span_count_" + str(features.get(self.featureSet.getId("span_count"), 0)))] = 1

        # chains
        if not self.styles["no_context"]:
            self.buildChains(token, sentenceGraph, features)

        if self.styles["pos_pairs"]:
            self.buildPOSPairs(token, namedEntityHeadTokens, features)

        if self.styles["wordvector"]:
            self.wordVectorFeatureBuilder.setFeatureVector(features)
            self.wordVectorFeatureBuilder.buildFeatures(token)
            self.wordVectorFeatureBuilder.setFeatureVector(None)

        example = (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)
        ExampleUtils.appendExamples([example], outfile)
        exampleIndex += 1
        self.exampleStats.endExample()
    #return examples
    return exampleIndex
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph = None, structureAnalyzer=None):
    """
    Build examples for a single sentence. Returns a list of examples.
    See Core/ExampleUtils for example format.

    One example is generated per unordered pair of entities (or tokens,
    with the "token_nodes" style); examples are written directly to
    *outfile* and the number of examples written is returned.
    """
    #examples = []
    exampleIndex = 0
    # example directionality
    if self.styles["directed"] == None and self.styles["undirected"] == None: # determine directedness from corpus
        examplesAreDirected = structureAnalyzer.hasDirectedTargets() if structureAnalyzer != None else True
    elif self.styles["directed"]:
        assert self.styles["undirected"] in [None, False]
        examplesAreDirected = True
    elif self.styles["undirected"]:
        assert self.styles["directed"] in [None, False]
        examplesAreDirected = False

    if not self.styles["no_trigger_features"]:
        self.triggerFeatureBuilder.initSentence(sentenceGraph)
    if self.styles["evex"]:
        self.evexFeatureBuilder.initSentence(sentenceGraph)
    # if self.styles["sdb_merge"]:
    #     self.determineNonOverlappingTypes(structureAnalyzer)

    # Filter entities, if needed
    sentenceGraph.mergeInteractionGraph(True)
    entities = sentenceGraph.mergedEntities
    entityToDuplicates = sentenceGraph.mergedEntityToDuplicates
    self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))

    # Connect to optional gold graph
    entityToGold = None
    if goldGraph != None:
        entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)

    # Shortest-path structure over the undirected dependency graph
    paths = None
    if not self.styles["no_path"]:
        undirected = sentenceGraph.dependencyGraph.toUndirected()
        paths = undirected
        if self.styles["filter_shortest_path"] != None: # For DDI use filter_shortest_path=conj_and
            paths.resetAnalyses() # just in case
            paths.FloydWarshall(self.filterEdge, {"edgeTypes":self.styles["filter_shortest_path"]})

    # Generate examples based on interactions between entities or interactions between tokens
    if self.styles["token_nodes"]:
        loopRange = len(sentenceGraph.tokens)
    else:
        loopRange = len(entities)
    for i in range(loopRange-1):
        for j in range(i+1,loopRange):
            eI = None
            eJ = None
            if self.styles["token_nodes"]:
                tI = sentenceGraph.tokens[i]
                tJ = sentenceGraph.tokens[j]
            else:
                eI = entities[i]
                eJ = entities[j]
                tI = sentenceGraph.entityHeadTokenByEntity[eI]
                tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                if eI.get("type") == "neg" or eJ.get("type") == "neg":
                    continue
                if self.styles["skip_extra_triggers"]:
                    if eI.get("source") != None or eJ.get("source") != None:
                        continue
            # only consider paths between entities (NOTE! entities, not only named entities)
            if self.styles["headsOnly"]:
                if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                    continue
            # Delegate feature construction for this candidate pair
            examples = self.buildExamplesForPair(tI, tJ, paths, sentenceGraph, goldGraph, entityToGold, eI, eJ, structureAnalyzer, examplesAreDirected)
            for categoryName, features, extra in examples:
                # make example
                if self.styles["binary"]:
                    if categoryName != "neg":
                        category = 1
                    else:
                        category = -1
                    extra["categoryName"] = "i"
                else:
                    category = self.classSet.getId(categoryName)
                example = [sentenceGraph.getSentenceId()+".x"+str(exampleIndex), category, features, extra]
                ExampleUtils.appendExamples([example], outfile)
                exampleIndex += 1
    return exampleIndex
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
    """
    Build examples for a single sentence. Returns a list of examples.
    See Core/ExampleUtils for example format.

    For every (merged) entity, all valid combinations of its outgoing
    argument interactions are enumerated; each combination becomes one
    example, labelled by the entity type when the combination matches a
    gold event and "neg" otherwise. Examples are appended to *outfile*
    and the number written is returned.
    """
    self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
    self.triggerFeatureBuilder.initSentence(sentenceGraph)

    exampleIndex = 0
    undirected = sentenceGraph.dependencyGraph.toUndirected()
    paths = undirected

    # Get argument order
    # (attribute name spelled "interactionLenghts" (sic) — kept as-is since
    # other project code may read it)
    self.interactionLenghts = self.getInteractionEdgeLengths(sentenceGraph, paths)

    # Map tokens to character offsets
    tokenByOffset = {}
    for i in range(len(sentenceGraph.tokens)):
        token = sentenceGraph.tokens[i]
        if goldGraph != None: # check that the tokenizations match
            goldToken = goldGraph.tokens[i]
            assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
        tokenByOffset[token.get("charOffset")] = token.get("id")

    # Map gold entities to their head offsets
    goldEntitiesByOffset = {}
    if goldGraph != None:
        for entity in goldGraph.entities:
            offset = entity.get("headOffset")
            assert offset != None
            if not goldEntitiesByOffset.has_key(offset):
                goldEntitiesByOffset[offset] = []
            goldEntitiesByOffset[offset].append(entity)

    if self.styles["no_merge"]:
        mergeInput = False
        entities = sentenceGraph.entities
    else:
        # Entered here - Mu
        # The entities here include both named entities(Protein) and event triggers
        # The purpose of merging the entities is to convert the original gold annotation, where
        # a trigger can have multiple trigger annotations, to the merged version.
        mergeInput = True
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
    # pdb.set_trace()
    # if len(sentenceGraph.entities) != len(sentenceGraph.mergedEntities):
    #     pdb.set_trace()
    # Up to here, the merged graph has been built for one sentence - Mu
    # sentenceGraph_return = sentenceGraph
    # with open('./GE09_train_graph/merged-'+ sentenceGraph.sentenceElement.get('id'), 'wb') as f:
    #     pickle.dump(sentenceGraph, f)
    # with open('./GE09_train_graph/gold-'+ goldGraph.sentenceElement.get('id'), 'wb') as f:
    #     pickle.dump(goldGraph, f)

    exampleIndex = 0
    for entity in entities: # sentenceGraph.entities:
        if type(entity) in types.StringTypes: # dummy entity for intersentence interactions
            continue
        eType = entity.get("type")
        assert eType != None, entity.attrib
        eType = str(eType)

        # Collect this entity's outgoing interactions and keep, per argument
        # type, those that are structurally valid
        interactions = [x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput)]
        interactions = self.sortInteractionsById(interactions)
        interactionCounts = defaultdict(int)
        validInteractionsByType = defaultdict(list)
        for interaction in interactions: # interactions are outgoing edges for the current entity - Mu
            if interaction.get("event") != "True":
                continue
            e1 = sentenceGraph.entitiesById[interaction.get("e1")]
            if interaction.get("e2") in sentenceGraph.entitiesById:
                e2 = sentenceGraph.entitiesById[interaction.get("e2")]
                if interaction.get("type") in structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type")):
                    validInteractionsByType[interaction.get("type")].append(interaction)
            else: # intersentence
                validInteractionsByType[interaction.get("type")].append(interaction)
            interactionCounts[interaction.get("type")] += 1
        interactionCountString = ",".join([key + "=" + str(interactionCounts[key]) for key in sorted(interactionCounts.keys())])
        # pdb.set_trace()
        #argCombinations = self.getArgumentCombinations(eType, interactions, entity.get("id"))
        intCombinations = []
        validIntTypeCount = 0
        maxArgCount = 0
        if self.debug:
            print >> sys.stderr, entity.get("id"), entity.get("type"), "int:" + interactionCountString, "validInt:" + str(validInteractionsByType)
        # pdb.set_trace()
        # if 'Theme' in validInteractionsByType.keys() and 'Cause' in validInteractionsByType:
        #     pdb.set_trace()
        for intType in sorted(validInteractionsByType.keys()): # for each argument type the event can have
            validIntTypeCount += 1
            intCombinations.append([])
            minArgs, maxArgs = structureAnalyzer.getArgLimits(entity.get("type"), intType)
            if maxArgs > maxArgCount:
                maxArgCount = maxArgs
            #if maxArgs > 1: # allow any number of arguments for cases like Binding
            #    maxArgs = len(validInteractionsByType[intType])
            for combLen in range(minArgs, maxArgs+1): # for each valid argument count, get all possible combinations. note that there may be zero-lenght combination
                for singleTypeArgCombination in combinations(validInteractionsByType[intType], combLen):
                    intCombinations[-1].append(singleTypeArgCombination)
            # e.g. theme:[a,b], cause:[d] = [[(), (d,)], [(a,), (b,)]] - Mu
        # pdb.set_trace()
        # intCombinations now contains a list of lists, each of which has a tuple for each valid combination
        # of one argument type. Next, we'll make all valid combinations of multiple argument types
        if self.debug:
            print >> sys.stderr, " ", "intCombinations", intCombinations
        argCombinations = combine.combine(*intCombinations)
        if self.debug:
            print >> sys.stderr, " ", "argCombinations", argCombinations
        for i in range(len(argCombinations)):
            argCombinations[i] = sum(argCombinations[i], ())
        # Up to here, all possible interaction combinations are found - Mu
        # Note this is for each trigger - Mu
        #sum(argCombinations, []) # flatten nested list
        argCombinations_return = argCombinations
        if self.debug:
            print >> sys.stderr, " ", "argCombinations flat", argCombinations
        # if len(sentenceGraph.entities) != len(sentenceGraph.mergedEntities) and len(argCombinations) != 0:
        #     if sentenceGraph.sentenceElement.get('id') == 'GE09.d167.s1':
        #         pdb.set_trace()
        for argCombination in argCombinations:
            # Originally binary classification
            # (Experimental filters, kept for reference: variants that kept only
            # the longest argument combination for Regulation / Binding /
            # Localization / Phosphorylation event types - Mu)
            # if entity.get('type') in ['Negative_regulation', 'Positive_regulation', 'Regulation']:
            #     maxArgCombinationLen = max([len(i) for i in argCombinations])
            #     if len(argCombination) != maxArgCombinationLen:
            #         # meaning that for Regulation classes, there are plausible association of both
            #         # (Theme, Cause) and (Theme). And we always choose (Theme, Cause) and ignore (Theme)
            #         continue
            # if entity.get('type') in ['Binding']:
            #     maxArgCombinationLen = max([len(i) for i in argCombinations])
            #     if len(argCombination) != maxArgCombinationLen:
            #         # meaning that for binding events, only take the longest ones.
            #         continue
            # if entity.get('type') in ['Localization', 'Phosphorylation']:
            #     maxArgCombinationLen = max([len(i) for i in argCombinations])
            #     if len(argCombination) != maxArgCombinationLen:
            #         # meaning that for binding events, only take the longest ones.
            #         continue
            if goldGraph != None:
                isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset, goldGraph.interactions)
                #if eType == "Binding":
                #    print argCombination[0].get("e1"), len(argCombination), isGoldEvent
            else:
                isGoldEvent = False
            # Named (multi-)class
            if isGoldEvent:
                # category = "zeroArg"
                # if validIntTypeCount == 1:
                #     category = "singleArg" # event has 0-1 arguments (old simple6)
                # if validIntTypeCount > 1:
                #     category = "multiType" # event has arguments of several types, 0-1 of each (old Regulation)
                # if maxArgCount > 1:
                #     category = "multiArg" # event can have 2-n of at least one argument type (old Binding)
                if self.styles["binary"]:
                    category = "pos"
                else: # Entered here, since self.styles["binary"] is None - Mu
                    category = entity.get("type")
                assert category != None
            else:
                category = "neg"
            self.exampleStats.beginExample(category)

            issues = defaultdict(int)
            # early out for proteins etc.
            if validIntTypeCount == 0 and entity.get("given") == "True":
                self.exampleStats.filter("given-leaf:" + entity.get("type"))
                if self.debug:
                    print >> sys.stderr, " ", category + "(" + eType + ")", "arg combination", argCombination, "LEAF"
            elif structureAnalyzer.isValidEntity(entity) or structureAnalyzer.isValidEvent(entity, argCombination, self.documentEntitiesById, noUpperLimitBeyondOne=self.styles["no_arg_count_upper_limit"], issues=issues):
                if self.debug:
                    print >> sys.stderr, " ", category, "arg combination", argCombination, "VALID"
                # Encode the argument list as "type=id,type=id,..."
                argString = ""
                for arg in argCombination:
                    argString += "," + arg.get("type") + "=" + arg.get("id")
                extra = {"xtype":"um","e":entity.get("id"),"i":argString[1:],"etype":eType,"class":category}
                extra["allInt"] = interactionCountString
                assert type(extra["etype"]) in types.StringTypes, extra
                assert type(extra["class"]) in types.StringTypes, category
                assert type(extra["i"]) in types.StringTypes, argString
                example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions)
                example[0] = sentenceGraph.getSentenceId()+".x"+str(exampleIndex)
                example[1] = self.classSet.getId(category)
                example[3] = extra
                #examples.append( example )
                ExampleUtils.appendExamples([example], outfile)
                exampleIndex += 1
            else: # not a valid event or valid entity
                if len(issues) == 0: # must be > 0 so that it gets filtered
                    if not structureAnalyzer.isValidEntity(entity):
                        issues["INVALID_ENTITY:"+eType] += 1
                    else:
                        issues["UNKNOWN_ISSUE_FOR:"+eType] += 1
                for key in issues:
                    self.exampleStats.filter(key)
                if self.debug:
                    print >> sys.stderr, " ", category, "arg combination", argCombination, "INVALID", issues
            self.exampleStats.endExample()
    #return examples
    return exampleIndex#, sentenceGraph_return, argCombinations_return
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
    """
    Build one example for each phrase in the sentence.

    Phrases are extracted from the parse (NP/WHADVP/WHNP plus IN- and
    token-subphrases), one classification example is generated per phrase,
    and each example is appended to *outfile* immediately.

    Returns the number of examples written (exampleIndex).
    """
    self.triggerFeatureBuilder.initSentence(sentenceGraph)
    #examples = []
    exampleIndex = 0
    # Prepare phrases, create subphrases
    #filter = set(["NP", "TOK-IN", "WHADVP", "WHNP", "TOK-WP$", "TOK-PRP$", "NP-IN"])
    phrases = MapPhrases.getPhrases(sentenceGraph.parseElement, sentenceGraph.tokens, set(["NP", "WHADVP", "WHNP"]))
    phraseDict = MapPhrases.getPhraseDict(phrases)
    phrases.extend(MapPhrases.makeINSubPhrases(phrases, sentenceGraph.tokens, phraseDict, ["NP"]))
    phrases.extend(MapPhrases.makeTokenSubPhrases(sentenceGraph.tokens, phraseDict))
    phraseToEntity = MapPhrases.getPhraseEntityMapping(sentenceGraph.entities, phraseDict)
    # Make counts: accumulate per-sentence phrase-type counts into the
    # instance-level tally (self.phraseTypeCounts)
    phraseTypeCounts = MapPhrases.getPhraseTypeCounts(phrases)
    for key in phraseTypeCounts.keys():
        if not self.phraseTypeCounts.has_key(key):
            self.phraseTypeCounts[key] = 0
        self.phraseTypeCounts[key] += phraseTypeCounts[key]
    self.exampleStats.addVariable("Phrase type counts", self.phraseTypeCounts)  # can be added on each loop, will always point to the same thing
    # Build one example for each phrase
    for phrase in phrases:
        features = {}
        self.triggerFeatureBuilder.setFeatureVector(features)
        categoryName = self.getCategoryName(phrase, phraseToEntity)
        category = self.classSet.getId(categoryName)
        phraseTokens = self.getPhraseTokens(phrase, sentenceGraph)
        phraseHeadToken = self.getPhraseHeadToken(phrase, phraseTokens)
        self.exampleStats.beginExample(categoryName)
        # Optional co-reference task filter: skip phrases that cannot be triggers
        if self.styles["co_limits"] and not self.isPotentialCOTrigger(phrase, phraseTokens, sentenceGraph):
            self.exampleStats.filter("co_limits")
            self.exampleStats.endExample()
            continue
        # Sentence level features
        features.update(self.triggerFeatureBuilder.bowFeatures)
        # Whole phrase features
        self.buildLinearNGram(phraseTokens, sentenceGraph, features)
        features[self.featureSet.getId("pType_" + phrase.get("type"))] = 1
        for split in phrase.get("type").split("-"):
            features[self.featureSet.getId("pSubType_" + split)] = 1
        # Check named entities: count tokens of the phrase marked as names
        nameCount = 0
        for token in phraseTokens:
            if sentenceGraph.tokenIsName[token]:
                nameCount += 1
        features[self.featureSet.getId("phraseNames_" + str(nameCount))] = 1
        features[self.featureSet.getId("phraseNameCount")] = nameCount
        # Head token features
        self.triggerFeatureBuilder.setTag("head_")
        self.triggerFeatureBuilder.buildFeatures(phraseHeadToken)
        self.triggerFeatureBuilder.buildAttachedEdgeFeatures(phraseHeadToken, sentenceGraph)
        self.triggerFeatureBuilder.setTag()
        # Features for all phrase tokens
        # NOTE(review): the loop variable `token` is never used below;
        # features are built for `phraseHeadToken` on every iteration,
        # only the position tag changes. Possibly `token` was intended —
        # confirm against the original experiments before changing.
        self.triggerFeatureBuilder.setTag("ptok_")
        phraseTokenPos = 0
        #print len(phraseTokens)
        for token in phraseTokens:
            self.triggerFeatureBuilder.setTag("ptok_")
            self.triggerFeatureBuilder.buildFeatures(phraseHeadToken, linear=False, chains=False)
            self.triggerFeatureBuilder.setTag("ptok_" + str(phraseTokenPos) + "_")
            self.triggerFeatureBuilder.buildFeatures(phraseHeadToken, linear=False, chains=False)
            self.triggerFeatureBuilder.setTag("ptok_" + str(phraseTokenPos - len(phraseTokens)) + "_")
            self.triggerFeatureBuilder.buildFeatures(phraseHeadToken, linear=False, chains=False)
            #self.triggerFeatureBuilder.buildAttachedEdgeFeatures(phraseHeadToken)
            phraseTokenPos += 1
        self.triggerFeatureBuilder.setTag()
        # Example metadata: head token id, phrase id/type/offset, and the
        # ids of the entities mapped to this phrase ("neg" if none)
        extra = {"xtype": "phrase", "t": phraseHeadToken.get("id"), "p": phrase.get("id"), "ptype": phrase.get("type")}
        extra["charOffset"] = phrase.get("charOffset")
        if phrase not in phraseToEntity:
            extra["eids"] = "neg"
        else:
            extra["eids"] = ",".join([x.get("id") for x in phraseToEntity[phrase]])
        example = (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)
        ExampleUtils.appendExamples([example], outfile)
        self.exampleStats.endExample()
        exampleIndex += 1
    # Mark missed entities in exampleStats: non-given entities that no
    # extracted phrase covers can never be recovered by this classifier
    linkedEntities = set(sum(phraseToEntity.values(), []))
    for entity in sentenceGraph.entities:
        if entity.get("given") != "True" and entity not in linkedEntities:
            self.exampleStats.addValue("Entities with no phrase", 1)
            # Marking these as filtered examples was misleading, as examples are per phrase, and these are entities
            #self.exampleStats.beginExample(entity.get("type"))
            #self.exampleStats.filter("no_phrase")
            #self.exampleStats.endExample()
    return exampleIndex
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
    """
    Build examples for a single sentence. Returns a list of examples.
    See Core/ExampleUtils for example format.

    One example is generated per ordered pair (directed mode) or unordered
    pair (undirected mode) of entities — or of tokens when the "entities"
    style is off. Examples are appended to *outfile* as they are built and
    the method returns the number written (exampleIndex), despite the
    original docstring wording above.
    """
    #examples = []
    exampleIndex = 0
    if self.styles["trigger_features"]:
        self.triggerFeatureBuilder.initSentence(sentenceGraph)
    if self.styles["evex"]:
        self.evexFeatureBuilder.initSentence(sentenceGraph)
    # Filter entities, if needed
    #mergedIds = None
    #duplicateEntities = None
    #entities = sentenceGraph.entities
    #entities, mergedIds, duplicateEntities = self.mergeEntities(sentenceGraph, False) # "no_duplicates" in self.styles)
    sentenceGraph.mergeInteractionGraph(True)
    entities = sentenceGraph.mergedEntities
    # NOTE(review): entityToDuplicates is assigned but not read anywhere in
    # this method — kept for parity with sentenceGraph state; confirm before removal.
    entityToDuplicates = sentenceGraph.mergedEntityToDuplicates
    self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
    # Connect to optional gold graph; entityToGold is only defined (and only
    # used) when a gold graph is supplied
    if goldGraph != None:
        entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)
    paths = None
    if not self.styles["no_path"]:
        ##undirected = sentenceGraph.getUndirectedDependencyGraph()
        #undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
        ###undirected = sentenceGraph.dependencyGraph.to_undirected()
        ####undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work
        undirected = sentenceGraph.dependencyGraph.toUndirected()
        #paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        paths = undirected
    #for edge in sentenceGraph.dependencyGraph.edges:
    #    assert edge[2] != None
    #for edge in undirected.edges:
    #    assert edge[2] != None
    #if sentenceGraph.sentenceElement.get("id") == "GENIA.d70.s5":
    #    print [(x[0].get("id"), x[1].get("id"), x[2].get("id")) for x in sentenceGraph.dependencyGraph.edges]
    # Generate examples based on interactions between entities or interactions between tokens
    if self.styles["entities"]:
        loopRange = len(entities)
    else:
        loopRange = len(sentenceGraph.tokens)
    for i in range(loopRange - 1):
        for j in range(i + 1, loopRange):
            eI = None
            eJ = None
            if self.styles["entities"]:
                eI = entities[i]
                eJ = entities[j]
                tI = sentenceGraph.entityHeadTokenByEntity[eI]
                tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                #if "no_ne_interactions" in self.styles and eI.get("isName") == "True" and eJ.get("isName") == "True":
                #    continue
                if eI.get("type") == "neg" or eJ.get("type") == "neg":
                    continue
                if self.styles["skip_extra_triggers"]:
                    if eI.get("source") != None or eJ.get("source") != None:
                        continue
            else:
                tI = sentenceGraph.tokens[i]
                tJ = sentenceGraph.tokens[j]
            # only consider paths between entities (NOTE! entities, not only named entities)
            if self.styles["headsOnly"]:
                if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                    continue
            if self.styles["directed"]:
                # define forward: category for the (eI -> eJ) direction
                if self.styles["entities"]:
                    categoryName = self.getCategoryName(sentenceGraph, eI, eJ, True)
                    if goldGraph != None:
                        categoryName = self.getGoldCategoryName(goldGraph, entityToGold, eI, eJ, True)
                else:
                    categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, True)
                # make forward: each task-specific *_limits style can veto the
                # example; all applicable filters are still recorded in stats
                self.exampleStats.beginExample(categoryName)
                makeExample = True
                if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(eI, eJ):
                    makeExample = False
                    self.exampleStats.filter("genia_limits")
                if self.styles["genia_task1"] and (eI.get("type") == "Entity" or eJ.get("type") == "Entity"):
                    makeExample = False
                    self.exampleStats.filter("genia_task1")
                if self.styles["rel_limits"] and not self.isPotentialRELInteraction(eI, eJ):
                    makeExample = False
                    self.exampleStats.filter("rel_limits")
                if self.styles["co_limits"] and not self.isPotentialCOInteraction(eI, eJ, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("co_limits")
                if self.styles["bb_limits"] and not self.isPotentialBBInteraction(eI, eJ, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("bb_limits")
                    if categoryName != "neg":
                        self.exampleStats.filter("bb_limits(" + categoryName + ":" + eI.get("type") + "/" + eJ.get("type") + ")")
                if self.styles["bi_limits"] and not self.isPotentialBIInteraction(eI, eJ, sentenceGraph, self.exampleStats):
                    makeExample = False
                    #self.exampleStats.filter("bi_limits")
                if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(eI, eJ, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("epi_limits")
                if self.styles["id_limits"] and not self.isPotentialIDInteraction(eI, eJ, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("id_limits")
                #if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
                #    makeExample = False
                #    self.exampleStats.filter("selftrain_limits")
                #if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
                #    makeExample = False
                #    self.exampleStats.filter("selftrain_group")
                if self.styles["pos_only"] and categoryName == "neg":
                    makeExample = False
                    self.exampleStats.filter("pos_only")
                if makeExample:
                    #examples.append( self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ) )
                    ExampleUtils.appendExamples([self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ)], outfile)
                    exampleIndex += 1
                self.exampleStats.endExample()
                # define reverse: same pair in the (eJ -> eI) direction
                if self.styles["entities"]:
                    categoryName = self.getCategoryName(sentenceGraph, eJ, eI, True)
                    if goldGraph != None:
                        categoryName = self.getGoldCategoryName(goldGraph, entityToGold, eJ, eI, True)
                else:
                    categoryName = self.getCategoryNameFromTokens(sentenceGraph, tJ, tI, True)
                # make reverse: mirror of the forward filter sequence
                self.exampleStats.beginExample(categoryName)
                makeExample = True
                if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(eJ, eI):
                    makeExample = False
                    self.exampleStats.filter("genia_limits")
                if self.styles["genia_task1"] and (eI.get("type") == "Entity" or eJ.get("type") == "Entity"):
                    makeExample = False
                    self.exampleStats.filter("genia_task1")
                if self.styles["rel_limits"] and not self.isPotentialRELInteraction(eJ, eI):
                    makeExample = False
                    self.exampleStats.filter("rel_limits")
                if self.styles["co_limits"] and not self.isPotentialCOInteraction(eJ, eI, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("co_limits")
                if self.styles["bb_limits"] and not self.isPotentialBBInteraction(eJ, eI, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("bb_limits")
                    if categoryName != "neg":
                        self.exampleStats.filter("bb_limits(" + categoryName + ":" + eJ.get("type") + "/" + eI.get("type") + ")")
                if self.styles["bi_limits"] and not self.isPotentialBIInteraction(eJ, eI, sentenceGraph, self.exampleStats):
                    makeExample = False
                    #self.exampleStats.filter("bi_limits")
                if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(eJ, eI, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("epi_limits")
                if self.styles["id_limits"] and not self.isPotentialIDInteraction(eJ, eI, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("id_limits")
                #if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
                #    makeExample = False
                #    self.exampleStats.filter("selftrain_limits")
                #if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
                #    makeExample = False
                #    self.exampleStats.filter("selftrain_group")
                if self.styles["pos_only"] and categoryName == "neg":
                    makeExample = False
                    self.exampleStats.filter("pos_only")
                if makeExample:
                    #examples.append( self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI) )
                    ExampleUtils.appendExamples([self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI)], outfile)
                    exampleIndex += 1
                self.exampleStats.endExample()
            else:
                # undirected mode: one example per pair, with reverse-direction
                # features merged into the forward example (unless graph_kernel)
                if self.styles["entities"]:
                    categoryName = self.getCategoryName(sentenceGraph, eI, eJ, False)
                else:
                    categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, False)
                self.exampleStats.beginExample(categoryName)
                forwardExample = self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ)
                if not self.styles["graph_kernel"]:
                    reverseExample = self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI)
                    forwardExample[2].update(reverseExample[2])
                #examples.append(forwardExample)
                ExampleUtils.appendExamples([forwardExample], outfile)
                exampleIndex += 1
                self.exampleStats.endExample()
    #return examples
    return exampleIndex
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
    """
    Build one example for each token of the sentence.

    Task-3 (speculation/negation) classifier: one example is generated per
    non-name entity, using the entity's head token for the features. The
    classification target is selected by self.styles["classification"]
    ("multiclass", "speculation" or "negation"). Examples are appended to
    *outfile*; returns the number written (exampleIndex).

    NOTE(review): `examples = []` below is never used, and `task3Type` /
    `category` / `categoryName` are only bound inside the three recognized
    classification branches — any other styles["classification"] value
    would raise NameError further down. Confirm the allowed style values
    before relying on this.
    """
    examples = []
    exampleIndex = 0
    self.tokenFeatures = {}
    # Map entities to gold entities only when a gold graph is available
    if goldGraph != None:
        entityToGold = EvaluateInteractionXML.mapEntities(sentenceGraph.entities, goldGraph.entities)
    # Sentence-level counts of named vs. non-named entities
    namedEntityCount = 0
    entityCount = 0
    for entity in sentenceGraph.entities:
        if entity.get("isName") == "True":  # known data which can be used for features
            namedEntityCount += 1
        else:  # known data which can be used for features
            entityCount += 1
    namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
    entityCountFeature = "entityCount_" + str(entityCount)
    # Bag-of-words over all tokens, with extra variants for name tokens
    # ("ne_") and, when any entity heads exist, "ge_" prefixed forms
    bagOfWords = {}
    for token in sentenceGraph.tokens:
        text = "bow_" + token.get("text")
        if not bagOfWords.has_key(text):
            bagOfWords[text] = 0
        bagOfWords[text] += 1
        if sentenceGraph.tokenIsName[token]:
            text = "ne_" + text
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
        if len(sentenceGraph.tokenIsEntityHead) > 0:
            text = "ge_" + text
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
        text = token.get("text")
        if self.styles["speculation_words"] and text in self.specWords:
            if not bagOfWords.has_key("spec_bow_" + text):
                bagOfWords["spec_bow_" + text] = 0
            bagOfWords["spec_bow_" + text] += 1
            bagOfWords["spec_sentence"] = 1
    # Pre-resolve feature ids for the bag-of-words, reused for every example
    bowFeatures = {}
    for k, v in bagOfWords.iteritems():
        bowFeatures[self.featureSet.getId(k)] = v
    # Cache per-token dependency edges for the feature builders below
    self.inEdgesByToken = {}
    self.outEdgesByToken = {}
    self.edgeSetByToken = {}
    for token in sentenceGraph.tokens:
        inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
        self.inEdgesByToken[token] = inEdges
        outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
        self.outEdgesByToken[token] = outEdges
        self.edgeSetByToken[token] = set(inEdges + outEdges)
    for entity in sentenceGraph.entities:
        #token = sentenceGraph.tokens[i]
        token = sentenceGraph.entityHeadTokenByEntity[entity]
        # Recognize only non-named entities (i.e. interaction words)
        if entity.get("isName") == "True":
            continue
        # CLASS
        if self.styles["classification"] == "multiclass":
            task3Type = "multiclass"
            categoryName = ""
            if entity.get("negation") == "True":
                categoryName += "negation"
            if entity.get("speculation") == "True":
                if categoryName != "":
                    categoryName += "---"
                categoryName += "speculation"
            if categoryName == "":
                categoryName = "neg"
            category = self.classSet.getId(categoryName)
        elif self.styles["classification"] == "speculation":
            task3Type = "speculation"
            if entity.get("speculation") == "True":
                category = self.classSet.getId("speculation")
            else:
                category = 1
            # gold labels override the predicted entity's attributes
            if goldGraph != None:
                if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("speculation") == "True":
                    category = self.classSet.getId("speculation")
                else:
                    category = 1
            categoryName = self.classSet.getName(category)
        elif self.styles["classification"] == "negation":
            task3Type = "negation"
            if entity.get("negation") == "True":
                category = self.classSet.getId("negation")
            else:
                category = 1
            if goldGraph != None:
                if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("negation") == "True":
                    category = self.classSet.getId("negation")
                else:
                    category = 1
            categoryName = self.classSet.getName(category)
        self.exampleStats.beginExample(categoryName)
        # FEATURES
        features = {}
        # ENTITY TYPE
        #entityType = self.classSet.getId(self.getMergedEntityType(entity))
        #del self.classSet.Ids[self.getMergedEntityType(entity)]
        #IF LOCAL
        # There's a mistake here. The entityType should be the string, not
        # the id of the type. But there's also another issue. getMergedEntityType
        # expects a list, not an item. Therefore the type is always empty ->
        # types don't get used in classification. But this is the code used in
        # the publication, so it will now be published as is, and fixed in a later
        # release.
        #
        # Besides, using the classSet here generates an unneeded
        # additional class, that shows up in evaluations etc. However, to be
        # able to publish the exact models used for the publication experiments,
        # this can't be fixed so it breaks feature id consistency. Therefore I'll
        # now just remove the redundant class id from the classSet.
        #ENDIF
        #features[self.featureSet.getId(entityType)] = 1
        features[self.featureSet.getId(namedEntityCountFeature)] = 1
        features[self.featureSet.getId(entityCountFeature)] = 1
        #for k,v in bagOfWords.iteritems():
        #    features[self.featureSet.getId(k)] = v
        # pre-calculate bow _features_
        features.update(bowFeatures)
        #for j in range(len(sentenceGraph.tokens)):
        #    text = "bow_" + sentenceGraph.tokens[j].get("text")
        #    if j < i:
        #        features[self.featureSet.getId("bf_" + text)] = 1
        #    elif j > i:
        #        features[self.featureSet.getId("af_" + text)] = 1
        # Main features
        text = token.get("text")
        features[self.featureSet.getId("txt_" + text)] = 1
        features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
        stem = PorterStemmer.stem(text)
        features[self.featureSet.getId("stem_" + stem)] = 1
        features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1
        if self.styles["speculation_words"]:
            if text in self.specWords:
                features[self.featureSet.getId("ent_spec")] = 1
            if stem in self.specWordStems:
                features[self.featureSet.getId("ent_spec_stem")] = 1
        # Linear order features: locate the head token's index i by scan
        for i in range(len(sentenceGraph.tokens)):
            if token == sentenceGraph.tokens[i]:
                break
        for index in [-3, -2, -1, 1, 2, 3]:
            # NOTE(review): `i + index > 0` excludes neighbor position 0
            # (the first token) — confirm whether that exclusion is intended
            if i + index > 0 and i + index < len(sentenceGraph.tokens):
                self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)
        # Content: orthographic features over the token's characters
        if i > 0 and text[0].isalpha() and text[0].isupper():
            features[self.featureSet.getId("upper_case_start")] = 1
        for j in range(len(text)):
            if j > 0 and text[j].isalpha() and text[j].isupper():
                features[self.featureSet.getId("upper_case_middle")] = 1
            # numbers and special characters
            if text[j].isdigit():
                features[self.featureSet.getId("has_digits")] = 1
                if j > 0 and text[j - 1] == "-":
                    features[self.featureSet.getId("has_hyphenated_digit")] = 1
            elif text[j] == "-":
                features[self.featureSet.getId("has_hyphen")] = 1
            elif text[j] == "/":
                features[self.featureSet.getId("has_fslash")] = 1
            elif text[j] == "\\":
                features[self.featureSet.getId("has_bslash")] = 1
            # duplets
            if j > 0:
                features[self.featureSet.getId("dt_" + text[j - 1:j + 1].lower())] = 1
            # triplets
            if j > 1:
                features[self.featureSet.getId("tt_" + text[j - 2:j + 1].lower())] = 1
        # Attached edges (Hanging in and out edges)
        t1InEdges = self.inEdgesByToken[token]
        for edge in t1InEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
            features[self.featureSet.getId("t1HIn_" + edge[0].get("POS"))] = 1
            features[self.featureSet.getId("t1HIn_" + edgeType + "_" + edge[0].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[0])
            features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
            features[self.featureSet.getId("t1HIn_" + edgeType + "_" + tokenText)] = 1
        t1OutEdges = self.outEdgesByToken[token]
        for edge in t1OutEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
            features[self.featureSet.getId("t1HOut_" + edge[1].get("POS"))] = 1
            features[self.featureSet.getId("t1HOut_" + edgeType + "_" + edge[1].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[1])
            features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
            features[self.featureSet.getId("t1HOut_" + edgeType + "_" + tokenText)] = 1
        self.buildChains(token, sentenceGraph, features)
        extra = {"xtype": "task3", "t3type": task3Type, "t": token.get("id"), "entity": entity.get("id")}
        #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
        example = (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)
        ExampleUtils.appendExamples([example], outfile)
        exampleIndex += 1
        self.exampleStats.endExample()
    #return examples
    return exampleIndex
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
    """
    Build examples for a single sentence. Returns a list of examples.
    See Core/ExampleUtils for example format.

    Unmerging variant: one example is generated per (entity, argument
    combination) pair. Examples are appended to *outfile* directly and the
    number written (exampleIndex) is returned, despite the original
    docstring wording above.
    """
    self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
    self.triggerFeatureBuilder.initSentence(sentenceGraph)
    #examples = []
    exampleIndex = 0
    #undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
    #paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
    undirected = sentenceGraph.dependencyGraph.toUndirected()
    paths = undirected
    # Get argument order
    self.interactionLenghts = self.getInteractionEdgeLengths(sentenceGraph, paths)
    # Map tokens to character offsets.
    # NOTE(review): tokenByOffset is built but never read in this method;
    # only the gold-tokenization assert inside the loop has an effect here.
    tokenByOffset = {}
    for i in range(len(sentenceGraph.tokens)):
        token = sentenceGraph.tokens[i]
        if goldGraph != None:  # check that the tokenizations match
            goldToken = goldGraph.tokens[i]
            assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
        tokenByOffset[token.get("charOffset")] = token.get("id")
    # Map gold entities to their head offsets
    goldEntitiesByOffset = {}
    if goldGraph != None:
        for entity in goldGraph.entities:
            offset = entity.get("headOffset")
            assert offset != None
            if not goldEntitiesByOffset.has_key(offset):
                goldEntitiesByOffset[offset] = []
            goldEntitiesByOffset[offset].append(entity)
    # Generate examples based on interactions between entities or interactions between tokens
    #interactionsByEntityId = {}
    #for entity in sentenceGraph.entities:
    #    interactionsByEntityId[entity.get("id")] = []
    #for interaction in sentenceGraph.interactions:
    #    if interaction.get("type") == "neg":
    #        continue
    #    e1Id = interaction.get("e1")
    #    interactionsByEntityId[e1Id].append(interaction)
    # Optionally merge duplicate entities before generating examples
    if self.styles["no_merge"]:
        mergeInput = False
        entities = sentenceGraph.entities
    else:
        mergeInput = True
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
    exampleIndex = 0
    for entity in entities:  # sentenceGraph.entities:
        eType = entity.get("type")
        assert eType != None, entity.attrib
        eType = str(eType)
        #if eType not in ["Binding", "Positive_regulation", "Negative_regulation", "Regulation"]:
        #    continue
        #if not goldEntitiesByOffset.has_key(entity.get("headOffset")):
        #    continue
        #interactions = interactionsByEntityId[entity.get("id")]
        interactions = [x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput)]
        argCombinations = self.getArgumentCombinations(eType, interactions, entity.get("id"))
        #if len(argCombinations) <= 1:
        #    continue
        assert argCombinations != None, (entity.get("id"), entity.get("type"))
        for argCombination in argCombinations:
            # only Process events may have an empty argument combination
            if eType != "Process":
                assert len(argCombination) > 0, eType + ": " + str(argCombinations)
            # Originally binary classification
            if goldGraph != None:
                isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset)
                #if eType == "Binding":
                #    print argCombination[0].get("e1"), len(argCombination), isGoldEvent
            else:
                isGoldEvent = False
            # Named (multi-)class: collapse event types to the coarse set
            # {All_regulation, Binding, Other, neg}
            if isGoldEvent:
                #category = "event"
                category = eType
                if category.find("egulation") != -1:
                    category = "All_regulation"
                elif category != "Binding":
                    category = "Other"  #"simple6"
            else:
                category = "neg"
            # NOTE(review): this `features` dict is never populated or read;
            # the example's features come from self.buildExample below.
            features = {}
            argString = ""
            for arg in argCombination:
                argString += "," + arg.get("id")
            extra = {"xtype": "um", "e": entity.get("id"), "i": argString[1:], "etype": eType, "class": category}
            assert type(extra["etype"]) == types.StringType, extra
            self.exampleStats.addExample(category)
            example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions)
            example[0] = sentenceGraph.getSentenceId() + ".x" + str(exampleIndex)
            example[1] = self.classSet.getId(category)
            example[3] = extra
            #examples.append( example )
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1
    #return examples
    return exampleIndex
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
    """
    Build one example for each phrase in the sentence.

    Phrase-trigger variant: parse phrases (NP/WHADVP/WHNP plus IN- and
    token-subphrases) each become one example, appended straight to
    *outfile*. Returns the number of examples written (exampleIndex).
    """
    self.triggerFeatureBuilder.initSentence(sentenceGraph)
    # examples = []
    exampleIndex = 0
    # Prepare phrases, create subphrases
    # filter = set(["NP", "TOK-IN", "WHADVP", "WHNP", "TOK-WP$", "TOK-PRP$", "NP-IN"])
    phrases = MapPhrases.getPhrases(sentenceGraph.parseElement, sentenceGraph.tokens, set(["NP", "WHADVP", "WHNP"]))
    phraseDict = MapPhrases.getPhraseDict(phrases)
    phrases.extend(MapPhrases.makeINSubPhrases(phrases, sentenceGraph.tokens, phraseDict, ["NP"]))
    phrases.extend(MapPhrases.makeTokenSubPhrases(sentenceGraph.tokens, phraseDict))
    phraseToEntity = MapPhrases.getPhraseEntityMapping(sentenceGraph.entities, phraseDict)
    # Make counts: add this sentence's phrase-type counts to the running total
    phraseTypeCounts = MapPhrases.getPhraseTypeCounts(phrases)
    for key in phraseTypeCounts.keys():
        if not self.phraseTypeCounts.has_key(key):
            self.phraseTypeCounts[key] = 0
        self.phraseTypeCounts[key] += phraseTypeCounts[key]
    self.exampleStats.addVariable(
        "Phrase type counts", self.phraseTypeCounts
    )  # can be added on each loop, will always point to the same thing
    # Build one example for each phrase
    for phrase in phrases:
        features = {}
        self.triggerFeatureBuilder.setFeatureVector(features)
        categoryName = self.getCategoryName(phrase, phraseToEntity)
        category = self.classSet.getId(categoryName)
        phraseTokens = self.getPhraseTokens(phrase, sentenceGraph)
        phraseHeadToken = self.getPhraseHeadToken(phrase, phraseTokens)
        self.exampleStats.beginExample(categoryName)
        # co-reference task style: drop phrases that cannot be triggers
        if self.styles["co_limits"] and not self.isPotentialCOTrigger(phrase, phraseTokens, sentenceGraph):
            self.exampleStats.filter("co_limits")
            self.exampleStats.endExample()
            continue
        # Sentence level features
        features.update(self.triggerFeatureBuilder.bowFeatures)
        # Whole phrase features
        self.buildLinearNGram(phraseTokens, sentenceGraph, features)
        features[self.featureSet.getId("pType_" + phrase.get("type"))] = 1
        for split in phrase.get("type").split("-"):
            features[self.featureSet.getId("pSubType_" + split)] = 1
        # Check named entities: how many of the phrase's tokens are names
        nameCount = 0
        for token in phraseTokens:
            if sentenceGraph.tokenIsName[token]:
                nameCount += 1
        features[self.featureSet.getId("phraseNames_" + str(nameCount))] = 1
        features[self.featureSet.getId("phraseNameCount")] = nameCount
        # Head token features
        self.triggerFeatureBuilder.setTag("head_")
        self.triggerFeatureBuilder.buildFeatures(phraseHeadToken)
        self.triggerFeatureBuilder.buildAttachedEdgeFeatures(phraseHeadToken, sentenceGraph)
        self.triggerFeatureBuilder.setTag()
        # Features for all phrase tokens.
        # NOTE(review): `token` from the loop is unused — every iteration
        # builds features for `phraseHeadToken` under a position-specific
        # tag. Verify whether `token` was the intended argument.
        self.triggerFeatureBuilder.setTag("ptok_")
        phraseTokenPos = 0
        # print len(phraseTokens)
        for token in phraseTokens:
            self.triggerFeatureBuilder.setTag("ptok_")
            self.triggerFeatureBuilder.buildFeatures(phraseHeadToken, linear=False, chains=False)
            self.triggerFeatureBuilder.setTag("ptok_" + str(phraseTokenPos) + "_")
            self.triggerFeatureBuilder.buildFeatures(phraseHeadToken, linear=False, chains=False)
            self.triggerFeatureBuilder.setTag("ptok_" + str(phraseTokenPos - len(phraseTokens)) + "_")
            self.triggerFeatureBuilder.buildFeatures(phraseHeadToken, linear=False, chains=False)
            # self.triggerFeatureBuilder.buildAttachedEdgeFeatures(phraseHeadToken)
            phraseTokenPos += 1
        self.triggerFeatureBuilder.setTag()
        # Example metadata: head token, phrase id/type/offset, linked entity ids
        extra = {
            "xtype": "phrase",
            "t": phraseHeadToken.get("id"),
            "p": phrase.get("id"),
            "ptype": phrase.get("type"),
        }
        extra["charOffset"] = phrase.get("charOffset")
        if phrase not in phraseToEntity:
            extra["eids"] = "neg"
        else:
            extra["eids"] = ",".join([x.get("id") for x in phraseToEntity[phrase]])
        example = (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)
        ExampleUtils.appendExamples([example], outfile)
        self.exampleStats.endExample()
        exampleIndex += 1
    # Mark missed entities in exampleStats: non-given entities with no phrase
    linkedEntities = set(sum(phraseToEntity.values(), []))
    for entity in sentenceGraph.entities:
        if entity.get("given") != "True" and entity not in linkedEntities:
            self.exampleStats.addValue("Entities with no phrase", 1)
            # Marking these as filtered examples was misleading, as examples are per phrase, and these are entities
            # self.exampleStats.beginExample(entity.get("type"))
            # self.exampleStats.filter("no_phrase")
            # self.exampleStats.endExample()
    return exampleIndex
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
    """
    Build unmerging examples for a single sentence.

    One example is produced per (entity, argument combination) pair and
    appended straight to outfile (see Core/ExampleUtils for the example
    format). Returns the number of examples written.
    """
    self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
    self.triggerFeatureBuilder.initSentence(sentenceGraph)
    # The undirected dependency graph doubles as the path source
    undirected = sentenceGraph.dependencyGraph.toUndirected()
    paths = undirected
    # Get argument order
    self.interactionLenghts = self.getInteractionEdgeLengths(sentenceGraph, paths)
    # Map tokens to character offsets (also validates that the gold graph,
    # when present, uses the exact same tokenization)
    tokenByOffset = {}
    for position, token in enumerate(sentenceGraph.tokens):
        if goldGraph != None:
            goldToken = goldGraph.tokens[position]
            assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
        tokenByOffset[token.get("charOffset")] = token.get("id")
    # Group gold entities by their head offset
    goldEntitiesByOffset = {}
    if goldGraph != None:
        for goldEntity in goldGraph.entities:
            headOffset = goldEntity.get("headOffset")
            assert headOffset != None
            goldEntitiesByOffset.setdefault(headOffset, []).append(goldEntity)
    # Optionally merge duplicate entities before generating examples
    if self.styles["no_merge"]:
        mergeInput = False
        entities = sentenceGraph.entities
    else:
        mergeInput = True
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
    exampleIndex = 0
    for entity in entities:
        eType = entity.get("type")
        assert eType != None, entity.attrib
        eType = str(eType)
        # Outgoing interactions of this entity and their argument combinations
        interactions = [edge[2] for edge in sentenceGraph.getOutInteractions(entity, mergeInput)]
        argCombinations = self.getArgumentCombinations(eType, interactions, entity.get("id"))
        assert argCombinations != None, (entity.get("id"), entity.get("type"))
        for argCombination in argCombinations:
            # Only Process events may come with an empty argument set
            if eType != "Process":
                assert len(argCombination) > 0, eType + ": " + str(argCombinations)
            # Determine gold status (binary in the original formulation)
            if goldGraph != None:
                isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset)
            else:
                isGoldEvent = False
            # Collapse event types into the coarse class set
            # {All_regulation, Binding, Other, neg}
            if not isGoldEvent:
                category = "neg"
            elif "egulation" in eType:
                category = "All_regulation"
            elif eType == "Binding":
                category = eType
            else:
                category = "Other"
            features = {}
            argString = ",".join([arg.get("id") for arg in argCombination])
            extra = {"xtype": "um", "e": entity.get("id"), "i": argString, "etype": eType, "class": category}
            assert type(extra["etype"]) == types.StringType, extra
            self.exampleStats.addExample(category)
            example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions)
            example[0] = sentenceGraph.getSentenceId() + ".x" + str(exampleIndex)
            example[1] = self.classSet.getId(category)
            example[3] = extra
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1
    return exampleIndex
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
    """
    Build one trigger-detection example for each token of the sentence and
    append them to an already open output file.

    A token's class is the merged type of the entities it heads, or "neg".
    The feature vector combines sentence bag-of-words features, token
    surface/stem features, character n-grams, linear-context features and
    hanging dependency-edge features, plus several optional style-driven
    feature builders.

    Parameters:
      sentenceGraph     -- parsed sentence graph (project type)
      outfile           -- open file handle; examples are appended via ExampleUtils
      goldGraph         -- unused here, kept for a uniform builder signature
      structureAnalyzer -- optional; used to auto-detect whether sentences
                           without given entities should still be processed

    Returns the number of examples written.
    """
    # Sentences on the skiplist are ignored entirely.
    if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
        print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get("origId")
        return 0 #[]

    #examples = []
    exampleIndex = 0
    # Per-sentence caches used by helper feature builders.
    self.tokenFeatures = {}
    self.tokenFeatureWeights = {}

    # determine (manually or automatically) the setting for whether sentences with no given entities should be skipped
    buildForNameless = False
    if structureAnalyzer and not structureAnalyzer.hasGroupClass("GIVEN", "ENTITY"): # no given entities points to no separate NER program being used
        buildForNameless = True
    if self.styles["build_for_nameless"]: # manually force the setting
        buildForNameless = True
    if self.styles["skip_for_nameless"]: # manually force the setting
        buildForNameless = False

    # determine whether sentences with no given entities should be skipped
    namedEntityHeadTokens = []
    if not self.styles["names"]:
        namedEntityCount = 0
        for entity in sentenceGraph.entities:
            if entity.get("given") == "True": # known data which can be used for features
                namedEntityCount += 1
        namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
        # NOTE!!! This will change the number of examples and omit
        # all triggers (positive and negative) from sentences which
        # have no NE:s, possibly giving a too-optimistic performance
        # value. Such sentences can still have triggers from intersentence
        # interactions, but as such events cannot be recovered anyway,
        # looking for these triggers would be pointless.
        if namedEntityCount == 0 and not buildForNameless: # no names, no need for triggers
            return 0 #[]

        if self.styles["pos_pairs"]:
            namedEntityHeadTokens = self.getNamedEntityHeadTokens(sentenceGraph)
    else:
        # "names" style: treat no token as a named entity so names are
        # classified like any other token.
        for key in sentenceGraph.tokenIsName.keys():
            sentenceGraph.tokenIsName[key] = False

    # Sentence-level bag of words; named-entity tokens get an extra
    # "ne_"-prefixed count.
    bagOfWords = {}
    for token in sentenceGraph.tokens:
        text = "bow_" + token.get("text")
        if not bagOfWords.has_key(text):
            bagOfWords[text] = 0
        bagOfWords[text] += 1
        if sentenceGraph.tokenIsName[token]:
            text = "ne_" + text
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
    bowFeatures = {}
    # Sorted iteration keeps featureSet id allocation deterministic.
    for k in sorted(bagOfWords.keys()):
        bowFeatures[self.featureSet.getId(k)] = bagOfWords[k]

    # Pre-index each token's incoming/outgoing dependency edges.
    self.inEdgesByToken = {}
    self.outEdgesByToken = {}
    self.edgeSetByToken = {}
    for token in sentenceGraph.tokens:
        #inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
        #fixedInEdges = []
        #for edge in inEdges:
        #    fixedInEdges.append( (edge[0], edge[1], edge[2]["element"]) )
        #inEdges = fixedInEdges
        inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
        #inEdges.sort(compareDependencyEdgesById)
        self.inEdgesByToken[token] = inEdges
        #outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True)
        #fixedOutEdges = []
        #for edge in outEdges:
        #    fixedOutEdges.append( (edge[0], edge[1], edge[2]["element"]) )
        #outEdges = fixedOutEdges
        outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
        #outEdges.sort(compareDependencyEdgesById)
        self.outEdgesByToken[token] = outEdges
        self.edgeSetByToken[token] = set(inEdges + outEdges)

    for i in range(len(sentenceGraph.tokens)):
        token = sentenceGraph.tokens[i]

        # CLASS: merged type of the entities this token heads, or "neg".
        if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
            categoryName, entityIds = self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token])
        else:
            categoryName, entityIds = "neg", None
        self.exampleStats.beginExample(categoryName)

        # Recognize only non-named entities (i.e. interaction words)
        if sentenceGraph.tokenIsName[token] and not self.styles["names"] and not self.styles["all_tokens"]:
            self.exampleStats.filter("name")
            self.exampleStats.endExample()
            continue
#            if "selftrain_limits" in self.styles:
#                # any predicted entity not part of the self-training set causes example to be rejected
#                filtered = False
#                for entity in sentenceGraph.tokenIsEntityHead[token]:
#                    if entity.get("selftrain") == "False":
#                        self.exampleStats.filter("selftrain_limits")
#                        self.exampleStats.endExample()
#                        filtered = True
#                        break
#                if filtered:
#                    continue
#            if "selftrain_group" in self.styles:
#                # any predicted entity not part of the self-training set causes example to be rejected
#                filtered = False
#                for entity in sentenceGraph.tokenIsEntityHead[token]:
#                    if entity.get("selftraingroup") not in self.selfTrainGroups:
#                        self.exampleStats.filter("selftrain_group")
#                        self.exampleStats.endExample()
#                        filtered = True
#                        break
#                if filtered:
#                    continue
        if self.styles["pos_only"] and categoryName == "neg":
            self.exampleStats.filter("pos_only")
            self.exampleStats.endExample()
            continue

        # An unknown class name (no id in the class set) is skipped.
        category = self.classSet.getId(categoryName)
        if category == None:
            self.exampleStats.filter("undefined_class")
            self.exampleStats.endExample()
            continue

        tokenText = token.get("text").lower()
#        if "stem_gazetteer" in self.styles:
#            tokenText = PorterStemmer.stem(tokenText)
#        if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer:
#            features = {}
#            features[self.featureSet.getId("exclude_gazetteer")] = 1
#            extra = {"xtype":"token","t":token.get("id"),"excluded":"True"}
#            if entityIds != None:
#                extra["goldIds"] = entityIds
#            #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
#            ExampleUtils.appendExamples([(sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)], outfile)
#            exampleIndex += 1
#            continue

        # FEATURES
        features = {}

        if not self.styles["names"]:
            features[self.featureSet.getId(namedEntityCountFeature)] = 1
        #for k,v in bagOfWords.iteritems():
        #    features[self.featureSet.getId(k)] = v
        # pre-calculate bow _features_
        features.update(bowFeatures)

#        for j in range(len(sentenceGraph.tokens)):
#            text = "bow_" + sentenceGraph.tokens[j].get("text")
#            if j < i:
#                features[self.featureSet.getId("bf_" + text)] = 1
#            elif j > i:
#                features[self.featureSet.getId("af_" + text)] = 1

        # Main features
        text = token.get("text")
        features[self.featureSet.getId("txt_"+text)] = 1
        features[self.featureSet.getId("POS_"+token.get("POS"))] = 1
        stem = PorterStemmer.stem(text)
        features[self.featureSet.getId("stem_"+stem)] = 1
        features[self.featureSet.getId("nonstem_"+text[len(stem):])] = 1

        # Normalized versions of the string (if same as non-normalized, overlap without effect)
        normalizedText = text.replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower()
        if normalizedText == "bound": # should be for all irregular verbs
            normalizedText = "bind"
        features[self.featureSet.getId("txt_"+normalizedText)] = 1
        norStem = PorterStemmer.stem(normalizedText)
        features[self.featureSet.getId("stem_"+norStem)] = 1
        features[self.featureSet.getId("nonstem_"+normalizedText[len(norStem):])] = 1

        ## Subspan features
        #textLower = text.lower()
        #for i in range(1, len(textLower)):
        #    features[self.featureSet.getId("subspanbegin"+str(i)+"_"+textLower[0:i])] = 1
        #    features[self.featureSet.getId("subspanend"+str(i)+"_"+textLower[-i:])] = 1

        # Substring features
        for string in text.split("-"):
            stringLower = string.lower()
            features[self.featureSet.getId("substring_"+stringLower)] = 1
            features[self.featureSet.getId("substringstem_"+PorterStemmer.stem(stringLower))] = 1

        # Linear order features: +-3 token window around position i.
        for index in [-3,-2,-1,1,2,3]:
            if i + index > 0 and i + index < len(sentenceGraph.tokens):
                self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)

        # Linear n-grams
        if self.styles["linear_ngrams"]:
            self.buildLinearNGram(max(0, i-1), i, sentenceGraph, features)
            self.buildLinearNGram(max(0, i-2), i, sentenceGraph, features)

        if self.styles["phospho"]:
            if text.find("hospho") != -1:
                features[self.featureSet.getId("phospho_found")] = 1
            features[self.featureSet.getId("begin_"+text[0:2].lower())] = 1
            features[self.featureSet.getId("begin_"+text[0:3].lower())] = 1

        if self.styles["bb_features"]:
            if text.lower() in self.bacteriaTokens:
                features[self.featureSet.getId("lpsnBacToken")] = 1

        # Content: casing, digits, punctuation and character n-grams.
        if i > 0 and text[0].isalpha() and text[0].isupper():
            features[self.featureSet.getId("upper_case_start")] = 1
        for j in range(len(text)):
            if j > 0 and text[j].isalpha() and text[j].isupper():
                features[self.featureSet.getId("upper_case_middle")] = 1
            # numbers and special characters
            if text[j].isdigit():
                features[self.featureSet.getId("has_digits")] = 1
                if j > 0 and text[j-1] == "-":
                    features[self.featureSet.getId("has_hyphenated_digit")] = 1
            elif text[j] == "-":
                features[self.featureSet.getId("has_hyphen")] = 1
            elif text[j] == "/":
                features[self.featureSet.getId("has_fslash")] = 1
            elif text[j] == "\\":
                features[self.featureSet.getId("has_bslash")] = 1
            # duplets
            if j > 0:
                features[self.featureSet.getId("dt_"+text[j-1:j+1].lower())] = 1
            # triplets
            if j > 1:
                features[self.featureSet.getId("tt_"+text[j-2:j+1].lower())] = 1
            # quadruplets (don't work, slight decrease (0.5 pp) on f-score
            #if j > 2:
            #    features[self.featureSet.getId("qt_"+text[j-3:j+1].lower())] = 1

        # Attached edges (Hanging in and out edges)
        t1InEdges = self.inEdgesByToken[token]
        for edge in t1InEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HIn_"+edgeType)] = 1
            features[self.featureSet.getId("t1HIn_"+edge[0].get("POS"))] = 1
            features[self.featureSet.getId("t1HIn_"+edgeType+"_"+edge[0].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[0])
            features[self.featureSet.getId("t1HIn_"+tokenText)] = 1
            features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenText)] = 1
            tokenStem = PorterStemmer.stem(tokenText)
            features[self.featureSet.getId("t1HIn_"+tokenStem)] = 1
            features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenStem)] = 1
            features[self.featureSet.getId("t1HIn_"+norStem+"_"+edgeType+"_"+tokenStem)] = 1
        t1OutEdges = self.outEdgesByToken[token]
        for edge in t1OutEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HOut_"+edgeType)] = 1
            features[self.featureSet.getId("t1HOut_"+edge[1].get("POS"))] = 1
            features[self.featureSet.getId("t1HOut_"+edgeType+"_"+edge[1].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[1])
            features[self.featureSet.getId("t1HOut_"+tokenText)] = 1
            features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenText)] = 1
            tokenStem = PorterStemmer.stem(tokenText)
            features[self.featureSet.getId("t1HOut_"+tokenStem)] = 1
            features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenStem)] = 1
            features[self.featureSet.getId("t1HOut_"+norStem+"_"+edgeType+"_"+tokenStem)] = 1

        # REL features
        if self.styles["rel_features"]:
            self.relFeatureBuilder.setFeatureVector(features)
            self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, i)
            self.relFeatureBuilder.setFeatureVector(None)

        # DDI13 features: all prefixes and suffixes of the normalized token.
        if self.styles["ddi13_features"]:
            for index in range(len(normalizedText)):
                features[self.featureSet.getId("ddi13_fromstart" + str(index) + "_" + normalizedText[:index+1])] = 1
                features[self.featureSet.getId("ddi13_fromend" + str(index) + "_" + normalizedText[index:])] = 1
        if self.styles["drugbank_features"]:
            self.drugFeatureBuilder.setFeatureVector(features)
            self.drugFeatureBuilder.tag = "ddi_"
            self.drugFeatureBuilder.buildDrugFeatures(token)
            self.drugFeatureBuilder.setFeatureVector(None)

        #self.wordNetFeatureBuilder.getTokenFeatures("show", "VBP")
        #tokTxt = token.get("text")
        #tokPOS = token.get("POS")
        #wordNetFeatures = []
        #wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
        #self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
        if self.styles["wordnet"]:
            tokTxt = token.get("text")
            tokPOS = token.get("POS")
            wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            for wordNetFeature in wordNetFeatures:
                #print wordNetFeature,
                features[self.featureSet.getId("WN_"+wordNetFeature)] = 1
            #print

        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder.setFeatureVector(features)
            self.giulianoFeatureBuilder.buildTriggerFeatures(token, sentenceGraph)
            self.giulianoFeatureBuilder.setFeatureVector(None)

        extra = {"xtype":"token","t":token.get("id")}
        if self.styles["bb_features"]:
            extra["trigex"] = "bb" # Request trigger extension in ExampleWriter
        if self.styles["epi_merge_negated"]:
            extra["unmergeneg"] = "epi" # Request trigger type unmerging
        if entityIds != None:
            extra["goldIds"] = entityIds # The entities to which this example corresponds
        #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )

        # chains
        self.buildChains(token, sentenceGraph, features)

        if self.styles["pos_pairs"]:
            self.buildPOSPairs(token, namedEntityHeadTokens, features)

        example = (sentenceGraph.getSentenceId()+".x"+str(exampleIndex), category, features, extra)
        ExampleUtils.appendExamples([example], outfile)
        exampleIndex += 1
        self.exampleStats.endExample()
    #return examples
    return exampleIndex
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph = None):
    """
    Build edge (interaction) examples for a single sentence and append
    them to an already open output file.

    Candidate pairs are either all entity pairs (style "entities") or all
    token pairs. With style "directed", separate forward and reverse
    examples are built for each pair; otherwise a single undirected
    example combines features from both directions.

    Parameters:
      sentenceGraph -- parsed sentence graph (project type)
      outfile       -- open file handle; examples are appended via ExampleUtils
      goldGraph     -- optional gold-standard graph used for labeling

    Returns the number of examples written.
    """
    #examples = []
    exampleIndex = 0

    if self.styles["trigger_features"]:
        self.triggerFeatureBuilder.initSentence(sentenceGraph)
    if self.styles["evex"]:
        self.evexFeatureBuilder.initSentence(sentenceGraph)

    # Filter entities, if needed
    #mergedIds = None
    #duplicateEntities = None
    #entities = sentenceGraph.entities
    #entities, mergedIds, duplicateEntities = self.mergeEntities(sentenceGraph, False) # "no_duplicates" in self.styles)
    # Duplicate entities are merged; only the merged set is paired below.
    sentenceGraph.mergeInteractionGraph(True)
    entities = sentenceGraph.mergedEntities
    entityToDuplicates = sentenceGraph.mergedEntityToDuplicates
    self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))

    # Connect to optional gold graph
    if goldGraph != None:
        entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)

    paths = None
    if not self.styles["no_path"]:
        ##undirected = sentenceGraph.getUndirectedDependencyGraph()
        #undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
        ###undirected = sentenceGraph.dependencyGraph.to_undirected()
        ####undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work
        undirected = sentenceGraph.dependencyGraph.toUndirected()
        #paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        paths = undirected

    #for edge in sentenceGraph.dependencyGraph.edges:
    #    assert edge[2] != None
    #for edge in undirected.edges:
    #    assert edge[2] != None
    #if sentenceGraph.sentenceElement.get("id") == "GENIA.d70.s5":
    #    print [(x[0].get("id"), x[1].get("id"), x[2].get("id")) for x in sentenceGraph.dependencyGraph.edges]

    # Generate examples based on interactions between entities or interactions between tokens
    if self.styles["entities"]:
        loopRange = len(entities)
    else:
        loopRange = len(sentenceGraph.tokens)
    # Enumerate all unordered pairs (i, j) with i < j.
    for i in range(loopRange-1):
        for j in range(i+1,loopRange):
            eI = None
            eJ = None
            if self.styles["entities"]:
                eI = entities[i]
                eJ = entities[j]
                tI = sentenceGraph.entityHeadTokenByEntity[eI]
                tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                #if "no_ne_interactions" in self.styles and eI.get("isName") == "True" and eJ.get("isName") == "True":
                #    continue
                if eI.get("type") == "neg" or eJ.get("type") == "neg":
                    continue
                if self.styles["skip_extra_triggers"]:
                    if eI.get("source") != None or eJ.get("source") != None:
                        continue
            else:
                tI = sentenceGraph.tokens[i]
                tJ = sentenceGraph.tokens[j]
            # only consider paths between entities (NOTE! entities, not only named entities)
            if self.styles["headsOnly"]:
                if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                    continue

            if self.styles["directed"]:
                # define forward
                if self.styles["entities"]:
                    categoryName = self.getCategoryName(sentenceGraph, eI, eJ, True)
                    if goldGraph != None:
                        categoryName = self.getGoldCategoryName(goldGraph, entityToGold, eI, eJ, True)
                else:
                    categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, True)
                # make forward: apply all task-specific candidate filters;
                # any failed filter vetoes the example but stats are still kept.
                self.exampleStats.beginExample(categoryName)
                makeExample = True
                if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(eI, eJ):
                    makeExample = False
                    self.exampleStats.filter("genia_limits")
                if self.styles["genia_task1"] and (eI.get("type") == "Entity" or eJ.get("type") == "Entity"):
                    makeExample = False
                    self.exampleStats.filter("genia_task1")
                if self.styles["rel_limits"] and not self.isPotentialRELInteraction(eI, eJ):
                    makeExample = False
                    self.exampleStats.filter("rel_limits")
                if self.styles["co_limits"] and not self.isPotentialCOInteraction(eI, eJ, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("co_limits")
                if self.styles["bb_limits"] and not self.isPotentialBBInteraction(eI, eJ, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("bb_limits")
                    if categoryName != "neg":
                        self.exampleStats.filter("bb_limits(" + categoryName + ":" + eI.get("type") + "/" + eJ.get("type") + ")")
                if self.styles["bi_limits"] and not self.isPotentialBIInteraction(eI, eJ, sentenceGraph, self.exampleStats):
                    makeExample = False
                    #self.exampleStats.filter("bi_limits")
                if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(eI, eJ, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("epi_limits")
                if self.styles["id_limits"] and not self.isPotentialIDInteraction(eI, eJ, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("id_limits")
                #if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
                #    makeExample = False
                #    self.exampleStats.filter("selftrain_limits")
                #if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
                #    makeExample = False
                #    self.exampleStats.filter("selftrain_group")
                if self.styles["pos_only"] and categoryName == "neg":
                    makeExample = False
                    self.exampleStats.filter("pos_only")
                if makeExample:
                    #examples.append( self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ) )
                    ExampleUtils.appendExamples([self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ)], outfile)
                    exampleIndex += 1
                self.exampleStats.endExample()

                # define reverse: same pair, opposite direction (eJ -> eI).
                if self.styles["entities"]:
                    categoryName = self.getCategoryName(sentenceGraph, eJ, eI, True)
                    if goldGraph != None:
                        categoryName = self.getGoldCategoryName(goldGraph, entityToGold, eJ, eI, True)
                else:
                    categoryName = self.getCategoryNameFromTokens(sentenceGraph, tJ, tI, True)
                # make reverse
                self.exampleStats.beginExample(categoryName)
                makeExample = True
                if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(eJ, eI):
                    makeExample = False
                    self.exampleStats.filter("genia_limits")
                if self.styles["genia_task1"] and (eI.get("type") == "Entity" or eJ.get("type") == "Entity"):
                    makeExample = False
                    self.exampleStats.filter("genia_task1")
                if self.styles["rel_limits"] and not self.isPotentialRELInteraction(eJ, eI):
                    makeExample = False
                    self.exampleStats.filter("rel_limits")
                if self.styles["co_limits"] and not self.isPotentialCOInteraction(eJ, eI, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("co_limits")
                if self.styles["bb_limits"] and not self.isPotentialBBInteraction(eJ, eI, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("bb_limits")
                    if categoryName != "neg":
                        self.exampleStats.filter("bb_limits(" + categoryName + ":" + eJ.get("type") + "/" + eI.get("type") + ")")
                if self.styles["bi_limits"] and not self.isPotentialBIInteraction(eJ, eI, sentenceGraph, self.exampleStats):
                    makeExample = False
                    #self.exampleStats.filter("bi_limits")
                if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(eJ, eI, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("epi_limits")
                if self.styles["id_limits"] and not self.isPotentialIDInteraction(eJ, eI, sentenceGraph):
                    makeExample = False
                    self.exampleStats.filter("id_limits")
                #if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
                #    makeExample = False
                #    self.exampleStats.filter("selftrain_limits")
                #if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
                #    makeExample = False
                #    self.exampleStats.filter("selftrain_group")
                if self.styles["pos_only"] and categoryName == "neg":
                    makeExample = False
                    self.exampleStats.filter("pos_only")
                if makeExample:
                    #examples.append( self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI) )
                    ExampleUtils.appendExamples([self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI)], outfile)
                    exampleIndex += 1
                self.exampleStats.endExample()
            else:
                # Undirected: one example per pair; unless "graph_kernel" is
                # set, reverse-direction features are merged into the forward
                # example's feature dict.
                if self.styles["entities"]:
                    categoryName = self.getCategoryName(sentenceGraph, eI, eJ, False)
                else:
                    categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, False)
                self.exampleStats.beginExample(categoryName)
                forwardExample = self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ)
                if not self.styles["graph_kernel"]:
                    reverseExample = self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI)
                    forwardExample[2].update(reverseExample[2])
                #examples.append(forwardExample)
                ExampleUtils.appendExamples([forwardExample], outfile)
                exampleIndex += 1
                self.exampleStats.endExample()
    #return examples
    return exampleIndex
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
    """
    Build task-3 (speculation / negation modifier) examples, one per
    non-given entity of the sentence, and append them to an already open
    output file.

    The class of each example depends on the "classification" style:
    "multiclass" combines negation and speculation flags into one label,
    while "speculation" and "negation" are binary tasks (optionally read
    from the mapped gold entity when goldGraph is supplied).

    Parameters:
      sentenceGraph     -- parsed sentence graph (project type)
      outfile           -- open file handle; examples are appended via ExampleUtils
      goldGraph         -- optional gold-standard graph for labeling
      structureAnalyzer -- unused here, kept for a uniform builder signature

    Returns the number of examples written.
    """
    examples = []
    exampleIndex = 0

    self.tokenFeatures = {}

    if goldGraph != None:
        entityToGold = EvaluateInteractionXML.mapEntities(sentenceGraph.entities, goldGraph.entities)

    # Count given (named) vs. other entities for sentence-level features.
    namedEntityCount = 0
    entityCount = 0
    for entity in sentenceGraph.entities:
        if entity.get("given") == "True": # known data which can be used for features
            namedEntityCount += 1
        else: # known data which can be used for features
            entityCount += 1
    namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
    entityCountFeature = "entityCount_" + str(entityCount)

    # Sentence bag of words with extra prefixed counts for named-entity
    # tokens ("ne_"), entity-head context ("ge_") and speculation words.
    bagOfWords = {}
    for token in sentenceGraph.tokens:
        text = "bow_" + token.get("text")
        if not bagOfWords.has_key(text):
            bagOfWords[text] = 0
        bagOfWords[text] += 1
        if sentenceGraph.tokenIsName[token]:
            text = "ne_" + text
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
        # NOTE(review): this tests the size of the whole tokenIsEntityHead
        # map, not this token's entry — confirm whether per-token lookup
        # was intended before changing.
        if len(sentenceGraph.tokenIsEntityHead) > 0:
            text = "ge_" + text
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1

        text = token.get("text")
        if self.styles["speculation_words"] and text in self.specWords:
            if not bagOfWords.has_key("spec_bow_"+text):
                bagOfWords["spec_bow_"+text] = 0
            bagOfWords["spec_bow_"+text] += 1
            bagOfWords["spec_sentence"] = 1

    bowFeatures = {}
    for k,v in bagOfWords.iteritems():
        bowFeatures[self.featureSet.getId(k)] = v

    # Pre-index each token's incoming/outgoing dependency edges.
    self.inEdgesByToken = {}
    self.outEdgesByToken = {}
    self.edgeSetByToken = {}
    for token in sentenceGraph.tokens:
        inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
        self.inEdgesByToken[token] = inEdges
        outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
        self.outEdgesByToken[token] = outEdges
        self.edgeSetByToken[token] = set(inEdges + outEdges)

    for entity in sentenceGraph.entities:
        #token = sentenceGraph.tokens[i]
        token = sentenceGraph.entityHeadTokenByEntity[entity]
        # Recognize only non-named entities (i.e. interaction words)
        if entity.get("given") == "True":
            continue

        # CLASS
        if self.styles["classification"] == "multiclass":
            task3Type = "multiclass"
            categoryName = ""
            if entity.get("negation") == "True":
                categoryName += "negation"
            if entity.get("speculation") == "True":
                if categoryName != "":
                    categoryName += "---"
                categoryName += "speculation"
            if categoryName == "":
                categoryName = "neg"
            category = self.classSet.getId(categoryName)
        elif self.styles["classification"] == "speculation":
            task3Type = "speculation"
            if entity.get("speculation") == "True":
                category = self.classSet.getId("speculation")
            else:
                category = 1
            # Gold labels, when available, override the predicted flag.
            if goldGraph != None:
                if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("speculation") == "True":
                    category = self.classSet.getId("speculation")
                else:
                    category = 1
            categoryName = self.classSet.getName(category)
        elif self.styles["classification"] == "negation":
            task3Type = "negation"
            if entity.get("negation") == "True":
                category = self.classSet.getId("negation")
            else:
                category = 1
            # Gold labels, when available, override the predicted flag.
            if goldGraph != None:
                if len(entityToGold[entity]) > 0 and entityToGold[entity][0].get("negation") == "True":
                    category = self.classSet.getId("negation")
                else:
                    category = 1
            categoryName = self.classSet.getName(category)
        self.exampleStats.beginExample(categoryName)

        # FEATURES
        features = {}

        # ENTITY TYPE
        #entityType = self.classSet.getId(self.getMergedEntityType(entity))
        #del self.classSet.Ids[self.getMergedEntityType(entity)]
#IF LOCAL
        # There's a mistake here. The entityType should be the string, not
        # the id of the type. But there's also another issue. getMergedEntityType
        # expects a list, not an item. Therefore the type is always empty ->
        # types don't get used in classification. But this is the code used in
        # the publication, so it will now be published as is, and fixed in a later
        # release.
        #
        # Besides, using the classSet here generates an unneeded
        # additional class, that shows up in evaluations etc. However, to be
        # able to publish the exact models used for the publication experiments,
        # this can't be fixed so it breaks feature id consistency. Therefore I'll
        # now just remove the redundant class id from the classSet.
#ENDIF
        #features[self.featureSet.getId(entityType)] = 1

        features[self.featureSet.getId(namedEntityCountFeature)] = 1
        features[self.featureSet.getId(entityCountFeature)] = 1
        #for k,v in bagOfWords.iteritems():
        #    features[self.featureSet.getId(k)] = v
        # pre-calculate bow _features_
        features.update(bowFeatures)
#        for j in range(len(sentenceGraph.tokens)):
#            text = "bow_" + sentenceGraph.tokens[j].get("text")
#            if j < i:
#                features[self.featureSet.getId("bf_" + text)] = 1
#            elif j > i:
#                features[self.featureSet.getId("af_" + text)] = 1

        # Main features
        text = token.get("text")
        features[self.featureSet.getId("txt_"+text)] = 1
        features[self.featureSet.getId("POS_"+token.get("POS"))] = 1
        stem = PorterStemmer.stem(text)
        features[self.featureSet.getId("stem_"+stem)] = 1
        features[self.featureSet.getId("nonstem_"+text[len(stem):])] = 1

        if self.styles["speculation_words"]:
            if text in self.specWords:
                features[self.featureSet.getId("ent_spec")] = 1
            if stem in self.specWordStems:
                features[self.featureSet.getId("ent_spec_stem")] = 1

        # Linear order features: find this head token's index i by scan,
        # then add features for a +-3 token window around it.
        for i in range(len(sentenceGraph.tokens)):
            if token == sentenceGraph.tokens[i]:
                break
        for index in [-3,-2,-1,1,2,3]:
            if i + index > 0 and i + index < len(sentenceGraph.tokens):
                self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)

        # Content: casing, digits, punctuation and character n-grams.
        if i > 0 and text[0].isalpha() and text[0].isupper():
            features[self.featureSet.getId("upper_case_start")] = 1
        for j in range(len(text)):
            if j > 0 and text[j].isalpha() and text[j].isupper():
                features[self.featureSet.getId("upper_case_middle")] = 1
            # numbers and special characters
            if text[j].isdigit():
                features[self.featureSet.getId("has_digits")] = 1
                if j > 0 and text[j-1] == "-":
                    features[self.featureSet.getId("has_hyphenated_digit")] = 1
            elif text[j] == "-":
                features[self.featureSet.getId("has_hyphen")] = 1
            elif text[j] == "/":
                features[self.featureSet.getId("has_fslash")] = 1
            elif text[j] == "\\":
                features[self.featureSet.getId("has_bslash")] = 1
            # duplets
            if j > 0:
                features[self.featureSet.getId("dt_"+text[j-1:j+1].lower())] = 1
            # triplets
            if j > 1:
                features[self.featureSet.getId("tt_"+text[j-2:j+1].lower())] = 1

        # Attached edges (Hanging in and out edges)
        t1InEdges = self.inEdgesByToken[token]
        for edge in t1InEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HIn_"+edgeType)] = 1
            features[self.featureSet.getId("t1HIn_"+edge[0].get("POS"))] = 1
            features[self.featureSet.getId("t1HIn_"+edgeType+"_"+edge[0].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[0])
            features[self.featureSet.getId("t1HIn_"+tokenText)] = 1
            features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenText)] = 1
        t1OutEdges = self.outEdgesByToken[token]
        for edge in t1OutEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HOut_"+edgeType)] = 1
            features[self.featureSet.getId("t1HOut_"+edge[1].get("POS"))] = 1
            features[self.featureSet.getId("t1HOut_"+edgeType+"_"+edge[1].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[1])
            features[self.featureSet.getId("t1HOut_"+tokenText)] = 1
            features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenText)] = 1

        self.buildChains(token, sentenceGraph, features)

        extra = {"xtype":"task3","t3type":task3Type,"t":token.get("id"),"entity":entity.get("id")}
        #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
        example = (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)
        ExampleUtils.appendExamples([example], outfile)
        exampleIndex += 1
        self.exampleStats.endExample()
    #return examples
    return exampleIndex
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
    """
    Build event-argument ("unmerging", xtype "um") examples for a single sentence.

    For each (merged) entity, enumerates every argument combination allowed by
    structureAnalyzer's per-type argument-count limits and valid edge types, and
    writes one classification example per combination directly to outfile.

    Parameters:
        sentenceGraph: parse graph of the sentence being processed
        outfile: open file handle; examples are appended via ExampleUtils
        goldGraph: optional gold-standard graph; when given, combinations that
            match a gold event are labeled positive (entity type, or "pos" in
            binary mode), everything else "neg"
        structureAnalyzer: supplies getValidEdgeTypes/getArgLimits/isValidEvent

    Returns:
        int: the number of examples built. (Examples are streamed to outfile,
        not returned as a list, to keep memory use bounded.)
    """
    self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
    self.triggerFeatureBuilder.initSentence(sentenceGraph)

    exampleIndex = 0
    # Shortest paths are taken over the undirected dependency graph
    undirected = sentenceGraph.dependencyGraph.toUndirected()
    paths = undirected

    # Pre-compute interaction edge lengths, used later for argument ordering
    self.interactionLenghts = self.getInteractionEdgeLengths(sentenceGraph, paths)

    # Map tokens to character offsets
    tokenByOffset = {}
    for i in range(len(sentenceGraph.tokens)):
        token = sentenceGraph.tokens[i]
        if goldGraph is not None: # check that the tokenizations match
            goldToken = goldGraph.tokens[i]
            assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
        tokenByOffset[token.get("charOffset")] = token.get("id")

    # Map gold entities to their head offsets so candidate events can be
    # compared against gold events anchored at the same token
    goldEntitiesByOffset = {}
    if goldGraph is not None:
        for entity in goldGraph.entities:
            offset = entity.get("headOffset")
            assert offset != None
            if offset not in goldEntitiesByOffset:
                goldEntitiesByOffset[offset] = []
            goldEntitiesByOffset[offset].append(entity)

    # Optionally merge duplicate entities (same head token / type) so each
    # candidate event site is generated only once
    if self.styles["no_merge"]:
        mergeInput = False
        entities = sentenceGraph.entities
    else:
        mergeInput = True
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))

    for entity in entities: # sentenceGraph.entities:
        if type(entity) in types.StringTypes: # dummy entity for intersentence interactions
            continue
        eType = entity.get("type")
        assert eType != None, entity.attrib
        eType = str(eType)

        # Collect this entity's outgoing interactions and keep, per interaction
        # type, the ones that are valid event arguments for this entity type
        interactions = [x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput)]
        interactions = self.sortInteractionsById(interactions)
        interactionCounts = defaultdict(int)
        validInteractionsByType = defaultdict(list)
        for interaction in interactions:
            if interaction.get("event") != "True":
                continue
            e1 = sentenceGraph.entitiesById[interaction.get("e1")]
            if interaction.get("e2") in sentenceGraph.entitiesById:
                e2 = sentenceGraph.entitiesById[interaction.get("e2")]
                if interaction.get("type") in structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type")):
                    validInteractionsByType[interaction.get("type")].append(interaction)
            else: # intersentence: e2 is outside this sentence, accept as-is
                validInteractionsByType[interaction.get("type")].append(interaction)
            interactionCounts[interaction.get("type")] += 1
        interactionCountString = ",".join([key + "=" + str(interactionCounts[key]) for key in sorted(interactionCounts.keys())])

        #argCombinations = self.getArgumentCombinations(eType, interactions, entity.get("id"))
        intCombinations = []
        validIntTypeCount = 0
        maxArgCount = 0
        if self.debug:
            print >> sys.stderr, entity.get("id"), entity.get("type"), "int:" + interactionCountString, "validInt:" + str(validInteractionsByType)
        for intType in sorted(validInteractionsByType.keys()): # for each argument type the event can have
            validIntTypeCount += 1
            intCombinations.append([])
            minArgs, maxArgs = structureAnalyzer.getArgLimits(entity.get("type"), intType)
            if maxArgs > maxArgCount:
                maxArgCount = maxArgs
            #if maxArgs > 1: # allow any number of arguments for cases like Binding
            #    maxArgs = len(validInteractionsByType[intType])
            # for each valid argument count, get all possible combinations.
            # note that there may be a zero-length combination
            for combLen in range(minArgs, maxArgs + 1):
                for singleTypeArgCombination in combinations(validInteractionsByType[intType], combLen):
                    intCombinations[-1].append(singleTypeArgCombination)
        # intCombinations now contains a list of lists, each of which has a
        # tuple for each valid combination of one argument type. Next, we'll
        # make all valid combinations of multiple argument types
        if self.debug:
            print >> sys.stderr, " ", "intCombinations", intCombinations
        argCombinations = combine.combine(*intCombinations)
        if self.debug:
            print >> sys.stderr, " ", "argCombinations", argCombinations
        for i in range(len(argCombinations)):
            argCombinations[i] = sum(argCombinations[i], ()) # flatten the per-type tuples into one tuple of arguments
        if self.debug:
            print >> sys.stderr, " ", "argCombinations flat", argCombinations

        for argCombination in argCombinations:
            # Determine the example class: positive only if this exact
            # argument combination reproduces a gold event
            if goldGraph is not None:
                isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset, goldGraph.interactions)
                #if eType == "Binding":
                #    print argCombination[0].get("e1"), len(argCombination), isGoldEvent
            else:
                isGoldEvent = False
            # Named (multi-)class
            if isGoldEvent:
                if self.styles["binary"]:
                    category = "pos"
                else:
                    category = entity.get("type")
                assert category != None
            else:
                category = "neg"
            self.exampleStats.beginExample(category)

            issues = defaultdict(int)
            if validIntTypeCount == 0 and entity.get("given") == "True":
                # early out for given entities (proteins etc.) with no valid
                # argument types: they can never be event triggers
                self.exampleStats.filter("given-leaf:" + entity.get("type"))
                if self.debug:
                    print >> sys.stderr, " ", category + "(" + eType + ")", "arg combination", argCombination, "LEAF"
            elif not structureAnalyzer.isValidEvent(entity, argCombination, self.documentEntitiesById, issues=issues):
                # structurally invalid combination: record each reported issue
                for key in issues:
                    self.exampleStats.filter(key)
                if self.debug:
                    print >> sys.stderr, " ", category, "arg combination", argCombination, "INVALID", issues
            else:
                if self.debug:
                    print >> sys.stderr, " ", category, "arg combination", argCombination, "VALID"
                argString = ""
                for arg in argCombination:
                    argString += "," + arg.get("type") + "=" + arg.get("id")
                extra = {"xtype":"um","e":entity.get("id"),"i":argString[1:],"etype":eType,"class":category}
                extra["allInt"] = interactionCountString
                assert type(extra["etype"]) in types.StringTypes, extra
                assert type(extra["class"]) in types.StringTypes, category
                assert type(extra["i"]) in types.StringTypes, argString
                example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions)
                example[0] = sentenceGraph.getSentenceId() + ".x" + str(exampleIndex)
                example[1] = self.classSet.getId(category)
                example[3] = extra
                ExampleUtils.appendExamples([example], outfile)
                exampleIndex += 1
            self.exampleStats.endExample()
    return exampleIndex
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
    """
    Build examples for a single sentence. Returns a list of examples.
    See Core/ExampleUtils for example format.

    NOTE(review): despite the sentence above, this streams examples to
    *outfile* and returns the integer count of examples built, not a list.
    A candidate example is produced for every argument combination of every
    (merged) entity that the structure analyzer allows; with a goldGraph the
    combinations matching gold events become positives, the rest "neg".
    """
    self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
    self.triggerFeatureBuilder.initSentence(sentenceGraph)

    exampleIndex = 0
    # paths are taken over the undirected dependency graph
    undirected = sentenceGraph.dependencyGraph.toUndirected()
    paths = undirected

    # Get argument order
    self.interactionLenghts = self.getInteractionEdgeLengths(sentenceGraph, paths)

    # Map tokens to character offsets
    tokenByOffset = {}
    for i in range(len(sentenceGraph.tokens)):
        token = sentenceGraph.tokens[i]
        if goldGraph != None: # check that the tokenizations match
            goldToken = goldGraph.tokens[i]
            assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
        tokenByOffset[token.get("charOffset")] = token.get("id")

    # Map gold entities to their head offsets, so candidate events can later
    # be compared against gold events anchored at the same token
    goldEntitiesByOffset = {}
    if goldGraph != None:
        for entity in goldGraph.entities:
            offset = entity.get("headOffset")
            assert offset != None
            if not goldEntitiesByOffset.has_key(offset):
                goldEntitiesByOffset[offset] = []
            goldEntitiesByOffset[offset].append(entity)

    # Optionally merge duplicate entities so each candidate site is processed once
    if self.styles["no_merge"]:
        mergeInput = False
        entities = sentenceGraph.entities
    else:
        mergeInput = True
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))

    exampleIndex = 0
    for entity in entities: # sentenceGraph.entities:
        if type(entity) in types.StringTypes: # dummy entity for intersentence interactions
            continue
        eType = entity.get("type")
        assert eType != None, entity.attrib
        eType = str(eType)

        # Gather this entity's outgoing event interactions, grouped by type,
        # keeping only the ones the structure analyzer considers valid edges
        interactions = [x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput)]
        interactions = self.sortInteractionsById(interactions)
        interactionCounts = defaultdict(int)
        validInteractionsByType = defaultdict(list)
        for interaction in interactions:
            if interaction.get("event") != "True":
                continue
            e1 = sentenceGraph.entitiesById[interaction.get("e1")]
            if interaction.get("e2") in sentenceGraph.entitiesById:
                e2 = sentenceGraph.entitiesById[interaction.get("e2")]
                if interaction.get("type") in structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type")):
                    validInteractionsByType[interaction.get("type")].append(interaction)
            else: # intersentence: e2 is outside this sentence, accept as-is
                validInteractionsByType[interaction.get("type")].append(interaction)
            interactionCounts[interaction.get("type")] += 1
        interactionCountString = ",".join([key + "=" + str(interactionCounts[key]) for key in sorted(interactionCounts.keys())])

        #argCombinations = self.getArgumentCombinations(eType, interactions, entity.get("id"))
        intCombinations = []
        validIntTypeCount = 0
        maxArgCount = 0
        if self.debug:
            print >> sys.stderr, entity.get("id"), entity.get("type"), "int:" + interactionCountString, "validInt:" + str(validInteractionsByType)
        for intType in sorted(validInteractionsByType.keys()): # for each argument type the event can have
            validIntTypeCount += 1
            intCombinations.append([])
            minArgs, maxArgs = structureAnalyzer.getArgLimits(entity.get("type"), intType)
            if maxArgs > maxArgCount:
                maxArgCount = maxArgs
            #if maxArgs > 1: # allow any number of arguments for cases like Binding
            #    maxArgs = len(validInteractionsByType[intType])
            # for each valid argument count, get all possible combinations.
            # note that there may be zero-lenght combination
            for combLen in range(minArgs, maxArgs+1):
                for singleTypeArgCombination in combinations(validInteractionsByType[intType], combLen):
                    intCombinations[-1].append(singleTypeArgCombination)
        # intCombinations now contains a list of lists, each of which has a
        # tuple for each valid combination of one argument type.
        # Next, we'll make all valid combinations of multiple argument types
        if self.debug:
            print >> sys.stderr, " ", "intCombinations", intCombinations
        argCombinations = combine.combine(*intCombinations)
        if self.debug:
            print >> sys.stderr, " ", "argCombinations", argCombinations
        for i in range(len(argCombinations)):
            argCombinations[i] = sum(argCombinations[i], ()) #sum(argCombinations, []) # flatten nested list
        if self.debug:
            print >> sys.stderr, " ", "argCombinations flat", argCombinations

        for argCombination in argCombinations:
            # Originally binary classification
            if goldGraph != None:
                isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset, goldGraph.interactions)
                #if eType == "Binding":
                #    print argCombination[0].get("e1"), len(argCombination), isGoldEvent
            else:
                isGoldEvent = False
            # Named (multi-)class
            if isGoldEvent:
                #category = "zeroArg"
                #if validIntTypeCount == 1:
                #    category = "singleArg" # event has 0-1 arguments (old simple6)
                #if validIntTypeCount > 1:
                #    category = "multiType" # event has arguments of several types, 0-1 of each (old Regulation)
                #if maxArgCount > 1:
                #    category = "multiArg" # event can have 2-n of at least one argument type (old Binding)
                if self.styles["binary"]:
                    category = "pos"
                else:
                    category = entity.get("type")
                assert category != None
            else:
                category = "neg"
            self.exampleStats.beginExample(category)

            issues = defaultdict(int)
            # early out for proteins etc.
            if validIntTypeCount == 0 and entity.get("given") == "True":
                self.exampleStats.filter("given-leaf:" + entity.get("type"))
                if self.debug:
                    print >> sys.stderr, " ", category +"("+eType+")", "arg combination", argCombination, "LEAF"
            # NOTE(review): unlike the sibling variant of this method, a
            # combination passes if the entity alone is valid OR the full
            # event is valid (optionally relaxing upper argument limits)
            elif structureAnalyzer.isValidEntity(entity) or structureAnalyzer.isValidEvent(entity, argCombination, self.documentEntitiesById, noUpperLimitBeyondOne=self.styles["no_arg_count_upper_limit"], issues=issues):
                if self.debug:
                    print >> sys.stderr, " ", category, "arg combination", argCombination, "VALID"
                argString = ""
                for arg in argCombination:
                    argString += "," + arg.get("type") + "=" + arg.get("id")
                extra = {"xtype":"um","e":entity.get("id"),"i":argString[1:],"etype":eType,"class":category}
                extra["allInt"] = interactionCountString
                assert type(extra["etype"]) in types.StringTypes, extra
                assert type(extra["class"]) in types.StringTypes, category
                assert type(extra["i"]) in types.StringTypes, argString
                example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions)
                example[0] = sentenceGraph.getSentenceId()+".x"+str(exampleIndex)
                example[1] = self.classSet.getId(category)
                example[3] = extra
                #examples.append( example )
                ExampleUtils.appendExamples([example], outfile)
                exampleIndex += 1
            else: # not a valid event or valid entity
                if len(issues) == 0: # must be > 0 so that it gets filtered
                    if not structureAnalyzer.isValidEntity(entity):
                        issues["INVALID_ENTITY:"+eType] += 1
                    else:
                        issues["UNKNOWN_ISSUE_FOR:"+eType] += 1
                for key in issues:
                    self.exampleStats.filter(key)
                if self.debug:
                    print >> sys.stderr, " ", category, "arg combination", argCombination, "INVALID", issues
            self.exampleStats.endExample()
    #return examples
    return exampleIndex