def processSentence(self, sentenceGraph):
    """
    Add the shortest dependency paths of one sentence to the gazetteer counts.

    For every interaction whose two entity head tokens are connected in the
    undirected dependency graph, each edge combination returned by
    self.multiEdgeFeatureBuilder.getEdgeCombinations for the shortest path
    gets the counter at index 2 of its gazetteer entry incremented. When
    self.includeNeg is true, the counter at index 3 is incremented in the same
    way for every other ordered pair of distinct, connected tokens.

    Gazetteer entries are created on demand as [None, None, 0, 0].

    Parameters:
        sentenceGraph: the sentence to process. This method reads its
            dependencyGraph, interactions, entitiesById,
            entityHeadTokenByEntity and tokens attributes.

    Returns:
        None; self.gazetteer is updated in place.
    """
    dependencyGraph = sentenceGraph.dependencyGraph
    undirected = self.nxMultiDiGraphToUndirected(dependencyGraph)
    # Indexed below as paths[sourceToken][targetToken].
    paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)

    self.multiEdgeFeatureBuilder.setFeatureVector()

    def countPath(path, counterIndex):
        # Increment one counter for every edge combination found along path.
        for comb in self.multiEdgeFeatureBuilder.getEdgeCombinations(dependencyGraph, path):
            if comb not in self.gazetteer:
                self.gazetteer[comb] = [None, None, 0, 0]
            self.gazetteer[comb][counterIndex] += 1

    # Positive (token, token) pairs are tracked per sentence only; the same
    # path may still be counted as negative in another sentence.
    positivePairs = set()
    for interaction in sentenceGraph.interactions:
        e1 = sentenceGraph.entitiesById[interaction.get("e1")]
        e1Token = sentenceGraph.entityHeadTokenByEntity[e1]
        e2 = sentenceGraph.entitiesById[interaction.get("e2")]
        e2Token = sentenceGraph.entityHeadTokenByEntity[e2]

        if e1Token in paths and e2Token in paths[e1Token]:
            positivePairs.add((e1Token, e2Token))
            countPath(paths[e1Token][e2Token], 2)

    if self.includeNeg:
        for t1 in sentenceGraph.tokens:
            for t2 in sentenceGraph.tokens:
                if t1 == t2:
                    continue
                if (t1, t2) in positivePairs:
                    continue
                if t1 in paths and t2 in paths[t1]:
                    countPath(paths[t1][t2], 3)
def processSentence(self, sentenceGraph):
    """
    Count the dependency-path edge combinations of one sentence.

    Each gazetteer entry is a four-item list created as [None, None, 0, 0].
    Index 2 counts occurrences on the shortest path between the head tokens
    of an interaction's two entities; index 3 counts occurrences on the
    shortest path of any other ordered pair of distinct tokens and is only
    updated when self.includeNeg is true.

    Parameters:
        sentenceGraph: the sentence to process. This method reads its
            dependencyGraph, interactions, entitiesById,
            entityHeadTokenByEntity and tokens attributes.

    Returns:
        None; self.gazetteer is updated in place.
    """
    undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
    # Indexed below as paths[sourceToken][targetToken].
    paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)

    self.multiEdgeFeatureBuilder.setFeatureVector()

    # Per sentence only: a path may still be negative in another sentence.
    positivePaths = {}
    for interaction in sentenceGraph.interactions:
        e1 = sentenceGraph.entitiesById[interaction.get("e1")]
        e1Token = sentenceGraph.entityHeadTokenByEntity[e1]
        e2 = sentenceGraph.entitiesById[interaction.get("e2")]
        e2Token = sentenceGraph.entityHeadTokenByEntity[e2]

        # Pairs with no connecting path are skipped entirely.
        if e1Token not in paths or e2Token not in paths[e1Token]:
            continue
        positivePaths.setdefault(e1Token, {})[e2Token] = True

        path = paths[e1Token][e2Token]
        for comb in self.multiEdgeFeatureBuilder.getEdgeCombinations(sentenceGraph.dependencyGraph, path):
            if comb not in self.gazetteer:
                self.gazetteer[comb] = [None, None, 0, 0]
            self.gazetteer[comb][2] += 1

    if not self.includeNeg:
        return

    for t1 in sentenceGraph.tokens:
        for t2 in sentenceGraph.tokens:
            if t1 == t2:
                continue
            if t1 in positivePaths and t2 in positivePaths[t1]:
                continue
            if t1 not in paths or t2 not in paths[t1]:
                continue
            path = paths[t1][t2]
            for comb in self.multiEdgeFeatureBuilder.getEdgeCombinations(sentenceGraph.dependencyGraph, path):
                if comb not in self.gazetteer:
                    self.gazetteer[comb] = [None, None, 0, 0]
                self.gazetteer[comb][3] += 1
def buildExamples(self, sentenceGraph):
    """
    Build directed examples between all entity slots of one sentence.

    A list of (entity, level, isDummy) slots is collected first:
      - every entity with isName == "True" gets one slot at level 0;
      - for every head token and every other entity type except "neg", four
        slots are created. A slot is filled with the entity whose precedence
        level (from self.getPrecedenceLevels) equals the slot index; the
        remaining slots are dummies that reuse the first entity of that type.

    For every unordered pair of slots a forward and a reverse example are
    then built with self.buildExample. When "genia_limits" is in self.styles,
    a direction is only built if self.isPotentialGeniaInteraction accepts it.

    Parameters:
        sentenceGraph: the sentence to process. This method reads its
            dependencyGraph, tokens, tokenIsEntityHead and
            entityHeadTokenByEntity attributes.

    Returns:
        The list of values returned by self.buildExample, in creation order.
    """
    examples = []
    exampleIndex = 0

    undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
    paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)

    # Determine overlapping entity precedence
    levelByEntity = self.getPrecedenceLevels(sentenceGraph, paths)

    # There is one entity group for each token, for each type of entity
    entities = []
    for token in sentenceGraph.tokens:
        entitiesByType = {}
        for entity in sentenceGraph.tokenIsEntityHead[token]:
            if entity.get("isName") == "True":
                # Names can never have duplicates
                entities.append((entity, 0, False))
                continue
            eType = entity.get("type")
            if eType == "neg":
                continue
            entitiesByType.setdefault(eType, []).append(entity)

        # Types are visited in sorted order so the slot order is deterministic.
        for eType in sorted(entitiesByType.keys()):
            typedEntities = entitiesByType[eType]
            # Use first entity of a type as the dummy entity for unfilled slots
            dummyEntity = typedEntities[0]
            entityGroup = [None, None, None, None]
            # Insert existing entities into the slot matching their level;
            # entities without a level or with a level past the last slot
            # are left out.
            for entity in typedEntities:
                if entity in levelByEntity:
                    level = levelByEntity[entity]
                    if level < len(entityGroup):
                        entityGroup[level] = (entity, level, False)
            # Create dummies for potential entities
            for i in range(len(entityGroup)):
                if entityGroup[i] is None:
                    entityGroup[i] = (dummyEntity, i, True)
            entities.extend(entityGroup)

    # Generate examples based on interactions between entities
    for i in range(len(entities) - 1):
        for j in range(i + 1, len(entities)):
            eI = entities[i][0]
            eJ = entities[j][0]
            tI = sentenceGraph.entityHeadTokenByEntity[eI]
            tJ = sentenceGraph.entityHeadTokenByEntity[eJ]

            # define forward example
            categoryName = self.getCategoryName(sentenceGraph, entities[i], entities[j], True)
            if ("genia_limits" not in self.styles) or self.isPotentialGeniaInteraction(eI, eJ):
                examples.append(self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, entities[i], entities[j]))
                exampleIndex += 1

            # define reverse
            categoryName = self.getCategoryName(sentenceGraph, entities[j], entities[i], True)
            if ("genia_limits" not in self.styles) or self.isPotentialGeniaInteraction(eJ, eI):
                examples.append(self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, entities[j], entities[i]))
                exampleIndex += 1

    return examples
def buildExamples(self, sentenceGraph):
    """
    Build directed pairwise examples for one sentence.

    Pairs are taken over sentenceGraph.entities when "entities" is in
    self.styles and over sentenceGraph.tokens otherwise. Every pair (i, j)
    with j >= i is visited, so an item is also paired with itself. Nothing is
    built unless "directed" is in self.styles.

    For each pair a forward (i -> j) and a reverse (j -> i) candidate are
    considered. Each candidate is reported to self.exampleStats with
    beginExample / filter / endExample and is dropped when:
      - it is a "neg" candidate rejected by the self.negFrac sampling
        ("neg_frac");
      - "genia_limits" is enabled and self.isPotentialGeniaInteraction
        rejects it ("genia_limits");
      - reverse direction only: "bioinfer_limits" is enabled and
        self.isPotentialBioInferInteraction rejects it ("bioinfer_limits");
      - self.posPairGaz.getNegFrac for the two POS tags equals 1.0
        ("pos_pair");
      - the source token is marked in sentenceGraph.tokenIsName
        ("genia_token_limits").

    Parameters:
        sentenceGraph: the sentence to process. Features are built from the
            graph returned by its getCleared() method.

    Returns:
        The list of values returned by self.buildExample, in creation order.
    """
    examples = []
    exampleIndex = 0

    clearGraph = sentenceGraph.getCleared()

    undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
    paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)

    self.triggerFeatureBuilder.initSentence(clearGraph)

    # Generate examples based on interactions between entities or
    # interactions between tokens
    useEntities = "entities" in self.styles
    if useEntities:
        loopRange = len(sentenceGraph.entities)
    else:
        loopRange = len(sentenceGraph.tokens)

    for i in range(loopRange):
        # Starting j at i allows self-interactions.
        for j in range(i, loopRange):
            eI = None
            eJ = None
            if useEntities:
                eI = sentenceGraph.entities[i]
                eJ = sentenceGraph.entities[j]
                tI = sentenceGraph.entityHeadTokenByEntity[eI]
                tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                if eI.get("type") == "neg" or eJ.get("type") == "neg":
                    continue
            else:
                tI = sentenceGraph.tokens[i]
                tJ = sentenceGraph.tokens[j]

            if "directed" not in self.styles:
                continue

            # ---- forward direction: i -> j ----
            if useEntities:
                categoryName = self.getCategoryName(sentenceGraph, eI, eJ, True)
            else:
                categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, True)
            self.exampleStats.beginExample(categoryName)
            # The random number is drawn only for "neg" candidates when a
            # sampling fraction is set.
            sampled = (self.negFrac is None
                       or categoryName != "neg"
                       or self.negRand.random() < self.negFrac)
            if not sampled:
                self.exampleStats.filter("neg_frac")
            else:
                makeExample = True
                if "genia_limits" in self.styles and not self.isPotentialGeniaInteraction(eI, eJ):
                    makeExample = False
                    self.exampleStats.filter("genia_limits")
                # NOTE(review): unlike the reverse direction below, no
                # "bioinfer_limits" check is applied here - confirm this
                # asymmetry is intended.
                forwardPOS = (tI.get("POS"), tJ.get("POS"))
                if self.posPairGaz.getNegFrac(forwardPOS) == 1.0:
                    makeExample = False
                    self.exampleStats.filter("pos_pair")
                if makeExample:
                    if sentenceGraph.tokenIsName[tI]:
                        self.exampleStats.filter("genia_token_limits")
                    else:
                        examples.append(self.buildExample(tI, tJ, paths, clearGraph, categoryName, exampleIndex, eI, eJ))
                        exampleIndex += 1
            self.exampleStats.endExample()

            # ---- reverse direction: j -> i ----
            if useEntities:
                categoryName = self.getCategoryName(sentenceGraph, eJ, eI, True)
            else:
                categoryName = self.getCategoryNameFromTokens(sentenceGraph, tJ, tI, True)
            self.exampleStats.beginExample(categoryName)
            sampled = (self.negFrac is None
                       or categoryName != "neg"
                       or self.negRand.random() < self.negFrac)
            if not sampled:
                self.exampleStats.filter("neg_frac")
            else:
                makeExample = True
                if "genia_limits" in self.styles and not self.isPotentialGeniaInteraction(eJ, eI):
                    makeExample = False
                    self.exampleStats.filter("genia_limits")
                if "bioinfer_limits" in self.styles and not self.isPotentialBioInferInteraction(eJ, eI, categoryName):
                    makeExample = False
                    self.exampleStats.filter("bioinfer_limits")
                reversePOS = (tJ.get("POS"), tI.get("POS"))
                if self.posPairGaz.getNegFrac(reversePOS) == 1.0:
                    makeExample = False
                    self.exampleStats.filter("pos_pair")
                if makeExample:
                    if sentenceGraph.tokenIsName[tJ]:
                        self.exampleStats.filter("genia_token_limits")
                    else:
                        examples.append(self.buildExample(tJ, tI, paths, clearGraph, categoryName, exampleIndex, eJ, eI))
                        exampleIndex += 1
            self.exampleStats.endExample()

    return examples
def buildExamplesInner(self, sentenceGraph, goldGraph):
    """
    Build one example for each token of the sentence

    Each example is a tuple (exampleId, category, features, extra), where
    exampleId is sentenceGraph.getSentenceId() + ".x" + the running example
    index, features maps feature ids from self.featureSet.getId to values,
    and extra is a dict with "xtype" and "t" (the token id), plus
    "excluded" for tokens rejected by the gazetteer.

    The category is taken from the gold entities whose headOffset equals the
    token's charOffset (merged with self.getMergedEntityType); it is 1 when
    there are none.

    Side effects: sets self.interactionLengths, self.tokenFeatures,
    self.inEdgesByToken, self.outEdgesByToken and self.edgeSetByToken, and
    points self.features at the feature dict of the token being processed.

    Parameters:
        sentenceGraph: the sentence to build examples for.
        goldGraph: the gold-standard graph of the same sentence, or None.
            When given, its tokens must match those of sentenceGraph by id
            and charOffset (checked with assert).

    Returns:
        The list of example tuples; empty when the sentence's origId is in
        self.skiplist.
    """
    if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
        print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get("origId")
        return []

    self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
    self.triggerFeatureBuilder.initSentence(sentenceGraph)

    undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
    paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)

    # Get argument order
    self.interactionLengths = self.getInteractionEdgeLengths(sentenceGraph, paths)
    self.interactionLengths = self.interactionLengths.values()
    self.interactionLengths.sort(compareInteractionPrecedence)

    # Map tokens to entities
    # NOTE(review): tokenByOffset is not read again in this method.
    tokenByOffset = {}
    for i, token in enumerate(sentenceGraph.tokens):
        if goldGraph is not None:
            goldToken = goldGraph.tokens[i]
            assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
        tokenByOffset[token.get("charOffset")] = token.get("id")

    # Map gold entities to their head offsets
    goldEntitiesByOffset = {}
    for token in sentenceGraph.tokens:
        goldEntitiesByOffset[token.get("charOffset")] = []
    # NOTE(review): entityToGold is not read again in this method.
    entityToGold = {}
    for entity in sentenceGraph.entities:
        entityToGold[entity] = []
    if goldGraph is not None:
        for entity in goldGraph.entities:
            offset = entity.get("headOffset")
            assert offset is not None
            goldEntitiesByOffset[offset].append(entity)
        # Map predicted entities to gold entities
        for entity in sentenceGraph.entities:
            eType = entity.get("type")
            eOffset = entity.get("headOffset")
            for goldEntity in goldEntitiesByOffset[eOffset]:
                if goldEntity.get("type") == eType:
                    entityToGold[entity].append(goldEntity)

    # Map tokens to interactions (keyed by the head token of entity e1)
    interactionsByToken = {}
    for token in sentenceGraph.tokens:
        interactionsByToken[token] = []
    for interactionTuple in self.interactionLengths:
        interaction = interactionTuple[0]
        if interaction.get("type") == "neg":
            continue
        e1Id = interaction.get("e1")
        token = sentenceGraph.entityHeadTokenByEntity[sentenceGraph.entitiesById[e1Id]]
        interactionsByToken[token].append(interaction)

    examples = []
    exampleIndex = 0

    self.tokenFeatures = {}

    namedEntityHeadTokens = []
    if "names" not in self.styles:
        namedEntityCount = 0
        for entity in sentenceGraph.entities:
            if entity.get("isName") == "True":  # known data which can be used for features
                namedEntityCount += 1
        namedEntityCountFeature = "nameCount_" + str(namedEntityCount)

        if "pos_pairs" in self.styles:
            namedEntityHeadTokens = self.getNamedEntityHeadTokens(sentenceGraph)

    # Sentence-level bag of words; name tokens are additionally counted
    # under an "ne_" prefixed key.
    bagOfWords = {}
    for token in sentenceGraph.tokens:
        text = "bow_" + token.get("text")
        bagOfWords[text] = bagOfWords.get(text, 0) + 1
        if sentenceGraph.tokenIsName[token]:
            text = "ne_" + text
            bagOfWords[text] = bagOfWords.get(text, 0) + 1
    # Pre-calculate the bow features once; they are copied into every example.
    bowFeatures = {}
    for k, v in bagOfWords.iteritems():
        bowFeatures[self.featureSet.getId(k)] = v

    # Cache the in and out dependency edges of every token as
    # (sourceToken, targetToken, edgeElement) tuples, sorted with
    # compareDependencyEdgesById.
    self.inEdgesByToken = {}
    self.outEdgesByToken = {}
    self.edgeSetByToken = {}
    for token in sentenceGraph.tokens:
        inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
        fixedInEdges = []
        for edge in inEdges:
            fixedInEdges.append((edge[0], edge[1], edge[2]["element"]))
        inEdges = fixedInEdges
        inEdges.sort(compareDependencyEdgesById)
        self.inEdgesByToken[token] = inEdges

        outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True)
        fixedOutEdges = []
        for edge in outEdges:
            fixedOutEdges.append((edge[0], edge[1], edge[2]["element"]))
        outEdges = fixedOutEdges
        outEdges.sort(compareDependencyEdgesById)
        self.outEdgesByToken[token] = outEdges

        self.edgeSetByToken[token] = set(inEdges + outEdges)

    for i, token in enumerate(sentenceGraph.tokens):
        # Recognize only non-named entities (i.e. interaction words)
        if sentenceGraph.tokenIsName[token] and "names" not in self.styles and "all_tokens" not in self.styles:
            continue

        # CLASS
        offset = token.get("charOffset")
        if len(goldEntitiesByOffset[offset]) > 0:
            category = self.classSet.getId(self.getMergedEntityType(goldEntitiesByOffset[offset]))
        else:
            category = 1

        tokenText = token.get("text").lower()
        if "stem_gazetteer" in self.styles:
            tokenText = PorterStemmer.stem(tokenText)
        if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer:
            # Tokens unknown to the gazetteer get a single marker feature.
            features = {}
            features[self.featureSet.getId("exclude_gazetteer")] = 1
            extra = {"xtype": "token", "t": token.get("id"), "excluded": "True"}
            examples.append((sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra))
            exampleIndex += 1
            continue

        # FEATURES
        features = {}
        self.features = features

        if "names" not in self.styles:
            features[self.featureSet.getId(namedEntityCountFeature)] = 1
        features.update(bowFeatures)

        # Main features
        text = token.get("text")
        features[self.featureSet.getId("txt_" + text)] = 1
        features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
        stem = PorterStemmer.stem(text)
        features[self.featureSet.getId("stem_" + stem)] = 1
        features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1

        # Normalized versions of the string (if same as non-normalized, overlap without effect)
        normalizedText = text.replace("-", "").replace("/", "").replace(",", "").replace("\\", "").replace(" ", "").lower()
        if normalizedText == "bound":  # should be for all irregular verbs
            normalizedText = "bind"
        features[self.featureSet.getId("txt_" + normalizedText)] = 1
        norStem = PorterStemmer.stem(normalizedText)
        features[self.featureSet.getId("stem_" + norStem)] = 1
        features[self.featureSet.getId("nonstem_" + normalizedText[len(norStem):])] = 1

        if "gazetteer_features_maintoken" in self.styles:
            tokTxtLower = text.lower()
            if "stem_gazetteer" in self.styles:
                tokTxtLower = PorterStemmer.stem(tokTxtLower)
            if self.gazetteer and tokTxtLower in self.gazetteer:
                for label, weight in self.gazetteer[tokTxtLower].items():
                    # Using the weight as the value; 1 performs slightly worse
                    features[self.featureSet.getId("gaz_knownLabel_" + label)] = weight

        # Linear order features. A wider window [-3..5] scored lower
        # (69.35 -> 68.97).
        # NOTE(review): "i + index > 0" never selects the token at index 0;
        # confirm whether ">= 0" was intended before changing it, since that
        # would alter the generated feature set.
        for index in [-3, -2, -1, 1, 2, 3]:
            if i + index > 0 and i + index < len(sentenceGraph.tokens):
                self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)

        # Content (the first token of the sentence is not tested for an
        # upper-case start)
        if i > 0 and text[0].isalpha() and text[0].isupper():
            features[self.featureSet.getId("upper_case_start")] = 1
        for j in range(len(text)):
            if j > 0 and text[j].isalpha() and text[j].isupper():
                features[self.featureSet.getId("upper_case_middle")] = 1
            # numbers and special characters
            if text[j].isdigit():
                features[self.featureSet.getId("has_digits")] = 1
                if j > 0 and text[j - 1] == "-":
                    features[self.featureSet.getId("has_hyphenated_digit")] = 1
            elif text[j] == "-":
                features[self.featureSet.getId("has_hyphen")] = 1
            elif text[j] == "/":
                features[self.featureSet.getId("has_fslash")] = 1
            elif text[j] == "\\":
                features[self.featureSet.getId("has_bslash")] = 1
            # duplets
            if j > 0:
                features[self.featureSet.getId("dt_" + text[j - 1:j + 1].lower())] = 1
            # triplets
            if j > 1:
                features[self.featureSet.getId("tt_" + text[j - 2:j + 1].lower())] = 1

        # Attached edges (Hanging in and out edges)
        t1InEdges = self.inEdgesByToken[token]
        for edge in t1InEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
            features[self.featureSet.getId("t1HIn_" + edge[0].get("POS"))] = 1
            features[self.featureSet.getId("t1HIn_" + edgeType + "_" + edge[0].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[0])
            features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
            features[self.featureSet.getId("t1HIn_" + edgeType + "_" + tokenText)] = 1
        t1OutEdges = self.outEdgesByToken[token]
        for edge in t1OutEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
            features[self.featureSet.getId("t1HOut_" + edge[1].get("POS"))] = 1
            features[self.featureSet.getId("t1HOut_" + edgeType + "_" + edge[1].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[1])
            features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
            features[self.featureSet.getId("t1HOut_" + edgeType + "_" + tokenText)] = 1

        # The example holds a reference to the features dict, so the
        # features added by the calls below still end up in it.
        extra = {"xtype": "token", "t": token.get("id")}
        examples.append((sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra))
        exampleIndex += 1

        # chains
        self.buildChains(token, sentenceGraph, features)

        if "pos_pairs" in self.styles:
            self.buildPOSPairs(token, namedEntityHeadTokens, features)

        self.buildPredictionFeatures(sentenceGraph, paths, token, interactionsByToken[token])

    return examples
def buildExamples(self, sentenceGraph):
    """
    Build event candidate examples for every gazetteer-matched trigger token.

    Tokens whose id is in self.namedEntityHeadTokenIds are name tokens; other
    tokens with a gazetteer match are event (trigger) tokens. For each
    trigger a list of argument combinations is generated:
      - triggers with a regulation type in their gazetteer entry use
        combine.combine(allTokens, allTokens + [None]);
      - all other triggers use (nameToken, None) for every name token;
      - triggers with "Binding" in their gazetteer entry additionally get
        ((nameA, nameB), None) for every pair of name tokens.

    Each combination is labelled with self.getGSEventType and then either
    skipped (counted in self.skippedByTypeAndReason and self.skippedByType)
    or built with self.buildExample (counted in self.builtByType). When
    self.negFrac is set, non-"neg" candidates are built even if a skip
    reason applies, and "neg" candidates are sampled with self.negRand.

    Parameters:
        sentenceGraph: the sentence to process.

    Returns:
        The list of values returned by self.buildExample. self.gsEvents is
        reset to None before returning.
    """
    self.makeGSEvents(sentenceGraph)

    self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
    self.triggerFeatureBuilder.initSentence(sentenceGraph)

    examples = []
    exampleIndex = 0

    undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
    paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)

    eventTokens = []
    nameTokens = []
    # The None key lets an empty argument slot be looked up like a token.
    gazCategories = {None: {"neg": -1}}
    for token in sentenceGraph.tokens:
        gazText = self.getGazetteerMatch(token.get("text").lower())
        if gazText is not None:
            gazCategories[token] = self.gazetteer[gazText]
        else:
            gazCategories[token] = {"neg": -1}

        if token.get("id") in self.namedEntityHeadTokenIds:
            nameTokens.append(token)
        elif gazText is not None:
            eventTokens.append(token)
    allTokens = eventTokens + nameTokens

    regulationTypes = ["Regulation", "Positive_regulation", "Negative_regulation"]
    for token in eventTokens:
        potentialRegulation = False
        potentialBinding = False
        for key in gazCategories[token].keys():
            if key in regulationTypes:
                potentialRegulation = True
                break
        for key in gazCategories[token].keys():
            if key == "Binding":
                potentialBinding = True
                break

        if potentialRegulation:
            combinations = combine.combine(allTokens, allTokens + [None])
        else:
            combinations = []
            for t2 in nameTokens:
                combinations.append((t2, None))

        if potentialBinding:
            # Two-theme bindings: the first argument is a pair of name tokens.
            for i in range(len(nameTokens) - 1):
                for j in range(i + 1, len(nameTokens)):
                    combinations.append(((nameTokens[i], nameTokens[j]), None))

        for combination in combinations:
            theme2Binding = isinstance(combination[0], (list, tuple))
            if theme2Binding:
                themes = combination[0]
            else:
                themes = [combination[0]]
            categoryName, eventIds = self.getGSEventType(sentenceGraph, token, themes, [combination[1]])

            for eventId in eventIds:
                self.examplesByEventOrigId[eventId] += 1

            # Decide whether to skip the candidate, counting every reason
            # that applies (a candidate can be counted under several).
            skip = False
            reasons = self.skippedByTypeAndReason.setdefault(categoryName, {})
            # Two further filters (trigger "neg" weight above 0.99, identical
            # arguments) were present only as no-ops and are not applied.
            if combination[0] == token or combination[1] == token:
                # The trigger is its own argument.
                if theme2Binding or gazCategories[combination[0]].get("Positive_regulation", -1) < 0:
                    skip = True
                    reasons["duparg"] = reasons.get("duparg", 0) + 1
            if combination[0] is None and combination[1] is None:
                skip = True
                reasons["noncmb"] = reasons.get("noncmb", 0) + 1

            validCat = self.isValidEvent(paths, sentenceGraph, token, combination)
            if validCat != "OK":
                skip = True
                reasons[validCat] = reasons.get(validCat, 0) + 1

            if len(nameTokens) == 0:
                skip = True
                reasons["non"] = reasons.get("non", 0) + 1

            # Arguments whose gazetteer "neg" weight is above 0.99
            if theme2Binding:
                firstArgument = combination[0][0]
                secondArgument = combination[0][1]
            else:
                firstArgument = combination[0]
                secondArgument = combination[1]
            if gazCategories[firstArgument].get("neg", -1) > 0.99 or gazCategories[secondArgument].get("neg", -1) > 0.99:
                skip = True
                reasons["gazarg"] = reasons.get("gazarg", 0) + 1

            if skip and (self.negFrac is None or categoryName == "neg"):
                self.skippedByType[categoryName] = self.skippedByType.get(categoryName, 0) + 1
            elif self.negFrac is None or categoryName != "neg" or self.negRand.random() < self.negFrac:
                # The random number is drawn only for "neg" candidates when a
                # sampling fraction is set.
                self.builtByType[categoryName] = self.builtByType.get(categoryName, 0) + 1
                newExample = self.buildExample(exampleIndex, sentenceGraph, paths, token, themes, [combination[1]])
                if len(eventIds) > 0:
                    newExample[3]["numEv"] = str(len(eventIds))
                examples.append(newExample)
                exampleIndex += 1

    self.gsEvents = None
    return examples
def buildExamples(self, sentenceGraph):
    """
    Build forward and reverse examples for every pair of entity slots.

    Slots are (entity, level, isDummy) tuples. An entity with
    isName == "True" contributes a single slot at level 0. All other
    entities, except those of type "neg", are grouped per head token and
    per type into four slots indexed by the precedence level returned by
    self.getPrecedenceLevels; empty slots are filled with dummies that
    reuse the first entity of the type.

    When "genia_limits" is in self.styles, a direction is only built if
    self.isPotentialGeniaInteraction accepts the two entities.

    Parameters:
        sentenceGraph: the sentence to process. This method reads its
            dependencyGraph, tokens, tokenIsEntityHead and
            entityHeadTokenByEntity attributes.

    Returns:
        The list of values returned by self.buildExample, in creation order.
    """
    examples = []
    exampleIndex = 0

    undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
    paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)

    # Determine overlapping entity precedence
    levelByEntity = self.getPrecedenceLevels(sentenceGraph, paths)

    # There is one entity group for each token, for each type of entity
    entities = []
    for token in sentenceGraph.tokens:
        entitiesByType = {}
        for entity in sentenceGraph.tokenIsEntityHead[token]:
            if entity.get("isName") == "True":
                # Names can never have duplicates
                entities.append((entity, 0, False))
                continue
            eType = entity.get("type")
            if eType == "neg":
                continue
            if eType not in entitiesByType:
                entitiesByType[eType] = []
            entitiesByType[eType].append(entity)

        # Create slot groups and insert GS data there; a token without typed
        # entities produces no groups.
        for eType in sorted(entitiesByType.keys()):
            # Use first entity of a type as the dummy entity for unfilled slots
            dummyEntity = entitiesByType[eType][0]
            entityGroup = [None, None, None, None]
            for entity in entitiesByType[eType]:
                if entity not in levelByEntity:
                    continue
                level = levelByEntity[entity]
                # Entities with a level past the last slot are left out.
                if level < len(entityGroup):
                    entityGroup[level] = (entity, level, False)
            # Create dummies for potential entities
            for slot, occupant in enumerate(entityGroup):
                if occupant is None:
                    entityGroup[slot] = (dummyEntity, slot, True)
            # Put all slots into one potential entity list
            entities.extend(entityGroup)

    # Generate examples based on interactions between entities
    limitToGenia = "genia_limits" in self.styles
    for i in range(len(entities) - 1):
        for j in range(i + 1, len(entities)):
            eI = entities[i][0]
            eJ = entities[j][0]
            tI = sentenceGraph.entityHeadTokenByEntity[eI]
            tJ = sentenceGraph.entityHeadTokenByEntity[eJ]

            # define forward example
            categoryName = self.getCategoryName(sentenceGraph, entities[i], entities[j], True)
            if (not limitToGenia) or self.isPotentialGeniaInteraction(eI, eJ):
                examples.append(self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, entities[i], entities[j]))
                exampleIndex += 1

            # define reverse
            categoryName = self.getCategoryName(sentenceGraph, entities[j], entities[i], True)
            if (not limitToGenia) or self.isPotentialGeniaInteraction(eJ, eI):
                examples.append(self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, entities[j], entities[i]))
                exampleIndex += 1

    return examples
def analyzeLengths(corpusElements):
    """
    Report the distribution of shortest dependency path lengths in a corpus.

    Two distributions are collected over all sentences:
      - the path length between the head tokens of the two entities of
        every interaction;
      - the path length between the head tokens of every unordered pair of
        entities in a sentence.
    The length is the number of edges on the shortest path in the undirected
    dependency graph. Pairs with no path are counted under the key "none";
    in the all-entities distribution, two entities sharing a head token that
    has no path entry are counted under length 0.

    The edge totals and both distributions are printed to stderr. When the
    module-level options.output is set, each distribution is also passed to
    TableUtils.addToCSV together with "corpus" and "parse" values taken from
    options.input and options.parse.

    Parameters:
        corpusElements: object whose sentences attribute lists the sentences
            to analyze. Each sentence must provide sentenceGraph,
            interactions, entities and entitiesById.

    Returns:
        None.
    """
    def increment(counts, key):
        # Add one to counts[key], starting from zero for a new key.
        counts[key] = counts.get(key, 0) + 1

    interactionEdges = 0
    dependencyEdges = 0
    pathsByLength = {}
    pathsBetweenAllEntitiesByLength = {}
    for sentence in corpusElements.sentences:
        sentenceGraph = sentence.sentenceGraph
        interactionEdges += len(sentence.interactions)
        dependencyEdges += len(sentenceGraph.dependencyGraph.edges())

        undirected = sentenceGraph.dependencyGraph.to_undirected()
        # Indexed below as paths[sourceToken][targetToken].
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)

        # Shortest path for interaction edge
        for interaction in sentence.interactions:
            e1 = sentence.entitiesById[interaction.attrib["e1"]]
            e2 = sentence.entitiesById[interaction.attrib["e2"]]
            t1 = sentenceGraph.entityHeadTokenByEntity[e1]
            t2 = sentenceGraph.entityHeadTokenByEntity[e2]
            if t1 in paths and t2 in paths[t1]:
                # A path lists its nodes, so its edge count is one less.
                increment(pathsByLength, len(paths[t1][t2]) - 1)
            else:
                increment(pathsByLength, "none")

        # Shortest paths between all entities
        for i in range(len(sentence.entities) - 1):
            for j in range(i + 1, len(sentence.entities)):
                tI = sentenceGraph.entityHeadTokenByEntity[sentence.entities[i]]
                tJ = sentenceGraph.entityHeadTokenByEntity[sentence.entities[j]]
                if tI in paths and tJ in paths[tI]:
                    increment(pathsBetweenAllEntitiesByLength, len(paths[tI][tJ]) - 1)
                elif tI == tJ:
                    increment(pathsBetweenAllEntitiesByLength, 0)
                else:
                    increment(pathsBetweenAllEntitiesByLength, "none")

    print >> sys.stderr, "Interaction edges:", interactionEdges
    print >> sys.stderr, "Dependency edges:", dependencyEdges

    print >> sys.stderr, "Shortest path of dependencies for interaction edge:"
    printPathDistribution(pathsByLength)
    if options.output is not None:
        # The metadata is added after printing so it does not show up in the
        # printed distribution.
        pathsByLength["corpus"] = options.input
        pathsByLength["parse"] = options.parse
        TableUtils.addToCSV(pathsByLength, options.output + "/pathsByLength.csv")

    print >> sys.stderr, "Shortest path of dependencies between all entities:"
    printPathDistribution(pathsBetweenAllEntitiesByLength)
    if options.output is not None:
        # Tag the row that is actually written; previously the metadata was
        # stored in pathsByLength again, leaving this CSV row without its
        # corpus and parse columns.
        pathsBetweenAllEntitiesByLength["corpus"] = options.input
        pathsBetweenAllEntitiesByLength["parse"] = options.parse
        TableUtils.addToCSV(pathsBetweenAllEntitiesByLength, options.output + "/pathsBetweenAllEntitiesByLength.csv")
def buildExamples(self, sentenceGraph):
    """
    Build the event examples of one sentence.

    Tokens that match the gazetteer and are not named entity head tokens
    are the candidate triggers. Each trigger is paired with argument
    combinations:

    - if its gazetteer entry has a regulation type, every pairing from
      combine.combine(allTokens, allTokens + [None])
    - otherwise every name token paired with None
    - additionally, if the entry has "Binding", every unordered pair of
      name tokens (as a tuple in the first slot) paired with None

    Each combination is looked up with self.getGSEventType, checked
    against a series of filters and, unless skipped or sampled away by
    self.negFrac, turned into an example with self.buildExample.

    Side effects: calls self.makeGSEvents, updates
    self.examplesByEventOrigId, self.skippedByTypeAndReason,
    self.skippedByType and self.builtByType, and sets self.gsEvents to
    None before returning.

    Returns the list of built examples.
    """
    self.makeGSEvents(sentenceGraph)
    self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
    self.triggerFeatureBuilder.initSentence(sentenceGraph)

    examples = []
    exampleIndex = 0

    undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
    paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)

    eventTokens = []
    nameTokens = []
    # None is a legal argument slot, so it needs a gazetteer entry too
    gazCategories = {None: {"neg": -1}}
    for token in sentenceGraph.tokens:
        gazText = self.getGazetteerMatch(token.get("text").lower())
        if gazText is not None:
            gazCategories[token] = self.gazetteer[gazText]
        else:
            gazCategories[token] = {"neg": -1}
        if token.get("id") in self.namedEntityHeadTokenIds:
            nameTokens.append(token)
        elif gazText is not None:
            eventTokens.append(token)
    allTokens = eventTokens + nameTokens

    regulationTypes = ("Regulation", "Positive_regulation", "Negative_regulation")
    for token in eventTokens:
        tokenCategories = gazCategories[token]
        potentialRegulation = any(key in tokenCategories for key in regulationTypes)
        potentialBinding = "Binding" in tokenCategories

        if potentialRegulation:
            combinations = combine.combine(allTokens, allTokens + [None])
        else:
            combinations = [(t2, None) for t2 in nameTokens]
        if potentialBinding:
            for i in range(len(nameTokens) - 1):
                for j in range(i + 1, len(nameTokens)):
                    combinations.append(((nameTokens[i], nameTokens[j]), None))

        for combination in combinations:
            # A list/tuple in the first slot marks a two-argument Binding
            # candidate; otherwise the slot holds a single token
            theme2Binding = isinstance(combination[0], (list, tuple))
            if theme2Binding:
                categoryName, eventIds = self.getGSEventType(
                    sentenceGraph, token, combination[0], [combination[1]])
            else:
                categoryName, eventIds = self.getGSEventType(
                    sentenceGraph, token, [combination[0]], [combination[1]])
            for eventId in eventIds:
                self.examplesByEventOrigId[eventId] += 1

            skip = False
            s = self.skippedByTypeAndReason
            if categoryName not in s:
                s[categoryName] = {}
            reasons = s[categoryName]

            # The trigger used as its own argument
            if combination[0] == token or combination[1] == token:
                if theme2Binding or gazCategories[combination[0]].get("Positive_regulation", -1) < 0:
                    skip = True
                    reasons["duparg"] = reasons.get("duparg", 0) + 1
            # No arguments at all
            if combination[0] is None and combination[1] is None:
                skip = True
                reasons["noncmb"] = reasons.get("noncmb", 0) + 1
            # isValidEvent returns "OK" or the name of the failed check,
            # which is used directly as the skip reason
            validCat = self.isValidEvent(paths, sentenceGraph, token, combination)
            if validCat != "OK":
                skip = True
                reasons[validCat] = reasons.get(validCat, 0) + 1
            if not nameTokens:
                skip = True
                reasons["non"] = reasons.get("non", 0) + 1
            # Arguments whose gazetteer entry is (almost) purely negative
            if theme2Binding:
                argumentTokens = (combination[0][0], combination[0][1])
            else:
                argumentTokens = (combination[0], combination[1])
            if (gazCategories[argumentTokens[0]].get("neg", -1) > 0.99
                    or gazCategories[argumentTokens[1]].get("neg", -1) > 0.99):
                skip = True
                reasons["gazarg"] = reasons.get("gazarg", 0) + 1

            # With negative sampling enabled (negFrac set), only negative
            # examples are dropped by the filters above
            if skip and (self.negFrac is None or categoryName == "neg"):
                self.skippedByType[categoryName] = self.skippedByType.get(categoryName, 0) + 1
            elif self.negFrac is None or categoryName != "neg" or self.negRand.random() < self.negFrac:
                self.builtByType[categoryName] = self.builtByType.get(categoryName, 0) + 1
                if theme2Binding:
                    newExample = self.buildExample(
                        exampleIndex, sentenceGraph, paths, token, combination[0], [combination[1]])
                else:
                    newExample = self.buildExample(
                        exampleIndex, sentenceGraph, paths, token, [combination[0]], [combination[1]])
                if len(eventIds) > 0:
                    newExample[3]["numEv"] = str(len(eventIds))
                examples.append(newExample)
                exampleIndex += 1

    self.gsEvents = None
    return examples
def analyzeLengths(corpusElements):
    """
    Count shortest dependency path lengths over a corpus and report them.

    Per sentence, all-pairs shortest paths are computed on the undirected
    dependency graph. The length (number of edges) of the path between
    entity head tokens is then tallied for

    - each interaction (its "e1" and "e2" entities), and
    - each unordered pair of the sentence's entities.

    Missing paths are tallied under "none". For entity pairs, two entities
    with the same head token count as length 0 even when that token is
    absent from the path table.

    Totals go to stderr, the distributions to printPathDistribution and,
    when the module-level options.output is set, to TableUtils.addToCSV
    together with the "corpus" (options.input) and "parse"
    (options.parse) tags.
    """
    def increment(distribution, key):
        # Add one observation for key, creating the counter on first use
        distribution[key] = distribution.get(key, 0) + 1

    def pathLength(paths, source, target):
        # Edge count of the stored path, or None if there is no path
        if source in paths and target in paths[source]:
            return len(paths[source][target]) - 1
        return None

    interactionEdges = 0
    dependencyEdges = 0
    pathsByLength = {}
    pathsBetweenAllEntitiesByLength = {}
    for sentence in corpusElements.sentences:
        sentenceGraph = sentence.sentenceGraph
        interactionEdges += len(sentence.interactions)
        dependencyEdges += len(sentenceGraph.dependencyGraph.edges())

        undirected = sentenceGraph.dependencyGraph.to_undirected()
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)

        # Shortest path for interaction edge
        for interaction in sentence.interactions:
            e1 = sentence.entitiesById[interaction.attrib["e1"]]
            e2 = sentence.entitiesById[interaction.attrib["e2"]]
            t1 = sentenceGraph.entityHeadTokenByEntity[e1]
            t2 = sentenceGraph.entityHeadTokenByEntity[e2]
            length = pathLength(paths, t1, t2)
            if length is None:
                length = "none"
            increment(pathsByLength, length)

        # Shortest paths between all entities
        for i in range(len(sentence.entities) - 1):
            for j in range(i + 1, len(sentence.entities)):
                tI = sentenceGraph.entityHeadTokenByEntity[sentence.entities[i]]
                tJ = sentenceGraph.entityHeadTokenByEntity[sentence.entities[j]]
                length = pathLength(paths, tI, tJ)
                if length is None:
                    if tI == tJ:
                        length = 0
                    else:
                        length = "none"
                increment(pathsBetweenAllEntitiesByLength, length)

    print >> sys.stderr, "Interaction edges:", interactionEdges
    print >> sys.stderr, "Dependency edges:", dependencyEdges

    print >> sys.stderr, "Shortest path of dependencies for interaction edge:"
    printPathDistribution(pathsByLength)
    if options.output is not None:
        pathsByLength["corpus"] = options.input
        pathsByLength["parse"] = options.parse
        TableUtils.addToCSV(pathsByLength, options.output + "/pathsByLength.csv")

    print >> sys.stderr, "Shortest path of dependencies between all entities:"
    printPathDistribution(pathsBetweenAllEntitiesByLength)
    if options.output is not None:
        # Fixed: the tags belong on the dictionary written below; they were
        # previously set on pathsByLength again, leaving this CSV row
        # without its corpus/parse columns
        pathsBetweenAllEntitiesByLength["corpus"] = options.input
        pathsBetweenAllEntitiesByLength["parse"] = options.parse
        TableUtils.addToCSV(pathsBetweenAllEntitiesByLength, options.output + "/pathsBetweenAllEntitiesByLength.csv")
def buildExamples(self, sentenceGraph):
    """
    Build directed pair examples for one sentence.

    Candidate pairs are taken over the sentence's entities when
    "entities" is in self.styles, otherwise over its tokens; a pair of an
    item with itself is included. Entity pairs where either entity has
    type "neg" are ignored. Examples are only generated when "directed"
    is in self.styles: for each pair first the forward (I -> J) and then
    the reverse (J -> I) direction is considered.

    Per direction the category is resolved, negatives are subsampled with
    self.negFrac / self.negRand, and the "genia_limits", "bioinfer_limits"
    (reverse direction only) and POS pair filters are applied. Every
    decision is recorded in self.exampleStats.

    Returns the list of examples produced by self.buildExample.
    """
    examples = []
    clearGraph = sentenceGraph.getCleared()
    undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
    paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
    self.triggerFeatureBuilder.initSentence(clearGraph)

    def considerDirection(tFrom, tTo, eFrom, eTo, useBioInferLimits):
        # Handle one direction of a candidate pair, appending to examples.
        # The running example index always equals len(examples).
        if "entities" in self.styles:
            categoryName = self.getCategoryName(sentenceGraph, eFrom, eTo, True)
        else:
            categoryName = self.getCategoryNameFromTokens(sentenceGraph, tFrom, tTo, True)
        self.exampleStats.beginExample(categoryName)
        # The random number is drawn only for negatives when sampling is on
        if self.negFrac is None or categoryName != "neg" or self.negRand.random() < self.negFrac:
            makeExample = True
            if ("genia_limits" in self.styles) and not self.isPotentialGeniaInteraction(eFrom, eTo):
                makeExample = False
                self.exampleStats.filter("genia_limits")
            if useBioInferLimits:
                if ("bioinfer_limits" in self.styles) and not self.isPotentialBioInferInteraction(eFrom, eTo, categoryName):
                    makeExample = False
                    self.exampleStats.filter("bioinfer_limits")
            if self.posPairGaz.getNegFrac((tFrom.get("POS"), tTo.get("POS"))) == 1.0:
                makeExample = False
                self.exampleStats.filter("pos_pair")
            if makeExample:
                if sentenceGraph.tokenIsName[tFrom]:
                    self.exampleStats.filter("genia_token_limits")
                else:
                    examples.append(
                        self.buildExample(tFrom, tTo, paths, clearGraph, categoryName, len(examples), eFrom, eTo))
        else:
            self.exampleStats.filter("neg_frac")
        self.exampleStats.endExample()

    # Generate examples based on interactions between entities or interactions between tokens
    if "entities" in self.styles:
        loopRange = len(sentenceGraph.entities)
    else:
        loopRange = len(sentenceGraph.tokens)

    for i in range(loopRange):
        for j in range(i, loopRange):  # starting from i allows self-interactions
            eI = eJ = None
            if "entities" in self.styles:
                eI = sentenceGraph.entities[i]
                eJ = sentenceGraph.entities[j]
                tI = sentenceGraph.entityHeadTokenByEntity[eI]
                tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                if eI.get("type") == "neg" or eJ.get("type") == "neg":
                    continue
            else:
                tI = sentenceGraph.tokens[i]
                tJ = sentenceGraph.tokens[j]

            if "directed" not in self.styles:
                # Undirected example generation is not implemented
                continue
            considerDirection(tI, tJ, eI, eJ, False)  # forward
            considerDirection(tJ, tI, eJ, eI, True)   # reverse
    return examples
def buildExamplesInner(self, sentenceGraph, goldGraph):
    """
    Build one example for each token of the sentence

    Each example is a tuple (id, category, features, extra) where id is
    the sentence id followed by ".x" and a running index, features maps
    feature ids from self.featureSet to values and extra describes the
    token. The category comes from the gold entities whose head offset
    equals the token's charOffset, or is 1 when there are none (always
    the case when goldGraph is None).

    Name tokens are skipped unless "names" or "all_tokens" is in
    self.styles. With the "exclude_gazetteer" style, tokens missing from
    a non-empty gazetteer get a minimal example marked as excluded.

    Returns an empty list, after printing a notice to stderr, when the
    sentence's origId is in self.skiplist.

    Side effects: sets self.interactionLengths, self.tokenFeatures,
    self.inEdgesByToken, self.outEdgesByToken, self.edgeSetByToken and
    self.features.
    """
    if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
        print >>sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get("origId")
        return []

    self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
    self.triggerFeatureBuilder.initSentence(sentenceGraph)

    undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
    paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
    # Get argument order: interaction length entries sorted by precedence
    self.interactionLengths = self.getInteractionEdgeLengths(sentenceGraph, paths)
    self.interactionLengths = self.interactionLengths.values()
    self.interactionLengths.sort(compareInteractionPrecedence)
    # Map tokens to entities. This also verifies that the gold graph has the
    # same tokenization.
    # NOTE(review): tokenByOffset is not read again in this function.
    tokenByOffset = {}
    for i in range(len(sentenceGraph.tokens)):
        token = sentenceGraph.tokens[i]
        if goldGraph != None:
            goldToken = goldGraph.tokens[i]
            assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
        tokenByOffset[token.get("charOffset")] = token.get("id")
    # Map gold entities to their head offsets
    goldEntitiesByOffset = {}
    for token in sentenceGraph.tokens:
        goldEntitiesByOffset[token.get("charOffset")] = []
    # NOTE(review): entityToGold is filled below but not read again in this
    # function.
    entityToGold = {}
    for entity in sentenceGraph.entities:
        entityToGold[entity] = []
    if goldGraph != None:
        for entity in goldGraph.entities:
            offset = entity.get("headOffset")
            assert offset != None
            goldEntitiesByOffset[offset].append(entity)
        # Map predicted entities to gold entities (same head offset and type)
        for entity in sentenceGraph.entities:
            eType = entity.get("type")
            eOffset = entity.get("headOffset")
            for goldEntity in goldEntitiesByOffset[eOffset]:
                if goldEntity.get("type") == eType:
                    entityToGold[entity].append(goldEntity)
    # Map entities to interactions
    # interactionsByEntityId = {}
    # for entity in sentenceGraph.entities:
    #     interactionsByEntityId[entity.get("id")] = []
    # Map tokens to interactions: each non-"neg" interaction is filed under
    # the head token of its e1 entity, in precedence order
    interactionsByToken = {}
    for token in sentenceGraph.tokens:
        interactionsByToken[token] = []
    for interactionTuple in self.interactionLengths:
        interaction = interactionTuple[0]
        if interaction.get("type") == "neg":
            continue
        e1Id = interaction.get("e1")
        token = sentenceGraph.entityHeadTokenByEntity[sentenceGraph.entitiesById[e1Id]]
        interactionsByToken[token].append(interaction)

    examples = []
    exampleIndex = 0

    self.tokenFeatures = {}

    # namedEntityNorStrings = set()
    namedEntityHeadTokens = []
    if not "names" in self.styles:
        # The number of named entities in the sentence becomes a feature
        # shared by all of its examples
        namedEntityCount = 0
        for entity in sentenceGraph.entities:
            if entity.get("isName") == "True": # known data which can be used for features
                namedEntityCount += 1
                # namedEntityNorStrings.add( entity.get("text").replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower() )
        namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
        # if namedEntityCount == 0: # no names, no need for triggers
        #     return []

        if "pos_pairs" in self.styles:
            namedEntityHeadTokens = self.getNamedEntityHeadTokens(sentenceGraph)

    # neFeatures = {} # F: 69.35 -> 69.14
    # for norString in namedEntityNorStrings:
    #     neFeatures[self.featureSet.getId("norNE_" + norString)] = 1

    # Sentence-level bag of words; name tokens are additionally counted
    # under a "ne_" prefixed key
    bagOfWords = {}
    for token in sentenceGraph.tokens:
        text = "bow_" + token.get("text")
        if not bagOfWords.has_key(text):
            bagOfWords[text] = 0
        bagOfWords[text] += 1
        if sentenceGraph.tokenIsName[token]:
            text = "ne_" + text
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
    # Resolve the feature ids once, so they can be copied to every example
    bowFeatures = {}
    for k, v in bagOfWords.iteritems():
        bowFeatures[self.featureSet.getId(k)] = v

    # Cache the dependency edges of each token as (source, target, element)
    # tuples, sorted with compareDependencyEdgesById
    self.inEdgesByToken = {}
    self.outEdgesByToken = {}
    self.edgeSetByToken = {}
    for token in sentenceGraph.tokens:
        inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
        fixedInEdges = []
        for edge in inEdges:
            fixedInEdges.append((edge[0], edge[1], edge[2]["element"]))
        inEdges = fixedInEdges
        inEdges.sort(compareDependencyEdgesById)
        self.inEdgesByToken[token] = inEdges
        outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True)
        fixedOutEdges = []
        for edge in outEdges:
            fixedOutEdges.append((edge[0], edge[1], edge[2]["element"]))
        outEdges = fixedOutEdges
        outEdges.sort(compareDependencyEdgesById)
        self.outEdgesByToken[token] = outEdges
        self.edgeSetByToken[token] = set(inEdges + outEdges)

    for i in range(len(sentenceGraph.tokens)):
        token = sentenceGraph.tokens[i]
        # Recognize only non-named entities (i.e. interaction words)
        if sentenceGraph.tokenIsName[token] and not "names" in self.styles and not "all_tokens" in self.styles:
            continue

        # CLASS
        # if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
        #     category = self.classSet.getId(self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token]))
        # else:
        #     category = 1
        offset = token.get("charOffset")
        if len(goldEntitiesByOffset[offset]) > 0:
            category = self.classSet.getId(self.getMergedEntityType(goldEntitiesByOffset[offset]))
        else:
            category = 1

        tokenText = token.get("text").lower()
        if "stem_gazetteer" in self.styles:
            tokenText = PorterStemmer.stem(tokenText)
        if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer:
            # Tokens unknown to the gazetteer get a single-feature example
            # flagged as excluded, and no further features
            features = {}
            features[self.featureSet.getId("exclude_gazetteer")] = 1
            extra = {"xtype": "token", "t": token.get("id"), "excluded": "True"}
            examples.append((sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra))
            exampleIndex += 1
            continue

        # FEATURES
        features = {}
        # Also exposed on the instance for the duration of this token
        self.features = features

        if not "names" in self.styles:
            features[self.featureSet.getId(namedEntityCountFeature)] = 1
        # for k,v in bagOfWords.iteritems():
        #     features[self.featureSet.getId(k)] = v
        # pre-calculate bow _features_
        features.update(bowFeatures)
        # features.update(neFeatures)

        # for j in range(len(sentenceGraph.tokens)):
        #     text = "bow_" + sentenceGraph.tokens[j].get("text")
        #     if j < i:
        #         features[self.featureSet.getId("bf_" + text)] = 1
        #     elif j > i:
        #         features[self.featureSet.getId("af_" + text)] = 1

        # Main features
        text = token.get("text")
        features[self.featureSet.getId("txt_" + text)] = 1
        features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
        stem = PorterStemmer.stem(text)
        features[self.featureSet.getId("stem_" + stem)] = 1
        # The part of the text remaining after the first len(stem) characters
        features[self.featureSet.getId("nonstem_" + text[len(stem) :])] = 1

        # Normalized versions of the string (if same as non-normalized, overlap without effect)
        normalizedText = (
            text.replace("-", "").replace("/", "").replace(",", "").replace("\\", "").replace(" ", "").lower()
        )
        if normalizedText == "bound": # should be for all irregular verbs
            normalizedText = "bind"
        features[self.featureSet.getId("txt_" + normalizedText)] = 1
        norStem = PorterStemmer.stem(normalizedText)
        features[self.featureSet.getId("stem_" + norStem)] = 1
        features[self.featureSet.getId("nonstem_" + normalizedText[len(norStem) :])] = 1

        if "gazetteer_features_maintoken" in self.styles:
            tokTxtLower = text.lower()
            if "stem_gazetteer" in self.styles:
                tokTxtLower = PorterStemmer.stem(tokTxtLower)
            if self.gazetteer and tokTxtLower in self.gazetteer:
                for label, weight in self.gazetteer[tokTxtLower].items():
                    features[self.featureSet.getId("gaz_knownLabel_" + label)] = weight # 1 performs slightly worse

        # Linear order features
        # for index in [-3,-2,-1,1,2,3,4,5]: # 69.35 -> 68.97
        # NOTE(review): "i + index > 0" never selects the token at index 0
        # as a neighbor; confirm whether ">= 0" was intended before changing
        # it, as that would alter the generated features.
        for index in [-3, -2, -1, 1, 2, 3]:
            if i + index > 0 and i + index < len(sentenceGraph.tokens):
                self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)

        # Content
        # The first token of the sentence (i == 0) never gets upper_case_start
        if i > 0 and text[0].isalpha() and text[0].isupper():
            features[self.featureSet.getId("upper_case_start")] = 1
        for j in range(len(text)):
            if j > 0 and text[j].isalpha() and text[j].isupper():
                features[self.featureSet.getId("upper_case_middle")] = 1
            # numbers and special characters
            if text[j].isdigit():
                features[self.featureSet.getId("has_digits")] = 1
                if j > 0 and text[j - 1] == "-":
                    features[self.featureSet.getId("has_hyphenated_digit")] = 1
            elif text[j] == "-":
                features[self.featureSet.getId("has_hyphen")] = 1
            elif text[j] == "/":
                features[self.featureSet.getId("has_fslash")] = 1
            elif text[j] == "\\":
                features[self.featureSet.getId("has_bslash")] = 1
            # duplets (lowercased character bigram ending at j)
            if j > 0:
                features[self.featureSet.getId("dt_" + text[j - 1 : j + 1].lower())] = 1
            # triplets (lowercased character trigram ending at j)
            if j > 1:
                features[self.featureSet.getId("tt_" + text[j - 2 : j + 1].lower())] = 1

        # Attached edges (Hanging in and out edges)
        t1InEdges = self.inEdgesByToken[token]
        for edge in t1InEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
            features[self.featureSet.getId("t1HIn_" + edge[0].get("POS"))] = 1
            features[self.featureSet.getId("t1HIn_" + edgeType + "_" + edge[0].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[0])
            features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
            features[self.featureSet.getId("t1HIn_" + edgeType + "_" + tokenText)] = 1
        t1OutEdges = self.outEdgesByToken[token]
        for edge in t1OutEdges:
            edgeType = edge[2].get("type")
            features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
            features[self.featureSet.getId("t1HOut_" + edge[1].get("POS"))] = 1
            features[self.featureSet.getId("t1HOut_" + edgeType + "_" + edge[1].get("POS"))] = 1
            tokenText = sentenceGraph.getTokenText(edge[1])
            features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
            features[self.featureSet.getId("t1HOut_" + edgeType + "_" + tokenText)] = 1

        # The example holds a reference to the features dictionary, so the
        # features added to that dictionary by the calls below are part of
        # the example as well
        extra = {"xtype": "token", "t": token.get("id")}
        examples.append((sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra))
        exampleIndex += 1

        # chains
        self.buildChains(token, sentenceGraph, features)

        if "pos_pairs" in self.styles:
            self.buildPOSPairs(token, namedEntityHeadTokens, features)

        # Not passed the features dictionary; presumably works through
        # self.features set above - TODO confirm
        self.buildPredictionFeatures(sentenceGraph, paths, token, interactionsByToken[token])
    return examples