class MultiEdgeExampleBuilder(ExampleBuilder):
    """
    This example builder makes edge examples, i.e. examples describing
    the event arguments.
    """
    def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
        """
        Initialize the example builder.

        style      -- parameter string/dict understood by ExampleBuilder.getParameters
        length     -- allowed path lengths (currently must be None, see assert below)
        types      -- edge types to include (empty means all); None is treated as []
        featureSet -- IdSet of feature names (created if None)
        classSet   -- IdSet of class names (created if None); "neg" must map to 1,
                      or to -1 in a strictly binary class set
        """
        if featureSet == None:
            featureSet = IdSet()
        if classSet == None:
            classSet = IdSet(1)
        assert( classSet.getId("neg") == 1 or (len(classSet.Ids) == 2 and classSet.getId("neg") == -1) )
        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)

        # Declare the recognized style parameters (duplicates removed).
        self.styles = self.getParameters(style, [
            "typed", "directed", "headsOnly", "graph_kernel", "noAnnType", "noMasking",
            "maxFeatures", "genia_limits", "epi_limits", "id_limits", "rel_limits",
            "bb_limits", "bi_limits", "co_limits", "genia_task1", "ontology", "nodalida",
            "bacteria_renaming", "trigger_features", "rel_features", "ddi_features",
            "evex", "giuliano", "random", "themeOnly", "causeOnly", "no_path", "entities",
            "skip_extra_triggers", "no_task", "no_dependency",
            "disable_entity_features", "disable_terminus_features",
            "disable_single_element_features", "disable_ngram_features",
            "disable_path_edge_features", "no_linear", "subset", "binary", "pos_only",
            "entity_type"])
        if style == None: # no parameters given
            # BUGFIX: the original assigned into the 'style' variable, which is
            # None in this branch and would raise a TypeError. The defaults
            # belong in the parsed style dictionary.
            self.styles["typed"] = self.styles["directed"] = self.styles["headsOnly"] = True
#        if "selftrain_group" in self.styles:
#            self.selfTrainGroups = set()
#            if "selftrain_group-1" in self.styles:
#                self.selfTrainGroups.add("-1")
#            if "selftrain_group0" in self.styles:
#                self.selfTrainGroups.add("0")
#            if "selftrain_group1" in self.styles:
#                self.selfTrainGroups.add("1")
#            if "selftrain_group2" in self.styles:
#                self.selfTrainGroups.add("2")
#            if "selftrain_group3" in self.styles:
#                self.selfTrainGroups.add("3")
#            print >> sys.stderr, "Self-train-groups:", self.selfTrainGroups

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        # NOTE Temporarily re-enabling predicted range
        #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None)
        if self.styles["graph_kernel"]:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
        if self.styles["noAnnType"]:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if self.styles["noMasking"]:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if self.styles["maxFeatures"]:
            self.multiEdgeFeatureBuilder.maximum = True
        if self.styles["genia_task1"]:
            self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity")
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        if self.styles["ontology"]:
            self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
        if self.styles["nodalida"]:
            self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
        if self.styles["bacteria_renaming"]:
            self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(self.featureSet)
        if self.styles["trigger_features"]:
            self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
            self.triggerFeatureBuilder.useNonNameEntities = True
            if self.styles["genia_task1"]:
                self.triggerFeatureBuilder.filterAnnTypes.add("Entity")
            #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(featureSet)
        if self.styles["ddi_features"]:
            self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
        if self.styles["evex"]:
            self.evexFeatureBuilder = EVEXFeatureBuilder(featureSet)
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)

        self.pathLengths = length
        assert(self.pathLengths == None)
        # BUGFIX: 'types' had a mutable default argument ([]); use a None
        # sentinel instead so instances never share the list.
        self.types = types if types != None else []
        if self.styles["random"]:
            from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
            self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)

    def definePredictedValueRange(self, sentences, elementName):
        """Delegate predicted-value-range definition to the edge feature builder."""
        self.multiEdgeFeatureBuilder.definePredictedValueRange(sentences, elementName)

    def getPredictedValueRange(self):
        """Return the predicted value range of the edge feature builder."""
        return self.multiEdgeFeatureBuilder.predictedRange

    def filterEdgesByType(self, edges, typesToInclude):
        """Return the subset of edges whose "type" attribute is in typesToInclude.

        An empty typesToInclude list means no filtering (all edges kept).
        """
        if len(typesToInclude) == 0:
            return edges
        edgesToKeep = []
        for edge in edges:
            if edge.get("type") in typesToInclude:
                edgesToKeep.append(edge)
        return edgesToKeep

    def getCategoryNameFromTokens(self, sentenceGraph, t1, t2, directed=True):
        """
        Example class. Multiple overlapping edges create a merged type.
        """
        types = set()
#        if sentenceGraph.interactionGraph.has_edge(t1, t2):
#            intEdges = sentenceGraph.interactionGraph.get_edge_data(t1, t2, default={})
#            # NOTE: Only works if keys are ordered integers
#            for i in range(len(intEdges)):
#                types.add(intEdges[i]["element"].get("type"))
#        if (not directed) and sentenceGraph.interactionGraph.has_edge(t2, t1):
#            intEdges = sentenceGraph.interactionGraph.get_edge(t2, t1, default={})
#            # NOTE: Only works if keys are ordered integers
#            for i in range(len(intEdges)):
#                types.add(intEdges[i]["element"].get("type"))
        intEdges = sentenceGraph.interactionGraph.getEdges(t1, t2)
        if (not directed):
            intEdges = intEdges + sentenceGraph.interactionGraph.getEdges(t2, t1)
        for intEdge in intEdges:
            types.add(intEdge[2].get("type"))
        types = list(types)
        types.sort()
        # Merge overlapping edge types into a single "A---B" category name.
        categoryName = ""
        for name in types:
            if categoryName != "":
                categoryName += "---"
            categoryName += name
        if categoryName != "":
            return categoryName
        else:
            return "neg"

    def getCategoryName(self, sentenceGraph, e1, e2, directed=True, duplicateEntities=None):
        """
        Example class. Multiple overlapping edges create a merged type.
        """
#        interactions = []
#        e1s = [e1]
#        if duplicateEntities != None and e1 in duplicateEntities:
#            e1s += duplicateEntities[e1]
#        e2s = [e2]
#        if duplicateEntities != None and e2 in duplicateEntities:
#            e2s += duplicateEntities[e2]
#        for entity1 in e1s:
#            for entity2 in e2s:
#                interactions = interactions + sentenceGraph.getInteractions(entity1, entity2)
#                if not directed:
#                    interactions = interactions + sentenceGraph.getInteractions(entity2, entity1)
        # NOTE(review): the 'directed' and 'duplicateEntities' parameters are not
        # used by the active code path; the third argument here is always True.
        # Verify against SentenceGraph.getInteractions whether that is intended.
        interactions = sentenceGraph.getInteractions(e1, e2, True)
        #print interactions
        types = set()
        for interaction in interactions:
            types.add(interaction[2].get("type"))
        types = list(types)
        types.sort()
        categoryName = ""
        for name in types:
            # Optional restriction to a single argument type.
            if self.styles["causeOnly"] and name != "Cause":
                continue
            if self.styles["themeOnly"] and name != "Theme":
                continue
            if categoryName != "":
                categoryName += "---"
            categoryName += name
        if categoryName != "":
            return categoryName
        else:
            return "neg"

    def isPotentialRELInteraction(self, e1, e2):
        """REL task: only Protein -> Entity pairs are candidate edges."""
        if e1.get("type") == "Protein" and e2.get("type") == "Entity":
            return True
        else:
            return False

    def isPotentialBBInteraction(self, e1, e2, sentenceGraph):
        """BB task: restrict candidate pairs by entity-type combinations."""
        #if e1.get("type") == "Bacterium" and e2.get("type") in ["Host", "HostPart", "Geographical", "Environmental", "Food", "Medical", "Soil", "Water"]:
        # Note: "Environment" type is misspelled as "Environmental" in the BB-task documentation
        if e1.get("type") == "Bacterium" and e2.get("type") in ["Host", "HostPart", "Geographical", "Environment", "Food", "Medical", "Soil", "Water"]:
            return True
        elif e1.get("type") == "Host" and e2.get("type") == "HostPart":
            return True
        else:
            return False

    def getBISuperType(self, eType):
        """Map a BI entity type to its supertype ("ProteinEntity"/"GeneEntity"),
        or None for types outside the two groups."""
        if eType in ["GeneProduct", "Protein", "ProteinFamily", "PolymeraseComplex"]:
            return "ProteinEntity"
        elif eType in ["Gene", "GeneFamily", "GeneComplex", "Regulon", "Site", "Promoter"]:
            return "GeneEntity"
        else:
            return None

    def isPotentialBIInteraction(self, e1, e2, sentenceGraph, stats):
        """BI task: restrict candidate pairs by type/supertype combinations.

        Counts rejected pairs in 'stats' under "bi_limits".
        """
        e1Type = e1.get("type")
        e1SuperType = self.getBISuperType(e1Type)
        e2Type = e2.get("type")
        e2SuperType = self.getBISuperType(e2Type)

        tag = "(" + e1Type + "/" + e2Type + ")"
        if e1Type == "Regulon":
            if e2SuperType in ["GeneEntity", "ProteinEntity"]:
                return True
        if e1SuperType == "ProteinEntity":
            if e2Type in ["Site", "Promoter", "Gene", "GeneComplex"]:
                return True
        if e1Type in ["Action", "Transcription", "Expression"]:
            return True
        if e1Type == "Site":
            if e2SuperType == "GeneEntity":
                return True
        if e1Type == "Promoter":
            if e2SuperType in ["GeneEntity", "ProteinEntity"]:
                return True
        if e1SuperType in ["GeneEntity", "ProteinEntity"]:
            if e2SuperType in ["GeneEntity", "ProteinEntity"]:
                return True
        stats.filter("bi_limits") #+tag)
        return False

    def isPotentialEPIInteraction(self, e1, e2, sentenceGraph):
        """EPI task: candidate pairs are trigger -> Protein/Entity, with
        Catalysis triggers allowed to attach to anything except Entity."""
        if e1.get("type") != "Catalysis":
            if e1.get("type") in ["Protein", "Entity"]:
                return False
            elif e2.get("type") in ["Protein", "Entity"]:
                return True
            else:
                return False
        else: # Catalysis
            if e2.get("type") != "Entity":
                return True
            else:
                return False
        assert False, (e1.get("type"), e2.get("type"))

    def isPotentialIDInteraction(self, e1, e2, sentenceGraph):
        """ID task: restrict candidate pairs by trigger type vs. core-entity type."""
        e1Type = e1.get("type")
        e2Type = e2.get("type")
        e1IsCore = e1Type in ["Protein", "Regulon-operon", "Two-component-system", "Chemical", "Organism"]
        e2IsCore = e2Type in ["Protein", "Regulon-operon", "Two-component-system", "Chemical", "Organism"]
        if e1IsCore: # core entities never act as the edge source
            return False
        elif e1Type in ["Gene_expression", "Transcription"]:
            if e2Type in ["Protein", "Regulon-operon"]:
                return True
            else:
                return False
        elif e1Type in ["Protein_catabolism", "Phosphorylation"]:
            if e2Type == "Protein":
                return True
            else:
                return False
        elif e1Type == "Localization":
            if e2IsCore or e2Type == "Entity":
                return True
            else:
                return False
        elif e1Type in ["Binding", "Process"]:
            if e2IsCore:
                return True
            else:
                return False
        elif "egulation" in e1Type: # matches both "Regulation" and "regulation" variants
            if e2Type != "Entity":
                return True
            else:
                return False
        elif e1Type == "Entity":
            if e2IsCore:
                return True
            else:
                return False
        assert False, (e1Type, e2Type)

    def isPotentialCOInteraction(self, e1, e2, sentenceGraph):
        """CO task: an Exp anaphora may point to a preceding Exp antecedent
        (determined by token order), or to any Protein."""
        if e1.get("type") == "Exp" and e2.get("type") == "Exp":
            anaphoraTok = sentenceGraph.entityHeadTokenByEntity[e1]
            antecedentTok = sentenceGraph.entityHeadTokenByEntity[e2]
            antecedentTokenFound = False
            for token in sentenceGraph.tokens:
                if token == antecedentTok:
                    antecedentTokenFound = True
                if token == anaphoraTok: # if, not elif, to take into accoutn cases where e1Tok == e2Tok
                    if antecedentTokenFound:
                        return True
                    else:
                        return False
            assert False
        elif e1.get("type") == "Exp" and e2.get("type") == "Protein":
            return True
        else:
            return False

    def isPotentialGeniaInteraction(self, e1, e2):
        """GENIA task: restrict candidate pairs by trigger type."""
        e1Type = e1.get("type")
        e2Type = e2.get("type")
        if e1Type == "Protein": # plain proteins never act as the edge source
            return False
        elif e1Type in ["Entity", "Gene_expression", "Transcription", "Protein_catabolism", "Phosphorylation", "Binding"]:
            if e2Type == "Protein":
                return True
            else:
                return False
        elif e1Type == "Localization":
            if e2Type in ["Protein", "Entity"]:
                return True
            else:
                return False
        elif "egulation" in e1Type:
            if e2Type != "Entity":
                return True
            else:
                return False
        assert False, (e1Type, e2Type)

    def getGoldCategoryName(self, goldGraph, entityToGold, e1, e2, directed=True):
        """Resolve the category name from the gold graph; "neg" when either
        predicted entity has no gold counterpart."""
        if len(entityToGold[e1]) > 0 and len(entityToGold[e2]) > 0:
            return self.getCategoryName(goldGraph, entityToGold[e1][0], entityToGold[e2][0], directed=directed)
        else:
            return "neg"

    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
        """
        Build examples for a single sentence. Returns a list of examples.
        See Core/ExampleUtils for example format.
        """
        #examples = []
        exampleIndex = 0
        if self.styles["trigger_features"]:
            self.triggerFeatureBuilder.initSentence(sentenceGraph)
        if self.styles["evex"]:
            self.evexFeatureBuilder.initSentence(sentenceGraph)

        # Filter entities, if needed
        #mergedIds = None
        #duplicateEntities = None
        #entities = sentenceGraph.entities
        #entities, mergedIds, duplicateEntities = self.mergeEntities(sentenceGraph, False) # "no_duplicates" in self.styles)
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        entityToDuplicates = sentenceGraph.mergedEntityToDuplicates
        self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))

        # Connect to optional gold graph
        if goldGraph != None:
            entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)

        paths = None
        if not self.styles["no_path"]:
            ##undirected = sentenceGraph.getUndirectedDependencyGraph()
            #undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
            ###undirected = sentenceGraph.dependencyGraph.to_undirected()
            ####undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work
            undirected = sentenceGraph.dependencyGraph.toUndirected()
            #paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
            paths = undirected
        #for edge in sentenceGraph.dependencyGraph.edges:
        #    assert edge[2] != None
        #for edge in undirected.edges:
        #    assert edge[2] != None
        #if sentenceGraph.sentenceElement.get("id") == "GENIA.d70.s5":
        #    print [(x[0].get("id"), x[1].get("id"), x[2].get("id")) for x in sentenceGraph.dependencyGraph.edges]

        # Generate examples based on interactions between entities or interactions between tokens
        if self.styles["entities"]:
            loopRange = len(entities)
        else:
            loopRange = len(sentenceGraph.tokens)
        for i in range(loopRange-1):
            for j in range(i+1, loopRange):
                eI = None
                eJ = None
                if self.styles["entities"]:
                    eI = entities[i]
                    eJ = entities[j]
                    tI = sentenceGraph.entityHeadTokenByEntity[eI]
                    tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                    #if "no_ne_interactions" in self.styles and eI.get("isName") == "True" and eJ.get("isName") == "True":
                    #    continue
                    if eI.get("type") == "neg" or eJ.get("type") == "neg":
                        continue
                    if self.styles["skip_extra_triggers"]:
                        if eI.get("source") != None or eJ.get("source") != None:
                            continue
                else:
                    tI = sentenceGraph.tokens[i]
                    tJ = sentenceGraph.tokens[j]
                # only consider paths between entities (NOTE! entities, not only named entities)
                if self.styles["headsOnly"]:
                    if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                        continue

                if self.styles["directed"]:
                    # define forward
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(sentenceGraph, eI, eJ, True)
                        if goldGraph != None:
                            categoryName = self.getGoldCategoryName(goldGraph, entityToGold, eI, eJ, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, True)
                    # make forward
                    self.exampleStats.beginExample(categoryName)
                    makeExample = True
                    if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(eI, eJ):
                        makeExample = False
                        self.exampleStats.filter("genia_limits")
                    if self.styles["genia_task1"] and (eI.get("type") == "Entity" or eJ.get("type") == "Entity"):
                        makeExample = False
                        self.exampleStats.filter("genia_task1")
                    if self.styles["rel_limits"] and not self.isPotentialRELInteraction(eI, eJ):
                        makeExample = False
                        self.exampleStats.filter("rel_limits")
                    if self.styles["co_limits"] and not self.isPotentialCOInteraction(eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("co_limits")
                    if self.styles["bb_limits"] and not self.isPotentialBBInteraction(eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("bb_limits")
                        if categoryName != "neg":
                            self.exampleStats.filter("bb_limits(" + categoryName + ":" + eI.get("type") + "/" + eJ.get("type") + ")")
                    if self.styles["bi_limits"] and not self.isPotentialBIInteraction(eI, eJ, sentenceGraph, self.exampleStats):
                        makeExample = False
                        #self.exampleStats.filter("bi_limits")
                    if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("epi_limits")
                    if self.styles["id_limits"] and not self.isPotentialIDInteraction(eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("id_limits")
#                    if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_limits")
#                    if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_group")
                    if self.styles["pos_only"] and categoryName == "neg":
                        makeExample = False
                        self.exampleStats.filter("pos_only")
                    if makeExample:
                        #examples.append( self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ) )
                        ExampleUtils.appendExamples([self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ)], outfile)
                        exampleIndex += 1
                    self.exampleStats.endExample()

                    # define reverse
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(sentenceGraph, eJ, eI, True)
                        if goldGraph != None:
                            categoryName = self.getGoldCategoryName(goldGraph, entityToGold, eJ, eI, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tJ, tI, True)
                    # make reverse
                    self.exampleStats.beginExample(categoryName)
                    makeExample = True
                    if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(eJ, eI):
                        makeExample = False
                        self.exampleStats.filter("genia_limits")
                    if self.styles["genia_task1"] and (eI.get("type") == "Entity" or eJ.get("type") == "Entity"):
                        makeExample = False
                        self.exampleStats.filter("genia_task1")
                    if self.styles["rel_limits"] and not self.isPotentialRELInteraction(eJ, eI):
                        makeExample = False
                        self.exampleStats.filter("rel_limits")
                    if self.styles["co_limits"] and not self.isPotentialCOInteraction(eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("co_limits")
                    if self.styles["bb_limits"] and not self.isPotentialBBInteraction(eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("bb_limits")
                        if categoryName != "neg":
                            self.exampleStats.filter("bb_limits(" + categoryName + ":" + eJ.get("type") + "/" + eI.get("type") + ")")
                    if self.styles["bi_limits"] and not self.isPotentialBIInteraction(eJ, eI, sentenceGraph, self.exampleStats):
                        makeExample = False
                        #self.exampleStats.filter("bi_limits")
                    if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("epi_limits")
                    if self.styles["id_limits"] and not self.isPotentialIDInteraction(eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("id_limits")
#                    if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_limits")
#                    if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
#                        makeExample = False
#                        self.exampleStats.filter("selftrain_group")
                    if self.styles["pos_only"] and categoryName == "neg":
                        makeExample = False
                        self.exampleStats.filter("pos_only")
                    if makeExample:
                        #examples.append( self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI) )
                        ExampleUtils.appendExamples([self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI)], outfile)
                        exampleIndex += 1
                    self.exampleStats.endExample()
                else:
                    # Undirected: one example per pair, merging features of both directions.
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(sentenceGraph, eI, eJ, False)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, False)
                    self.exampleStats.beginExample(categoryName)
                    forwardExample = self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ)
                    if not self.styles["graph_kernel"]:
                        reverseExample = self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI)
                        forwardExample[2].update(reverseExample[2])
                    #examples.append(forwardExample)
                    ExampleUtils.appendExamples([forwardExample], outfile)
                    exampleIndex += 1
                    self.exampleStats.endExample()

        #return examples
        return exampleIndex

    def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, exampleIndex, entity1=None, entity2=None):
        """
        Build a single directed example for the potential edge between token1 and token2
        """
        # dummy return for speed testing
        #return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),1,{},{})

        # define features
        features = {}
        if True: #token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2):
            #if token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2):
            #    path = paths[token1][token2]
            #else:
            #    path = [token1, token2]
            if not self.styles["no_path"]:
                # directedPath reduces performance by 0.01 pp
                #directedPath = sentenceGraph.dependencyGraph.getPaths(token1, token2)
                #if len(directedPath) == 0:
                #    directedPath = sentenceGraph.dependencyGraph.getPaths(token2, token1)
                #    for dp in directedPath:
                #        dp.reverse()
                #if len(directedPath) == 0:
                #    path = paths.getPaths(token1, token2)
                #else:
                #    path = directedPath
                path = paths.getPaths(token1, token2)
                if len(path) > 0:
                    #if len(path) > 1:
                    #    print len(path)
                    path = path[0] # use the first shortest path only
                    pathExists = True
                else:
                    path = [token1, token2]
                    pathExists = False
            else:
                path = [token1, token2]
                pathExists = False
            #print token1.get("id"), token2.get("id")
            assert(self.pathLengths == None)
            if self.pathLengths == None or len(path)-1 in self.pathLengths:
#                if not "no_ontology" in self.styles:
#                    self.ontologyFeatureBuilder.setFeatureVector(features)
#                    self.ontologyFeatureBuilder.buildOntologyFeaturesForPath(sentenceGraph, path)
#                    self.ontologyFeatureBuilder.setFeatureVector(None)
                if self.styles["trigger_features"]: # F 85.52 -> 85.55
                    self.triggerFeatureBuilder.setFeatureVector(features)
                    self.triggerFeatureBuilder.tag = "trg1_"
                    self.triggerFeatureBuilder.buildFeatures(token1)
                    self.triggerFeatureBuilder.tag = "trg2_"
                    self.triggerFeatureBuilder.buildFeatures(token2)
                    self.triggerFeatureBuilder.setFeatureVector(None)
                # REL features
                if self.styles["rel_features"] and not self.styles["no_task"]:
                    self.relFeatureBuilder.setFeatureVector(features)
                    self.relFeatureBuilder.tag = "rel1_"
                    self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token1))
                    self.relFeatureBuilder.tag = "rel2_"
                    self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token2))
                    self.relFeatureBuilder.setFeatureVector(None)
                if self.styles["bacteria_renaming"] and not self.styles["no_task"]:
                    self.bacteriaRenamingFeatureBuilder.setFeatureVector(features)
                    self.bacteriaRenamingFeatureBuilder.buildPairFeatures(entity1, entity2)
                    #self.bacteriaRenamingFeatureBuilder.buildSubstringFeatures(entity1, entity2) # decreases perf. 74.76 -> 72.41
                    self.bacteriaRenamingFeatureBuilder.setFeatureVector(None)
                if self.styles["co_limits"] and not self.styles["no_task"]:
                    # Character-offset containment features for coreference.
                    e1Offset = Range.charOffsetToSingleTuple(entity1.get("charOffset"))
                    e2Offset = Range.charOffsetToSingleTuple(entity2.get("charOffset"))
                    if Range.contains(e1Offset, e2Offset):
                        features[self.featureSet.getId("e1_contains_e2")] = 1
                        if entity2.get("isName") == "True":
                            features[self.featureSet.getId("e1_contains_e2name")] = 1
                    if Range.contains(e2Offset, e1Offset):
                        features[self.featureSet.getId("e2_contains_e1")] = 1
                        if entity1.get("isName") == "True":
                            features[self.featureSet.getId("e2_contains_e1name")] = 1
                if self.styles["ddi_features"]:
                    self.drugFeatureBuilder.setFeatureVector(features)
                    self.drugFeatureBuilder.tag = "ddi_"
                    self.drugFeatureBuilder.buildPairFeatures(entity1, entity2)
                    # NOTE(review): "ddi_mtmx" is not in the declared parameter
                    # list in __init__ — confirm getParameters tolerates
                    # undeclared keys before relying on this branch.
                    if self.styles["ddi_mtmx"]:
                        self.drugFeatureBuilder.buildMTMXFeatures(entity1, entity2)
                    self.drugFeatureBuilder.setFeatureVector(None)
                #if "graph_kernel" in self.styles or not "no_dependency" in self.styles:
                #    #print "Getting edges"
                #    if token1 != token2 and pathExists:
                #        #print "g1"
                #        edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path)
                #        #print "g2"
                #    else:
                #        edges = None
                if self.styles["graph_kernel"]:
                    self.graphKernelFeatureBuilder.setFeatureVector(features, entity1, entity2)
                    self.graphKernelFeatureBuilder.buildGraphKernelFeatures(sentenceGraph, path)
                    self.graphKernelFeatureBuilder.setFeatureVector(None)
                if self.styles["entity_type"]:
                    features[self.featureSet.getId("e1_"+entity1.get("type"))] = 1
                    features[self.featureSet.getId("e2_"+entity2.get("type"))] = 1
                    features[self.featureSet.getId("distance_"+str(len(path)))] = 1
                if not self.styles["no_dependency"]:
                    #print "Dep features"
                    self.multiEdgeFeatureBuilder.setFeatureVector(features, entity1, entity2)
                    #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast
                    if not self.styles["disable_entity_features"]:
                        self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
                    self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
                    if not self.styles["disable_terminus_features"]:
                        self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast
                    if not self.styles["disable_single_element_features"]:
                        self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, sentenceGraph)
                    if not self.styles["disable_ngram_features"]:
                        #print "NGrams"
                        self.multiEdgeFeatureBuilder.buildPathGrams(2, path, sentenceGraph) # remove for fast
                        self.multiEdgeFeatureBuilder.buildPathGrams(3, path, sentenceGraph) # remove for fast
                        self.multiEdgeFeatureBuilder.buildPathGrams(4, path, sentenceGraph) # remove for fast
                    #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast
                    #if edges != None:
                    #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast
                    #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast
                    if not self.styles["disable_path_edge_features"]:
                        self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, sentenceGraph)
                    self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
                    self.multiEdgeFeatureBuilder.setFeatureVector(None)
                if self.styles["nodalida"]:
                    self.nodalidaFeatureBuilder.setFeatureVector(features, entity1, entity2)
                    shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(sentenceGraph.dependencyGraph, path)
                    print(shortestPaths)
                    if len(shortestPaths) > 0:
                        self.nodalidaFeatureBuilder.buildNGrams(shortestPaths, sentenceGraph)
                    self.nodalidaFeatureBuilder.setFeatureVector(None)
                if not self.styles["no_linear"]:
                    self.tokenFeatureBuilder.setFeatureVector(features)
                    for i in range(len(sentenceGraph.tokens)):
                        if sentenceGraph.tokens[i] == token1:
                            token1Index = i
                        if sentenceGraph.tokens[i] == token2:
                            token2Index = i
                    linearPreTag = "linfw_"
                    if token1Index > token2Index:
                        token1Index, token2Index = token2Index, token1Index
                        linearPreTag = "linrv_"
                    self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1")
                    self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2")
                    # Before, middle, after
#                    self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf")
#                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw")
#                    self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af")
                    # before-middle, middle, middle-after
#                    self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2)
#                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2)
#                    self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2)
                    self.tokenFeatureBuilder.setFeatureVector(None)
                if self.styles["random"]:
                    self.randomFeatureBuilder.setFeatureVector(features)
                    self.randomFeatureBuilder.buildRandomFeatures(100, 0.01)
                    self.randomFeatureBuilder.setFeatureVector(None)
                if self.styles["genia_limits"] and not self.styles["no_task"]:
                    e1Type = entity1.get("type")
                    e2Type = entity2.get("type")
                    assert(entity1.get("isName") == "False")
                    if entity2.get("isName") == "True":
                        features[self.featureSet.getId("GENIA_target_protein")] = 1
                    else:
                        features[self.featureSet.getId("GENIA_nested_event")] = 1
                    if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization
                        if entity2.get("isName") == "True":
                            features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1
                        else:
                            features[self.featureSet.getId("GENIA_regulation_of_event")] = 1
                if self.styles["bi_limits"]:
                    # Make features based on entity types
                    e1Type = entity1.get("type")
                    e2Type = entity2.get("type")
                    e1SuperType = str(self.getBISuperType(e1Type))
                    e2SuperType = str(self.getBISuperType(e2Type))
                    features[self.featureSet.getId("BI_e1_"+e1Type)] = 1
                    features[self.featureSet.getId("BI_e2_"+e2Type)] = 1
                    features[self.featureSet.getId("BI_e1sup_"+e1SuperType)] = 1
                    features[self.featureSet.getId("BI_e2sup_"+e2SuperType)] = 1
                    features[self.featureSet.getId("BI_e1e2_"+e1Type+"_"+e2Type)] = 1
                    features[self.featureSet.getId("BI_e1e2sup_"+e1SuperType+"_"+e2SuperType)] = 1
                if self.styles["evex"]:
                    self.evexFeatureBuilder.setFeatureVector(features, entity1, entity2)
                    self.evexFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph)
                    self.evexFeatureBuilder.setFeatureVector(None)
                if self.styles["giuliano"]:
                    self.giulianoFeatureBuilder.setFeatureVector(features, entity1, entity2)
                    self.giulianoFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph)
                    self.giulianoFeatureBuilder.setFeatureVector(None)
            else:
                # path length filtered out
                features[self.featureSet.getId("always_negative")] = 1
                if self.styles["subset"]:
                    features[self.featureSet.getId("out_of_scope")] = 1
        else:
            features[self.featureSet.getId("always_negative")] = 1
            if self.styles["subset"]:
                features[self.featureSet.getId("out_of_scope")] = 1
            path = [token1, token2]

        # define extra attributes
        #if int(path[0].get("id").split("_")[-1]) < int(path[-1].get("id").split("_")[-1]):
        if int(path[0].get("charOffset").split("-")[0]) < int(path[-1].get("charOffset").split("-")[0]):
            #extra = {"xtype":"edge","type":"i","t1":path[0],"t2":path[-1]}
            extra = {"xtype":"edge","type":"i","t1":path[0].get("id"),"t2":path[-1].get("id")}
            extra["deprev"] = False
        else:
            #extra = {"xtype":"edge","type":"i","t1":path[-1],"t2":path[0]}
            extra = {"xtype":"edge","type":"i","t1":path[-1].get("id"),"t2":path[0].get("id")}
            extra["deprev"] = True
        if entity1 != None:
            #extra["e1"] = entity1
            extra["e1"] = entity1.get("id")
            if sentenceGraph.mergedEntityToDuplicates != None:
                #extra["e1GoldIds"] = mergedEntityIds[entity1]
                extra["e1DuplicateIds"] = ",".join([x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity1]])
        if entity2 != None:
            #extra["e2"] = entity2
            extra["e2"] = entity2.get("id")
            if sentenceGraph.mergedEntityToDuplicates != None:
                extra["e2DuplicateIds"] = ",".join([x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity2]])
                #extra["e2GoldIds"] = mergedEntityIds[entity2]
        extra["categoryName"] = categoryName
        if self.styles["bacteria_renaming"]:
            # Pack entity texts into the extra attributes, escaping separators.
            if entity1.get("text") != None and entity1.get("text") != "":
                extra["e1t"] = entity1.get("text").replace(" ", "---").replace(":","-COL-")
            if entity2.get("text") != None and entity2.get("text") != "":
                extra["e2t"] = entity2.get("text").replace(" ", "---").replace(":","-COL-")
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId != None:
            extra["SOID"] = sentenceOrigId

        # make example
        if self.styles["binary"]:
            if categoryName != "neg":
                category = 1
            else:
                category = -1
            categoryName = "i"
        else:
            category = self.classSet.getId(categoryName)

        # NOTE: temporarily disable for replicating 110310 experiment
        #features[self.featureSet.getId("extra_constant")] = 1

        return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex), category, features, extra)
class EntityExampleBuilder(ExampleBuilder):
    """
    Builds one classification example per sentence token, for detecting
    entities/triggers. The example category is derived from the (possibly
    merged) types of the entities whose head the token is, and the feature
    vector combines token-internal, linear-order, bag-of-words and
    dependency-chain features.

    NOTE: Python 2 code (print >> sys.stderr, dict.has_key).
    """
    def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
        """
        style: parameter string/dict parsed via getParameters
        classSet/featureSet: IdSets for category and feature ids (created if None)
        gazetteerFileName: optional gazetteer file loaded via Gazetteer.loadGztr
        skiplist: optional path to a file listing sentence origIds to skip
        """
        if classSet == None:
            classSet = IdSet(1)
        if featureSet == None:
            featureSet = IdSet()
        ExampleBuilder.__init__(self, classSet, featureSet)
        # the negative class must always map to id 1
        assert( classSet.getId("neg") == 1 )
        #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
        if gazetteerFileName!=None:
            self.gazetteer=Gazetteer.loadGztr(gazetteerFileName)
            print >> sys.stderr, "Loaded gazetteer from",gazetteerFileName
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer=None
        # declare the style flags this builder understands, then parse the user's style
        self._setDefaultParameters(["rel_features", "wordnet", "bb_features", "giuliano", "epi_merge_negated",
                                    "limit_merged_types", "genia_task1", "names", "build_for_nameless",
                                    "skip_for_nameless", "pos_only", "all_tokens", "pos_pairs", "linear_ngrams",
                                    "phospho", "drugbank_features", "ddi13_features", "metamap"])
        self.styles = self.getParameters(style)
        # (removed: commented-out "selftrain_group" example-filtering code)
        # sentence origIds listed in the skiplist file are excluded from example generation
        self.skiplist = set()
        if skiplist != None:
            f = open(skiplist, "rt")
            for line in f.readlines():
                self.skiplist.add(line.strip())
            f.close()
        # optional feature builders, instantiated only when their style flag is on
        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(featureSet)
        if self.styles["wordnet"]:
            self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
        if self.styles["bb_features"]:
            self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens()
            #self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens(PhraseTriggerExampleBuilder.getBacteriaNames())
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
        if self.styles["drugbank_features"]:
            self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)

    def getMergedEntityType(self, entities):
        """
        If a single token belongs to multiple entities of different types,
        a new, composite type is defined. This type is the alphabetically
        ordered types of these entities joined with '---'.

        Returns (typeString, idString) where idString is a '/'-joined sorted
        list of the contributing entity ids, or ("neg", None) if no types remain.
        """
        types = set()
        entityIds = set()
        for entity in entities:
            # skip given (gold/known) entities in all_tokens mode
            if entity.get("given") == "True" and self.styles["all_tokens"]:
                continue
            # GENIA task 1 ignores "Entity" annotations
            if entity.get("type") == "Entity" and self.styles["genia_task1"]:
                continue
            if self.styles["epi_merge_negated"]:
                # collapse negated EPI trigger types onto their base type
                types.add(Utils.InteractionXML.ResolveEPITriggerTypes.getEPIBaseType(entity.get("type")))
                entityIds.add(entity.get("id"))
            else:
                types.add(entity.get("type"))
                entityIds.add(entity.get("id"))
        types = list(types)
        types.sort()
        typeString = ""
        for type in types:
            #if type == "Protein" and "all_tokens" in self.styles:
            #    continue
            if typeString != "":
                typeString += "---"
            typeString += type
        if typeString == "":
            return "neg", None
        idString = "/".join(sorted(list(entityIds)))
        if self.styles["limit_merged_types"]:
            if typeString.find("---") != -1:
                if typeString == "Gene_expression---Positive_regulation":
                    # the one combination kept as a merged type
                    return typeString, idString
                else:
                    return typeString.split("---")[0], idString # ids partially incorrect
            else:
                return typeString, idString
        return typeString, idString

    def getMetaMapFeatures(self, token, sentenceGraph, features):
        """
        Add features from MetaMap phrase annotations (read from the sentence's
        <analyses>/<metamap> element) for phrases overlapping the token's span.
        Mutates 'features' in place.
        """
        analyses = sentenceGraph.sentenceElement.find("analyses")
        if analyses == None:
            return
        metamap = analyses.find("metamap")
        if metamap == None:
            return
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        skipAttr = set(["charOffset", "text"]) # positional/textual attributes carry no semantics here
        for phrase in metamap.findall("phrase"):
            phraseOffset = Range.charOffsetToSingleTuple(phrase.get("charOffset"))
            if Range.overlap(tokenOffset, phraseOffset):
                attr = phrase.attrib
                attrNames = sorted(attr.keys())
                for attrName in attrNames:
                    if attrName in skipAttr:
                        continue
                    elif attrName == "score":
                        # scale the (negative) MetaMap score to a small positive weight
                        features["_metamap_score"] = 0.001 * abs(int(attr[attrName]))
                    else:
                        # one binary feature per comma-separated attribute value
                        attrValues = attr[attrName].split(",")
                        for attrValue in attrValues:
                            features["_metamap_"+attrName+"_"+attrValue.replace(" ", "-")] = 1

    def getTokenFeatures(self, token, sentenceGraph):
        """
        Returns a list of features based on the attributes of a token.
        These can be used to define more complex features.

        Returns (sorted feature name list, {feature name: weight}).
        """
        # These features are cached when this method is first called
        # for a token.
        if self.tokenFeatures.has_key(token):
            return self.tokenFeatures[token], self.tokenFeatureWeights[token]
        tokTxt=sentenceGraph.getTokenText(token)
        features = {}
        features["_txt_"+tokTxt]=1
        features["_POS_"+token.get("POS")]=1
        # mark tokens that head a given (known) named entity, unless "names" mode is on
        if sentenceGraph.tokenIsName[token] and not self.styles["names"]:
            features["_given"]=1
            for entity in sentenceGraph.tokenIsEntityHead[token]:
                if entity.get("given") == "True":
                    features["_annType_"+entity.get("type")]=1
        if self.styles["metamap"]:
            self.getMetaMapFeatures(token, sentenceGraph, features)
        # (removed: commented-out gazetteer, BANNER and WordNet feature code)
        self.tokenFeatures[token] = sorted(features.keys())
        self.tokenFeatureWeights[token] = features
        return self.tokenFeatures[token], self.tokenFeatureWeights[token]

    def buildLinearOrderFeatures(self,sentenceGraph,index,tag,features):
        """
        Linear features are built by marking token features with a tag
        that defines their relative position in the linear order.
        """
        tag = "linear_"+tag
        tokenFeatures, tokenFeatureWeights = self.getTokenFeatures(sentenceGraph.tokens[index], sentenceGraph)
        for tokenFeature in tokenFeatures:
            features[self.featureSet.getId(tag+tokenFeature)] = tokenFeatureWeights[tokenFeature]

    def buildLinearNGram(self, i, j, sentenceGraph, features):
        """Add one binary feature for the lowercased token n-gram spanning indices [i, j]."""
        ngram = "ngram"
        for index in range(i, j+1):
            ngram += "_" + sentenceGraph.getTokenText(sentenceGraph.tokens[index]).lower()
        features[self.featureSet.getId(ngram)] = 1

    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
        """
        Build one example for each token of the sentence

        Examples are appended to 'outfile' via ExampleUtils; returns the number
        of examples written.
        """
        if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
            print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get("origId")
            return 0 #[]
        #examples = []
        exampleIndex = 0
        # per-sentence caches for getTokenFeatures
        self.tokenFeatures = {}
        self.tokenFeatureWeights = {}
        # determine (manually or automatically) the setting for whether sentences with no given entities should be skipped
        buildForNameless = False
        if structureAnalyzer and not structureAnalyzer.hasGroupClass("GIVEN", "ENTITY"): # no given entities points to no separate NER program being used
            buildForNameless = True
        if self.styles["build_for_nameless"]: # manually force the setting
            buildForNameless = True
        if self.styles["skip_for_nameless"]: # manually force the setting
            buildForNameless = False
        # determine whether sentences with no given entities should be skipped
        namedEntityHeadTokens = []
        if not self.styles["names"]:
            namedEntityCount = 0
            for entity in sentenceGraph.entities:
                if entity.get("given") == "True": # known data which can be used for features
                    namedEntityCount += 1
            namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
            # NOTE!!! This will change the number of examples and omit
            # all triggers (positive and negative) from sentences which
            # have no NE:s, possibly giving a too-optimistic performance
            # value. Such sentences can still have triggers from intersentence
            # interactions, but as such events cannot be recovered anyway,
            # looking for these triggers would be pointless.
            if namedEntityCount == 0 and not buildForNameless: # no names, no need for triggers
                return 0 #[]
            if self.styles["pos_pairs"]:
                namedEntityHeadTokens = self.getNamedEntityHeadTokens(sentenceGraph)
        else:
            # "names" mode: treat no token as a known name
            for key in sentenceGraph.tokenIsName.keys():
                sentenceGraph.tokenIsName[key] = False
        # sentence-level bag-of-words counts (named-entity tokens counted twice, with an "ne_" prefix)
        bagOfWords = {}
        for token in sentenceGraph.tokens:
            text = "bow_" + token.get("text")
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
            if sentenceGraph.tokenIsName[token]:
                text = "ne_" + text
                if not bagOfWords.has_key(text):
                    bagOfWords[text] = 0
                bagOfWords[text] += 1
        # pre-resolve bag-of-words feature ids once; shared by all examples of this sentence
        bowFeatures = {}
        for k in sorted(bagOfWords.keys()):
            bowFeatures[self.featureSet.getId(k)] = bagOfWords[k]
        # cache per-token dependency edges (also used by buildChains)
        self.inEdgesByToken = {}
        self.outEdgesByToken = {}
        self.edgeSetByToken = {}
        for token in sentenceGraph.tokens:
            inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
            #inEdges.sort(compareDependencyEdgesById)
            self.inEdgesByToken[token] = inEdges
            outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
            #outEdges.sort(compareDependencyEdgesById)
            self.outEdgesByToken[token] = outEdges
            self.edgeSetByToken[token] = set(inEdges + outEdges)
        for i in range(len(sentenceGraph.tokens)):
            token = sentenceGraph.tokens[i]
            # CLASS: category comes from the merged types of entities headed by this token
            if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
                categoryName, entityIds = self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token])
            else:
                categoryName, entityIds = "neg", None
            self.exampleStats.beginExample(categoryName)
            # Recognize only non-named entities (i.e. interaction words)
            if sentenceGraph.tokenIsName[token] and not self.styles["names"] and not self.styles["all_tokens"]:
                self.exampleStats.filter("name")
                self.exampleStats.endExample()
                continue
            # (removed: commented-out "selftrain_limits"/"selftrain_group" filtering code)
            if self.styles["pos_only"] and categoryName == "neg":
                self.exampleStats.filter("pos_only")
                self.exampleStats.endExample()
                continue
            category = self.classSet.getId(categoryName)
            if category == None:
                # merged type not present in a predefined class set
                self.exampleStats.filter("undefined_class")
                self.exampleStats.endExample()
                continue
            # NOTE(review): this lowercased text is only used by the commented-out
            # gazetteer code below and is later shadowed in the edge-feature loops
            tokenText = token.get("text").lower()
            # (removed: commented-out "exclude_gazetteer" example code)
            # FEATURES
            features = {}
            if not self.styles["names"]:
                features[self.featureSet.getId(namedEntityCountFeature)] = 1
            # pre-calculate bow _features_
            features.update(bowFeatures)
            # Main features
            text = token.get("text")
            features[self.featureSet.getId("txt_"+text)] = 1
            features[self.featureSet.getId("POS_"+token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_"+stem)] = 1
            features[self.featureSet.getId("nonstem_"+text[len(stem):])] = 1
            # Normalized versions of the string (if same as non-normalized, overlap without effect)
            normalizedText = text.replace("-","").replace("/","").replace(",","").replace("\\","").replace(" ","").lower()
            if normalizedText == "bound": # should be for all irregular verbs
                normalizedText = "bind"
            features[self.featureSet.getId("txt_"+normalizedText)] = 1
            norStem = PorterStemmer.stem(normalizedText)
            features[self.featureSet.getId("stem_"+norStem)] = 1
            features[self.featureSet.getId("nonstem_"+normalizedText[len(norStem):])] = 1
            # (removed: commented-out subspan feature code)
            # Substring features
            for string in text.split("-"):
                stringLower = string.lower()
                features[self.featureSet.getId("substring_"+stringLower)] = 1
                features[self.featureSet.getId("substringstem_"+PorterStemmer.stem(stringLower))] = 1
            # Linear order features
            # NOTE(review): "i + index > 0" excludes the sentence's first token as a
            # left neighbour; ">= 0" may have been intended -- confirm before changing
            for index in [-3,-2,-1,1,2,3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)
            # Linear n-grams
            if self.styles["linear_ngrams"]:
                self.buildLinearNGram(max(0, i-1), i, sentenceGraph, features)
                self.buildLinearNGram(max(0, i-2), i, sentenceGraph, features)
            if self.styles["phospho"]:
                # "hospho" matches both "phospho..." and "Phospho..."
                if text.find("hospho") != -1:
                    features[self.featureSet.getId("phospho_found")] = 1
                features[self.featureSet.getId("begin_"+text[0:2].lower())] = 1
                features[self.featureSet.getId("begin_"+text[0:3].lower())] = 1
            if self.styles["bb_features"]:
                if text.lower() in self.bacteriaTokens:
                    features[self.featureSet.getId("lpsnBacToken")] = 1
            # Content: capitalization mid-sentence (i > 0 skips the sentence-initial token)
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j-1] == "-":
                        features[self.featureSet.getId("has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_"+text[j-1:j+1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_"+text[j-2:j+1].lower())] = 1
                # quadruplets (don't work, slight decrease (0.5 pp) on f-score
                #if j > 2:
                #    features[self.featureSet.getId("qt_"+text[j-3:j+1].lower())] = 1
            # Attached edges (Hanging in and out edges)
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_"+edgeType)] = 1
                features[self.featureSet.getId("t1HIn_"+edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_"+tokenText)] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenText)] = 1
                tokenStem = PorterStemmer.stem(tokenText)
                features[self.featureSet.getId("t1HIn_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HIn_"+edgeType+"_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HIn_"+norStem+"_"+edgeType+"_"+tokenStem)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_"+edgeType)] = 1
                features[self.featureSet.getId("t1HOut_"+edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_"+tokenText)] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenText)] = 1
                tokenStem = PorterStemmer.stem(tokenText)
                features[self.featureSet.getId("t1HOut_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HOut_"+edgeType+"_"+tokenStem)] = 1
                features[self.featureSet.getId("t1HOut_"+norStem+"_"+edgeType+"_"+tokenStem)] = 1
            # REL features
            if self.styles["rel_features"]:
                self.relFeatureBuilder.setFeatureVector(features)
                self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, i)
                self.relFeatureBuilder.setFeatureVector(None)
            # DDI13 features: every prefix and suffix of the normalized token text
            if self.styles["ddi13_features"]:
                for index in range(len(normalizedText)):
                    features[self.featureSet.getId("ddi13_fromstart" + str(index) + "_" + normalizedText[:index+1])] = 1
                    features[self.featureSet.getId("ddi13_fromend" + str(index) + "_" + normalizedText[index:])] = 1
            if self.styles["drugbank_features"]:
                self.drugFeatureBuilder.setFeatureVector(features)
                self.drugFeatureBuilder.tag = "ddi_"
                self.drugFeatureBuilder.buildDrugFeatures(token)
                self.drugFeatureBuilder.setFeatureVector(None)
            # (removed: commented-out WordNet debugging calls)
            if self.styles["wordnet"]:
                tokTxt = token.get("text")
                tokPOS = token.get("POS")
                wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
                for wordNetFeature in wordNetFeatures:
                    features[self.featureSet.getId("WN_"+wordNetFeature)] = 1
            if self.styles["giuliano"]:
                self.giulianoFeatureBuilder.setFeatureVector(features)
                self.giulianoFeatureBuilder.buildTriggerFeatures(token, sentenceGraph)
                self.giulianoFeatureBuilder.setFeatureVector(None)
            extra = {"xtype":"token","t":token.get("id")}
            if self.styles["bb_features"]:
                extra["trigex"] = "bb" # Request trigger extension in ExampleWriter
            if self.styles["epi_merge_negated"]:
                extra["unmergeneg"] = "epi" # Request trigger type unmerging
            if entityIds != None:
                extra["goldIds"] = entityIds # The entities to which this example corresponds
            # chains
            self.buildChains(token, sentenceGraph, features)
            if self.styles["pos_pairs"]:
                self.buildPOSPairs(token, namedEntityHeadTokens, features)
            example = (sentenceGraph.getSentenceId()+".x"+str(exampleIndex), category, features, extra)
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1
            self.exampleStats.endExample()
        #return examples
        return exampleIndex

    def buildChains(self,token,sentenceGraph,features,depthLeft=3,chain="",visited=None):
        """
        Recursively walk the dependency graph from 'token', up to 'depthLeft'
        steps, adding distance-tagged token and chain features. 'chain' encodes
        the edge-type path walked so far ("-frw_" for in-edges, "-rev_" for
        out-edges); 'visited' holds edges already used, to avoid cycles.
        Mutates 'features' in place.
        """
        if depthLeft == 0:
            return
        strDepthLeft = "dist_" + str(depthLeft)
        if visited == None:
            visited = set()
        inEdges = self.inEdgesByToken[token]
        outEdges = self.outEdgesByToken[token]
        # edges of the current token are barred from deeper recursion levels
        edgeSet = visited.union(self.edgeSetByToken[token])
        for edge in inEdges:
            if not edge in visited:
                edgeType = edge[2].get("type")
                # NOTE(review): in-edges use "dep_" but out-edges below use "dep_dist_";
                # the asymmetry is preserved as-is since feature names define the model
                features[self.featureSet.getId("dep_"+strDepthLeft+edgeType)] = 1
                nextToken = edge[0]
                tokenFeatures, tokenWeights = self.getTokenFeatures(nextToken, sentenceGraph)
                for tokenFeature in tokenFeatures:
                    features[self.featureSet.getId(strDepthLeft + tokenFeature)] = tokenWeights[tokenFeature]
                if sentenceGraph.tokenIsName[nextToken] and not self.styles["names"]:
                    features[self.featureSet.getId("name_chain_dist_"+strDepthLeft+chain+"-frw_"+edgeType)] = 1
                features[self.featureSet.getId("chain_dist_"+strDepthLeft+chain+"-frw_"+edgeType)] = 1
                self.buildChains(nextToken,sentenceGraph,features,depthLeft-1,chain+"-frw_"+edgeType,edgeSet)
        for edge in outEdges:
            if not edge in visited:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("dep_dist_"+strDepthLeft+edgeType)] = 1
                nextToken = edge[1]
                tokenFeatures, tokenWeights = self.getTokenFeatures(nextToken, sentenceGraph)
                for tokenFeature in tokenFeatures:
                    features[self.featureSet.getId(strDepthLeft + tokenFeature)] = tokenWeights[tokenFeature]
                if sentenceGraph.tokenIsName[nextToken] and not self.styles["names"]:
                    features[self.featureSet.getId("name_chain_dist_"+strDepthLeft+chain+"-rev_"+edgeType)] = 1
                features[self.featureSet.getId("chain_dist_"+strDepthLeft+chain+"-rev_"+edgeType)] = 1
                self.buildChains(nextToken,sentenceGraph,features,depthLeft-1,chain+"-rev_"+edgeType,edgeSet)

    def getNamedEntityHeadTokens(self, sentenceGraph):
        """Return the head tokens of all given (known) named entities in the sentence."""
        headTokens = []
        for entity in sentenceGraph.entities:
            if entity.get("given") == "True": # known data which can be used for features
                headTokens.append(sentenceGraph.entityHeadTokenByEntity[entity])
        return headTokens

    def buildPOSPairs(self, token, namedEntityHeadTokens, features):
        """Add a POS-pair feature between 'token' and each named-entity head token."""
        tokenPOS = token.get("POS")
        assert tokenPOS != None
        for headToken in namedEntityHeadTokens:
            headPOS = headToken.get("POS")
            features[self.featureSet.getId("POS_pair_NE_"+tokenPOS+"-"+headPOS)] = 1
class EdgeExampleBuilder(ExampleBuilder): """ This example builder makes edge examples, i.e. examples describing the event arguments. """ def __init__(self, style=None, types=[], featureSet=None, classSet=None): if featureSet == None: featureSet = IdSet() if classSet == None: classSet = IdSet(1) else: classSet = classSet ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet) assert( classSet.getId("neg") == 1 or (len(classSet.Ids)== 2 and classSet.getId("neg") == -1) ) # Basic style = trigger_features:typed:directed:no_linear:entities:auto_limits:noMasking:maxFeatures self._setDefaultParameters([ "directed", "undirected", "headsOnly", "graph_kernel", "noAnnType", "mask_nodes", "limit_features", "no_auto_limits", "co_features", "genia_features", "bi_features", #"genia_limits", "epi_limits", "id_limits", "rel_limits", "bb_limits", "bi_limits", "co_limits", "genia_task1", "ontology", "nodalida", "bacteria_renaming", "no_trigger_features", "rel_features", "drugbank_features", "ddi_mtmx", "evex", "giuliano", "random", "themeOnly", "causeOnly", "no_path", "token_nodes", "skip_extra_triggers", "headsOnly", "graph_kernel", "no_task", "no_dependency", "disable_entity_features", "disable_terminus_features", "disable_single_element_features", "disable_ngram_features", "disable_path_edge_features", "linear_features", "subset", "binary", "pos_only", "entity_type", "filter_shortest_path", "maskTypeAsProtein", "keep_neg", "metamap", "sdb_merge", "sdb_features", "ontobiotope_features", "no_self_loops", "full_entities", "no_features", "wordnet", "wordvector", "se10t8_undirected", "filter_types", "doc_extra", "entity_extra"]) self.styles = self.getParameters(style) #if style == None: # no parameters given # style["typed"] = style["directed"] = style["headsOnly"] = True self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet, self.styles) # NOTE Temporarily re-enabling predicted range #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None) if 
self.styles["graph_kernel"]: from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet) if self.styles["noAnnType"]: self.multiEdgeFeatureBuilder.noAnnType = True if self.styles["mask_nodes"]: self.multiEdgeFeatureBuilder.maskNamedEntities = True else: self.multiEdgeFeatureBuilder.maskNamedEntities = False if not self.styles["limit_features"]: self.multiEdgeFeatureBuilder.maximum = True if self.styles["genia_task1"]: self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity") self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet) if self.styles["ontology"]: self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet) if self.styles["ontobiotope_features"]: self.ontobiotopeFeatureBuilder = OntoBiotopeFeatureBuilder(self.featureSet) if self.styles["nodalida"]: self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet) if self.styles["bacteria_renaming"]: self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(self.featureSet) if not self.styles["no_trigger_features"]: self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet, self.styles) self.triggerFeatureBuilder.useNonNameEntities = True if self.styles["noAnnType"]: self.triggerFeatureBuilder.noAnnType = True if self.styles["genia_task1"]: self.triggerFeatureBuilder.filterAnnTypes.add("Entity") #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName) if self.styles["rel_features"]: self.relFeatureBuilder = RELFeatureBuilder(featureSet) if self.styles["drugbank_features"]: self.drugFeatureBuilder = DrugFeatureBuilder(featureSet) if self.styles["evex"]: self.evexFeatureBuilder = EVEXFeatureBuilder(featureSet) if self.styles["wordnet"]: self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet) if self.styles["wordvector"]: self.wordVectorFeatureBuilder = WordVectorFeatureBuilder(featureSet, self.styles) 
if self.styles["giuliano"]: self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet) self.types = types if self.styles["random"]: from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet) def definePredictedValueRange(self, sentences, elementName): self.multiEdgeFeatureBuilder.definePredictedValueRange(sentences, elementName) def getPredictedValueRange(self): return self.multiEdgeFeatureBuilder.predictedRange def filterEdgesByType(self, edges, typesToInclude): if len(typesToInclude) == 0: return edges edgesToKeep = [] for edge in edges: if edge.get("type") in typesToInclude: edgesToKeep.append(edge) return edgesToKeep def getCategoryNameFromTokens(self, sentenceGraph, t1, t2, directed=True): """ Example class. Multiple overlapping edges create a merged type. """ types = set() intEdges = sentenceGraph.interactionGraph.getEdges(t1, t2) if not directed: intEdges = intEdges + sentenceGraph.interactionGraph.getEdges(t2, t1) for intEdge in intEdges: types.add(intEdge[2].get("type")) types = list(types) types.sort() categoryName = "" for name in types: if categoryName != "": categoryName += "---" categoryName += name if categoryName != "": return categoryName else: return "neg" def getCategoryName(self, sentenceGraph, e1, e2, directed=True): """ Example class. Multiple overlapping edges create a merged type. 
""" interactions = sentenceGraph.getInteractions(e1, e2, True) if not directed and not self.styles["se10t8_undirected"]: interactions = interactions + sentenceGraph.getInteractions(e2, e1, True) types = set() for interaction in interactions: types.add(interaction[2].get("type")) types = list(types) types.sort() categoryName = "" for name in types: if self.styles["causeOnly"] and name != "Cause": continue if self.styles["themeOnly"] and name != "Theme": continue if categoryName != "": categoryName += "---" if self.styles["sdb_merge"]: name = self.mergeForSeeDev(name, self.structureAnalyzer) categoryName += name if categoryName != "": return categoryName else: return "neg" def getBISuperType(self, eType): if eType in ["GeneProduct", "Protein", "ProteinFamily", "PolymeraseComplex"]: return "ProteinEntity" elif eType in ["Gene", "GeneFamily", "GeneComplex", "Regulon", "Site", "Promoter"]: return "GeneEntity" else: return None def getSeeDevSuperTypes(self, eType): if eType in ("Gene", "Gene_Family", "Box", "Promoter"): return ("DNA", "Molecule") elif eType == "RNA": return ("RNA", "DNA_Product", "Molecule") elif eType in ("Protein", "Protein_Family", "Protein_Complex", "Protein_Domain"): return ("Amino_acid_sequence", "DNA_Product", "Molecule") elif eType == "Hormone": return ("Molecule",) elif eType in ("Regulatory_Network", "Pathway"): return ("Dynamic_process",) elif eType in ("Genotype", "Tissue", "Development_Phase"): return ("Biological_context", "Context") elif eType == "Environmental_Factor": return ("Context",) else: raise Exception("Unknown SeeDev type '" + str(eType) + "'") def mergeForSeeDev(self, categoryName, structureAnalyzer): if categoryName in structureAnalyzer.typeMap["forward"]: return structureAnalyzer.typeMap["forward"][categoryName] return categoryName # for tag in ("Regulates", "Exists", "Interacts", "Is", "Occurs"): # if categoryName.startswith(tag): # categoryName = tag # break # return categoryName def processCorpus(self, input, output, 
gold=None, append=False, allowNewIds=True, structureAnalyzer=None):
        """
        Run example generation over a corpus (delegates to ExampleBuilder.processCorpus
        after storing the structure analyzer and, for SeeDev merging, precomputing
        the non-overlapping type sets).
        """
        if self.styles["sdb_merge"]:
            structureAnalyzer.determineNonOverlappingTypes()
        self.structureAnalyzer = structureAnalyzer
        ExampleBuilder.processCorpus(self, input, output, gold, append, allowNewIds, structureAnalyzer)

    def isValidInteraction(self, e1, e2, structureAnalyzer, forceUndirected=False):
        """Return True if the corpus structure allows an edge between the types of e1 and e2."""
        return len(structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type"), forceUndirected=forceUndirected)) > 0

    def getGoldCategoryName(self, goldGraph, entityToGold, e1, e2, directed=True):
        """
        Map a predicted entity pair to its gold-standard category. If either entity
        has no gold counterpart the example is negative ("neg").
        """
        if len(entityToGold[e1]) > 0 and len(entityToGold[e2]) > 0:
            return self.getCategoryName(goldGraph, entityToGold[e1][0], entityToGold[e2][0], directed=directed)
        else:
            return "neg"

    def filterEdge(self, edge, edgeTypes):
        """
        Edge predicate used with FloydWarshall: keep only dependency edges whose
        type is in edgeTypes. (Python 2: uses types.ListType/TupleType.)
        """
        import types
        assert edgeTypes != None
        if type(edgeTypes) not in [types.ListType, types.TupleType]:
            edgeTypes = [edgeTypes]
        if edge[2].get("type") in edgeTypes:
            return True
        else:
            return False

    def keepExample(self, e1, e2, categoryName, isDirected, structureAnalyzer):
        """
        Decide whether an example for the (e1, e2) pair should be generated.
        Each rejection reason is recorded in self.exampleStats for reporting.
        """
        makeExample = True
        # reject pairs whose entity types can never form a valid edge in this corpus
        if (not self.styles["no_auto_limits"]) and not self.isValidInteraction(e1, e2, structureAnalyzer, forceUndirected=not isDirected):
            makeExample = False
            self.exampleStats.filter("auto_limits")
        # GENIA task 1 does not use "Entity" (site etc.) annotations
        if self.styles["genia_task1"] and (e1.get("type") == "Entity" or e2.get("type") == "Entity"):
            makeExample = False
            self.exampleStats.filter("genia_task1")
        if self.styles["pos_only"] and categoryName == "neg":
            makeExample = False
            self.exampleStats.filter("pos_only")
        # optionally skip self-loops (same entity, or two entities with the same head token offset)
        if self.styles["no_self_loops"] and ((e1 == e2) or (e1.get("headOffset") == e2.get("headOffset"))):
            makeExample = False
            self.exampleStats.filter("no_self_loops")
        return makeExample

    def getExampleCategoryName(self, e1=None, e2=None, t1=None, t2=None, sentenceGraph=None, goldGraph=None, entityToGold=None, isDirected=True, structureAnalyzer=None):
        """
        Determine the class label for a potential edge, either from token pairs
        (token_nodes mode) or entity pairs, optionally re-mapped through the gold graph.
        """
        if self.styles["token_nodes"]:
            categoryName = self.getCategoryNameFromTokens(sentenceGraph, t1, t2, isDirected)
        else:
            categoryName = self.getCategoryName(sentenceGraph, e1, e2, isDirected)
            if goldGraph != None:
                categoryName = self.getGoldCategoryName(goldGraph, entityToGold, e1, e2, isDirected)
        # optionally force selected categories to negative
        if self.styles["filter_types"] != None and categoryName in self.styles["filter_types"]:
            categoryName = "neg"
        if self.styles["se10t8_undirected"]:
            # SemEval 2010 Task 8: sanity-check the fixed e1/e2 id suffixes
            assert e1.get("id").endswith(".e1")
            assert e2.get("id").endswith(".e2")
        #if self.styles["sdb_merge"]:
        #    categoryName = self.mergeForSeeDev(categoryName, structureAnalyzer)
        return categoryName

    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph = None, structureAnalyzer=None):
        """
        Build examples for a single sentence. Returns a list of examples.
        See Core/ExampleUtils for example format.
        """
        #examples = []
        exampleIndex = 0
        # example directionality
        # NOTE(review): if both "directed" and "undirected" styles are explicitly False,
        # examplesAreDirected is never assigned and the later use would raise — confirm
        # the style parser guarantees at least one of the three branches is taken.
        if self.styles["directed"] == None and self.styles["undirected"] == None: # determine directedness from corpus
            examplesAreDirected = structureAnalyzer.hasDirectedTargets() if structureAnalyzer != None else True
        elif self.styles["directed"]:
            assert self.styles["undirected"] in [None, False]
            examplesAreDirected = True
        elif self.styles["undirected"]:
            assert self.styles["directed"] in [None, False]
            examplesAreDirected = False
        if not self.styles["no_trigger_features"]:
            self.triggerFeatureBuilder.initSentence(sentenceGraph)
        if self.styles["evex"]:
            self.evexFeatureBuilder.initSentence(sentenceGraph)
        #if self.styles["sdb_merge"]:
        #    self.determineNonOverlappingTypes(structureAnalyzer)

        # Filter entities, if needed
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        entityToDuplicates = sentenceGraph.mergedEntityToDuplicates # unused here; duplicates are read via sentenceGraph in buildExample
        self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))

        # Connect to optional gold graph
        entityToGold = None
        if goldGraph != None:
            entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)

        paths = None
        if not self.styles["no_path"]:
            undirected = sentenceGraph.dependencyGraph.toUndirected()
            paths = undirected
            if self.styles["filter_shortest_path"] != None: # For DDI use filter_shortest_path=conj_and
                paths.resetAnalyses() # just in case
                paths.FloydWarshall(self.filterEdge, {"edgeTypes":self.styles["filter_shortest_path"]})

        # Generate examples based on interactions between entities or interactions between tokens
        if self.styles["token_nodes"]:
            loopRange = len(sentenceGraph.tokens)
        else:
            loopRange = len(entities)
        # iterate over all unordered pairs (i < j)
        for i in range(loopRange-1):
            for j in range(i+1,loopRange):
                eI = None
                eJ = None
                if self.styles["token_nodes"]:
                    tI = sentenceGraph.tokens[i]
                    tJ = sentenceGraph.tokens[j]
                else:
                    eI = entities[i]
                    eJ = entities[j]
                    tI = sentenceGraph.entityHeadTokenByEntity[eI]
                    tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                    if eI.get("type") == "neg" or eJ.get("type") == "neg":
                        continue
                    if self.styles["skip_extra_triggers"]:
                        if eI.get("source") != None or eJ.get("source") != None:
                            continue
                # only consider paths between entities (NOTE! entities, not only named entities)
                if self.styles["headsOnly"]:
                    if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                        continue
                examples = self.buildExamplesForPair(tI, tJ, paths, sentenceGraph, goldGraph, entityToGold, eI, eJ, structureAnalyzer, examplesAreDirected)
                for categoryName, features, extra in examples:
                    # make example
                    if self.styles["binary"]:
                        if categoryName != "neg":
                            category = 1
                        else:
                            category = -1
                        extra["categoryName"] = "i"
                    else:
                        category = self.classSet.getId(categoryName)
                    example = [sentenceGraph.getSentenceId()+".x"+str(exampleIndex), category, features, extra]
                    ExampleUtils.appendExamples([example], outfile)
                    exampleIndex += 1
        return exampleIndex

    def buildExamplesForPair(self, token1, token2, paths, sentenceGraph, goldGraph, entityToGold, entity1=None, entity2=None, structureAnalyzer=None, isDirected=True):
        """
        Build the example(s) for one entity (or token) pair. For directed corpora a
        forward and a reverse example are built separately; for undirected corpora
        the reverse features are merged into the single forward example.
        Returns a list of 0-2 (categoryName, features, extra) tuples.
        """
        # define forward
        categoryName = self.getExampleCategoryName(entity1, entity2, token1, token2, sentenceGraph, goldGraph, entityToGold, isDirected, structureAnalyzer=structureAnalyzer)
        # make forward
        forwardExample = None
        self.exampleStats.beginExample(categoryName)
        if self.keepExample(entity1, entity2, categoryName, isDirected, structureAnalyzer):
            forwardExample = self.buildExample(token1, token2, paths, sentenceGraph, categoryName, entity1, entity2, structureAnalyzer, isDirected)
        if isDirected: # build a separate reverse example (if that is valid)
            self.exampleStats.endExample() # end forward example
            # define reverse
            categoryName = self.getExampleCategoryName(entity2, entity1, token2, token1, sentenceGraph, goldGraph, entityToGold, True, structureAnalyzer=structureAnalyzer)
            # make reverse
            self.exampleStats.beginExample(categoryName)
            reverseExample = None
            if self.keepExample(entity2, entity1, categoryName, True, structureAnalyzer):
                reverseExample = self.buildExample(token2, token1, paths, sentenceGraph, categoryName, entity2, entity1, structureAnalyzer, isDirected)
            self.exampleStats.endExample()
            return filter(None, [forwardExample, reverseExample]) # drop the Nones (Python 2 filter returns a list)
        elif self.styles["se10t8_undirected"]: # undirected example with a directed type
            self.exampleStats.endExample()
            return [forwardExample]
        elif forwardExample != None: # merge features from the reverse example to the forward one
            reverseExample = self.buildExample(token2, token1, paths, sentenceGraph, categoryName, entity2, entity1, structureAnalyzer, isDirected)
            forwardExample[1].update(reverseExample[1]) # index 1 is the feature dict
            self.exampleStats.endExample() # end merged example
            return [forwardExample]
        else: # undirected example that was filtered
            self.exampleStats.endExample() # end merged example
            return []

    def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, entity1=None, entity2=None, structureAnalyzer=None, isDirected=True):
        """
        Build a single directed example for the potential edge between token1 and token2
        """
        # define features
        if not self.styles["no_path"]:
            path = paths.getPaths(token1, token2)
            if len(path) > 0:
                path = path[0] # use the first shortest path
                #pathExists = True
            else:
                path = [token1, token2] # no path found: fall back to the token pair itself
                #pathExists = False
        else:
            path = [token1, token2]
            #pathExists = False
        features = {}
        if not self.styles["no_features"]:
            features = self.buildFeatures(sentenceGraph, entity1, entity2, token1, token2, path)
        # define extra attributes; t1/t2 are stored in linear order, "deprev" marks a reversed path
        if int(path[0].get("charOffset").split("-")[0]) < int(path[-1].get("charOffset").split("-")[0]):
            extra = {"xtype":"edge","type":"i","t1":path[0].get("id"),"t2":path[-1].get("id")}
            extra["deprev"] = False
        else:
            extra = {"xtype":"edge","type":"i","t1":path[-1].get("id"),"t2":path[0].get("id")}
            extra["deprev"] = True
        if entity1 != None:
            extra["e1"] = entity1.get("id")
            if sentenceGraph.mergedEntityToDuplicates != None:
                extra["e1DuplicateIds"] = ",".join([x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity1]])
        if entity2 != None:
            extra["e2"] = entity2.get("id")
            if sentenceGraph.mergedEntityToDuplicates != None:
                extra["e2DuplicateIds"] = ",".join([x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity2]])
        extra["categoryName"] = categoryName
        if self.styles["bacteria_renaming"]:
            # entity surface forms are stored with spaces/colons escaped for the example format
            if entity1.get("text") != None and entity1.get("text") != "":
                extra["e1t"] = entity1.get("text").replace(" ", "---").replace(":","-COL-")
            if entity2.get("text") != None and entity2.get("text") != "":
                extra["e2t"] = entity2.get("text").replace(" ", "---").replace(":","-COL-")
        if self.styles["doc_extra"]:
            if hasattr(sentenceGraph, "documentElement") and sentenceGraph.documentElement.get("origId") != None:
                extra["DOID"] = sentenceGraph.documentElement.get("origId")
        if self.styles["entity_extra"]:
            if entity1.get("origId") != None:
                extra["e1OID"] = entity1.get("origId")
            if entity2.get("origId") != None:
                extra["e2OID"] = entity2.get("origId")
        sentenceOrigId = sentenceGraph.sentenceElement.get("origId")
        if sentenceOrigId != None:
            extra["SOID"] = sentenceOrigId
        extra["directed"] = str(isDirected)
        if self.styles["sdb_merge"]:
            extra["sdb_merge"] = "True"
        #print extra
        return (categoryName, features, extra)

    def buildFeatures(self, sentenceGraph, entity1, entity2, token1, token2, path):
        """
        Build the feature dictionary for one candidate edge. Each style flag enables
        an optional feature builder; builders write into the shared 'features' dict
        via setFeatureVector(features) ... setFeatureVector(None).
        """
        features = {}
        if not self.styles["no_trigger_features"]: # F 85.52 -> 85.55
            self.triggerFeatureBuilder.setFeatureVector(features)
            self.triggerFeatureBuilder.tag = "trg1_"
            self.triggerFeatureBuilder.buildFeatures(token1)
            self.triggerFeatureBuilder.tag = "trg2_"
            self.triggerFeatureBuilder.buildFeatures(token2)
            self.triggerFeatureBuilder.setFeatureVector(None)
        # REL features
        if self.styles["rel_features"] and not self.styles["no_task"]:
            self.relFeatureBuilder.setFeatureVector(features)
            self.relFeatureBuilder.tag = "rel1_"
            self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token1))
            self.relFeatureBuilder.tag = "rel2_"
            self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token2))
            self.relFeatureBuilder.setFeatureVector(None)
        if self.styles["bacteria_renaming"] and not self.styles["no_task"]:
            self.bacteriaRenamingFeatureBuilder.setFeatureVector(features)
            self.bacteriaRenamingFeatureBuilder.buildPairFeatures(entity1, entity2)
            #self.bacteriaRenamingFeatureBuilder.buildSubstringFeatures(entity1, entity2) # decreases perf. 74.76 -> 72.41
            self.bacteriaRenamingFeatureBuilder.setFeatureVector(None)
        if self.styles["co_features"] and not self.styles["no_task"]:
            # coreference task: containment of one entity's span in the other's
            e1Offset = Range.charOffsetToSingleTuple(entity1.get("charOffset"))
            e2Offset = Range.charOffsetToSingleTuple(entity2.get("charOffset"))
            if Range.contains(e1Offset, e2Offset):
                features[self.featureSet.getId("e1_contains_e2")] = 1
                if entity2.get("given") == "True":
                    features[self.featureSet.getId("e1_contains_e2name")] = 1
            if Range.contains(e2Offset, e1Offset):
                features[self.featureSet.getId("e2_contains_e1")] = 1
                if entity1.get("given") == "True":
                    features[self.featureSet.getId("e2_contains_e1name")] = 1
        if self.styles["drugbank_features"]:
            self.drugFeatureBuilder.setFeatureVector(features)
            self.drugFeatureBuilder.tag = "ddi_"
            self.drugFeatureBuilder.buildPairFeatures(entity1, entity2)
            if self.styles["ddi_mtmx"]:
                self.drugFeatureBuilder.buildMTMXFeatures(entity1, entity2)
            self.drugFeatureBuilder.setFeatureVector(None)
        if self.styles["graph_kernel"]:
            self.graphKernelFeatureBuilder.setFeatureVector(features, entity1, entity2)
            self.graphKernelFeatureBuilder.buildGraphKernelFeatures(sentenceGraph, path)
            self.graphKernelFeatureBuilder.setFeatureVector(None)
        if self.styles["entity_type"]:
            e1Type = self.multiEdgeFeatureBuilder.getEntityType(entity1)
            e2Type = self.multiEdgeFeatureBuilder.getEntityType(entity2)
            features[self.featureSet.getId("e1_"+e1Type)] = 1
            features[self.featureSet.getId("e2_"+e2Type)] = 1
            features[self.featureSet.getId("distance_"+str(len(path)))] = 1
        if not self.styles["no_dependency"]:
            #print "Dep features"
            self.multiEdgeFeatureBuilder.setFeatureVector(features, entity1, entity2)
            #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast
            if not self.styles["disable_entity_features"]:
                self.multiEdgeFeatureBuilder.buildEntityFeatures(sentenceGraph)
            self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path)
            if not self.styles["disable_terminus_features"]:
                self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures(path, sentenceGraph) # remove for fast
            if not self.styles["disable_single_element_features"]:
                self.multiEdgeFeatureBuilder.buildSingleElementFeatures(path, sentenceGraph)
            if not self.styles["disable_ngram_features"]:
                #print "NGrams"
                self.multiEdgeFeatureBuilder.buildPathGrams(2, path, sentenceGraph) # remove for fast
                self.multiEdgeFeatureBuilder.buildPathGrams(3, path, sentenceGraph) # remove for fast
                self.multiEdgeFeatureBuilder.buildPathGrams(4, path, sentenceGraph) # remove for fast
            #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast
            #if edges != None:
            #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast
            #    self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast
            if not self.styles["disable_path_edge_features"]:
                self.multiEdgeFeatureBuilder.buildPathEdgeFeatures(path, sentenceGraph)
            self.multiEdgeFeatureBuilder.buildSentenceFeatures(sentenceGraph)
            self.multiEdgeFeatureBuilder.setFeatureVector(None)
        if self.styles["nodalida"]:
            self.nodalidaFeatureBuilder.setFeatureVector(features, entity1, entity2)
            shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths(sentenceGraph.dependencyGraph, path)
            print shortestPaths
            if len(shortestPaths) > 0:
                self.nodalidaFeatureBuilder.buildNGrams(shortestPaths, sentenceGraph)
            self.nodalidaFeatureBuilder.setFeatureVector(None)
        if self.styles["linear_features"]:
            self.tokenFeatureBuilder.setFeatureVector(features)
            # locate the pair's indices in the token sequence
            for i in range(len(sentenceGraph.tokens)):
                if sentenceGraph.tokens[i] == token1:
                    token1Index = i
                if sentenceGraph.tokens[i] == token2:
                    token2Index = i
            linearPreTag = "linfw_"
            if token1Index > token2Index:
                token1Index, token2Index = token2Index, token1Index
                linearPreTag = "linrv_"
            self.tokenFeatureBuilder.buildLinearOrderFeatures(token1Index, sentenceGraph, 2, 2, preTag="linTok1")
            self.tokenFeatureBuilder.buildLinearOrderFeatures(token2Index, sentenceGraph, 2, 2, preTag="linTok2")
            # Before, middle, after
            #self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf")
            #self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw")
            #self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af")
            # before-middle, middle, middle-after
            #self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2)
            #self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2)
            #self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2)
            self.tokenFeatureBuilder.setFeatureVector(None)
        if self.styles["random"]:
            self.randomFeatureBuilder.setFeatureVector(features)
            self.randomFeatureBuilder.buildRandomFeatures(100, 0.01)
            self.randomFeatureBuilder.setFeatureVector(None)
        if self.styles["genia_features"] and not self.styles["no_task"]:
            e1Type = entity1.get("type")
            e2Type = entity2.get("type")
            assert(entity1.get("given") in (None, "False"))
            if entity2.get("given") == "True":
                features[self.featureSet.getId("GENIA_target_protein")] = 1
            else:
                features[self.featureSet.getId("GENIA_nested_event")] = 1
            if e1Type.find("egulation") != -1: # leave r out to avoid problems with capitalization
                if entity2.get("given") == "True":
                    features[self.featureSet.getId("GENIA_regulation_of_protein")] = 1
                else:
                    features[self.featureSet.getId("GENIA_regulation_of_event")] = 1
        if self.styles["bi_features"]:
            # Make features based on entity types
            e1Type = entity1.get("type")
            e2Type = entity2.get("type")
            e1SuperType = str(self.getBISuperType(e1Type))
            e2SuperType = str(self.getBISuperType(e2Type))
            features[self.featureSet.getId("BI_e1_"+e1Type)] = 1
            features[self.featureSet.getId("BI_e2_"+e2Type)] = 1
            features[self.featureSet.getId("BI_e1sup_"+e1SuperType)] = 1
            features[self.featureSet.getId("BI_e2sup_"+e2SuperType)] = 1
            features[self.featureSet.getId("BI_e1e2_"+e1Type+"_"+e2Type)] = 1
            features[self.featureSet.getId("BI_e1e2sup_"+e1SuperType+"_"+e2SuperType)] = 1
        if self.styles["sdb_features"]:
            e1Type = entity1.get("type")
            e2Type = entity2.get("type")
            features[self.featureSet.getId("SDB_e1_"+e1Type)] = 1
            features[self.featureSet.getId("SDB_e2_"+e2Type)] = 1
            features[self.featureSet.getId("SDB_e1e2_"+e1Type+"_"+e2Type)] = 1
            if e1Type == e2Type:
                features[self.featureSet.getId("SDB_e1e2_equal")] = 1
                features[self.featureSet.getId("SDB_e1e2_equal_" + e1Type)] = 1
            # NOTE(review): str() turns the supertype collection into a single string,
            # so the loops below iterate over its CHARACTERS, not over supertype names.
            # Looks like a bug (compare the bi_features branch) — confirm intent before changing.
            e1SuperTypes = str(self.getSeeDevSuperTypes(e1Type))
            e2SuperTypes = str(self.getSeeDevSuperTypes(e2Type))
            for e1SuperType in e1SuperTypes:
                for e2SuperType in e2SuperTypes:
                    features[self.featureSet.getId("SDB_e1sup_"+e1SuperType)] = 1
                    features[self.featureSet.getId("SDB_e2sup_"+e2SuperType)] = 1
                    features[self.featureSet.getId("SDB_e1e2sup_"+e1SuperType+"_"+e2SuperType)] = 1
                    if e1SuperType == e2SuperType:
                        features[self.featureSet.getId("SDB_e1e2sup_equal")] = 1
                        features[self.featureSet.getId("SDB_e1e2sup_equal_" + e1SuperType)] = 1
        if self.styles["ontobiotope_features"]:
            self.ontobiotopeFeatureBuilder.setFeatureVector(features)
            self.ontobiotopeFeatureBuilder.buildOBOFeaturesForEntityPair(entity1, entity2)
            self.ontobiotopeFeatureBuilder.setFeatureVector(None)
        if self.styles["full_entities"]:
            # features from the full (possibly multi-word) entity surface strings
            e1Text = entity1.get("text").lower()
            e2Text = entity2.get("text").lower()
            features[self.featureSet.getId("FULL_e1_"+e1Text)] = 1
            features[self.featureSet.getId("FULL_e2_"+e2Text)] = 1
            for ep1 in e1Text.split():
                for ep2 in e2Text.split():
                    features[self.featureSet.getId("FULL_e1_"+ep1)] = 1
                    features[self.featureSet.getId("FULL_e2_"+ep2)] = 1
                    features[self.featureSet.getId("FULL_e1e2_"+ep1+"_"+ep2)] = 1
        if self.styles["evex"]:
            self.evexFeatureBuilder.setFeatureVector(features, entity1, entity2)
            self.evexFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph)
            self.evexFeatureBuilder.setFeatureVector(None)
        if self.styles["wordnet"]:
            self.wordNetFeatureBuilder.setFeatureVector(features, entity1, entity2)
            self.wordNetFeatureBuilder.buildFeaturesForEntityPair(token1, token2)
            self.wordNetFeatureBuilder.buildLinearFeatures(token1, sentenceGraph.tokens, tag="t1_")
            self.wordNetFeatureBuilder.buildLinearFeatures(token2, sentenceGraph.tokens, tag="t2_")
            self.wordNetFeatureBuilder.buildPathFeatures(path)
            self.wordNetFeatureBuilder.setFeatureVector(None)
        if self.styles["wordvector"]:
            self.wordVectorFeatureBuilder.setFeatureVector(features, entity1, entity2)
            self.wordVectorFeatureBuilder.buildFeatures(token1, "t1_")
            self.wordVectorFeatureBuilder.buildFeatures(token2, "t2_")
            self.wordVectorFeatureBuilder.buildLinearFeatures(token1, sentenceGraph.tokens, tag="t1_")
            self.wordVectorFeatureBuilder.buildLinearFeatures(token2, sentenceGraph.tokens, tag="t2_")
            self.wordVectorFeatureBuilder.buildPathFeatures(path)
            self.wordVectorFeatureBuilder.buildFBAFeatures(sentenceGraph.tokens, sentenceGraph.tokens.index(token1), sentenceGraph.tokens.index(token2))
            self.wordVectorFeatureBuilder.setFeatureVector(None)
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder.setFeatureVector(features, entity1, entity2)
            self.giulianoFeatureBuilder.buildEdgeFeatures(entity1, entity2, token1, token2, path, sentenceGraph)
            self.giulianoFeatureBuilder.setFeatureVector(None)
        return features
class EntityExampleBuilder(ExampleBuilder):
    def __init__(self, style=None, classSet=None, featureSet=None, gazetteerFileName=None, skiplist=None):
        """
        Example builder for entity/trigger detection: one example per token.
        gazetteerFileName - optional trigger gazetteer file
        skiplist - optional file of sentence origIds to skip, one per line
        """
        if classSet == None:
            classSet = IdSet(1)
        if featureSet == None:
            featureSet = IdSet()
        ExampleBuilder.__init__(self, classSet, featureSet)
        assert (classSet.getId("neg") == 1) # the negative class must always have id 1
        #gazetteerFileName="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest/gazetteer-train"
        if gazetteerFileName != None:
            self.gazetteer = Gazetteer.loadGztr(gazetteerFileName)
            print >> sys.stderr, "Loaded gazetteer from", gazetteerFileName
        else:
            print >> sys.stderr, "No gazetteer loaded"
            self.gazetteer = None
        self._setDefaultParameters([
            "rel_features", "wordnet", "bb_features", "giuliano",
            "epi_merge_negated", "limit_merged_types", "genia_task1", "names",
            "build_for_nameless", "skip_for_nameless", "pos_only", "all_tokens",
            "pos_pairs", "linear_ngrams", "phospho", "drugbank_features",
            "ddi13_features", "metamap", "only_types", "ontobiotope_features",
            "bb_spans", "wordvector", "no_context" ])
        self.styles = self.getParameters(style)
        #        if "selftrain_group" in self.styles:
        #            self.selfTrainGroups = set()
        #            if "selftrain_group-1" in self.styles:
        #                self.selfTrainGroups.add("-1")
        #            if "selftrain_group0" in self.styles:
        #                self.selfTrainGroups.add("0")
        #            if "selftrain_group1" in self.styles:
        #                self.selfTrainGroups.add("1")
        #            if "selftrain_group2" in self.styles:
        #                self.selfTrainGroups.add("2")
        #            if "selftrain_group3" in self.styles:
        #                self.selfTrainGroups.add("3")
        #            print >> sys.stderr, "Self-train-groups:", self.selfTrainGroups
        # sentence origIds listed in the skiplist file are ignored during example generation
        self.skiplist = set()
        if skiplist != None:
            f = open(skiplist, "rt")
            for line in f.readlines():
                self.skiplist.add(line.strip())
            f.close()
        # optional feature builders, instantiated only when their style flag is on
        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(featureSet)
        if self.styles["wordnet"]:
            self.wordNetFeatureBuilder = WordNetFeatureBuilder(featureSet)
        if self.styles["bb_features"]:
            self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens( )
            #self.bacteriaTokens = PhraseTriggerExampleBuilder.getBacteriaTokens(PhraseTriggerExampleBuilder.getBacteriaNames())
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
        if self.styles["drugbank_features"]:
            self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
        if self.styles["ontobiotope_features"]:
            self.ontobiotopeFeatureBuilder = OntoBiotopeFeatureBuilder( self.featureSet)
        if self.styles["wordvector"]:
            self.wordVectorFeatureBuilder = WordVectorFeatureBuilder( featureSet, self.styles)

    def getMergedEntityType(self, entities):
        """
        If a single token belongs to multiple entities of different types,
        a new, composite type is defined. This type is the alphabetically
        ordered types of these entities joined with '---'.
        Returns (typeString, idString) where idString joins the contributing
        entity ids with '/', or ("neg", None) when no type remains.
        """
        types = set()
        entityIds = set()
        limitTypes = self.styles.get("only_types")
        for entity in entities:
            eType = entity.get("type")
            if entity.get("given") == "True" and self.styles["all_tokens"]:
                continue
            if eType == "Entity" and self.styles["genia_task1"]:
                continue
            if limitTypes and eType not in limitTypes:
                continue
            if self.styles["epi_merge_negated"]:
                # EPI: merge the negated trigger variants into their base type
                types.add( Utils.InteractionXML.ResolveEPITriggerTypes.getEPIBaseType( eType))
                entityIds.add(entity.get("id"))
            else:
                types.add(eType)
                entityIds.add(entity.get("id"))
        types = list(types)
        types.sort()
        typeString = ""
        for type in types:
            #if type == "Protein" and "all_tokens" in self.styles:
            #    continue
            if typeString != "":
                typeString += "---"
            typeString += type
        if typeString == "":
            return "neg", None
        idString = "/".join(sorted(list(entityIds)))
        if self.styles["limit_merged_types"]:
            if typeString.find("---") != -1:
                if typeString == "Gene_expression---Positive_regulation": # the only merged type kept as-is
                    return typeString, idString
                else:
                    return typeString.split( "---")[0], idString # ids partially incorrect
            else:
                return typeString, idString
        return typeString, idString

    def getMetaMapFeatures(self, token, sentenceGraph, features):
        """
        Add MetaMap phrase features for phrases overlapping the token. Reads the
        optional <analyses>/<metamap> element of the sentence; a no-op when absent.
        Phrase attributes become binary features, except "score" which is scaled
        to a small weight.
        """
        analyses = sentenceGraph.sentenceElement.find("analyses")
        if analyses == None:
            return
        metamap = analyses.find("metamap")
        if metamap == None:
            return
        tokenOffset = Range.charOffsetToSingleTuple(token.get("charOffset"))
        skipAttr = set(["charOffset", "text"])
        for phrase in metamap.findall("phrase"):
            phraseOffset = Range.charOffsetToSingleTuple( phrase.get("charOffset"))
            if Range.overlap(tokenOffset, phraseOffset):
                attr = phrase.attrib
                attrNames = sorted(attr.keys())
                for attrName in attrNames:
                    if attrName in skipAttr:
                        continue
                    elif attrName == "score":
                        features["_metamap_score"] = 0.001 * abs( int(attr[attrName]))
                    else:
                        attrValues = attr[attrName].split(",")
                        for attrValue in attrValues:
                            features["_metamap_" + attrName + "_" + attrValue.replace(" ", "-")] = 1

    def getTokenFeatures(self, token, sentenceGraph):
        """
        Returns a list of features based on the attributes of a token.
        These can be used to define more complex features.
        """
        # These features are cached when this method is first called
        # for a token.
        if self.tokenFeatures.has_key(token):
            return self.tokenFeatures[token], self.tokenFeatureWeights[token]
        tokTxt = sentenceGraph.getTokenText(token)
        features = {}
        features["_txt_" + tokTxt] = 1
        features["_POS_" + token.get("POS")] = 1
        if sentenceGraph.tokenIsName[token] and not self.styles["names"]:
            features["_given"] = 1
        for entity in sentenceGraph.tokenIsEntityHead[token]:
            if entity.get("given") == "True":
                features["_annType_" + entity.get("type")] = 1
        if self.styles["metamap"]:
            self.getMetaMapFeatures(token, sentenceGraph, features)
        #        # Filip's gazetteer based features (can be used separately from exclude_gazetteer)
        #        if "gazetteer_features" in self.styles:
        #            tokTxtLower = tokTxt.lower()
        #            if "stem_gazetteer" in self.styles:
        #                tokTxtLower = PorterStemmer.stem(tokTxtLower)
        #            if self.gazetteer and tokTxtLower in self.gazetteer:
        #                for label,weight in self.gazetteer[tokTxtLower].items():
        #                    features["_knownLabel_"+label]=weight # 1 performs slightly worse
        ## BANNER features
        #if sentenceGraph.entityHintsByToken.has_key(token):
        #    features["BANNER-entity"] = 1
        # Wordnet features
        #if "wordnet" in self.styles:
        #    for wordNetFeature in self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, token.get("POS")):
        #        features["_WN_"+wordNetFeature] = 1
        self.tokenFeatures[token] = sorted(features.keys())
        self.tokenFeatureWeights[token] = features
        return self.tokenFeatures[token], self.tokenFeatureWeights[token]

    def buildLinearOrderFeatures(self, sentenceGraph, index, tag, features):
        """
        Linear features are built by marking token features with a tag
        that defines their relative position in the linear order.
        """
        tag = "linear_" + tag
        tokenFeatures, tokenFeatureWeights = self.getTokenFeatures( sentenceGraph.tokens[index], sentenceGraph)
        for tokenFeature in tokenFeatures:
            features[self.featureSet.getId( tag + tokenFeature)] = tokenFeatureWeights[tokenFeature]

    def buildLinearNGram(self, i, j, sentenceGraph, features):
        """Add one n-gram feature for the lowercased token texts in positions i..j (inclusive)."""
        ngram = "ngram"
        for index in range(i, j + 1):
            ngram += "_" + sentenceGraph.getTokenText( sentenceGraph.tokens[index]).lower()
        features[self.featureSet.getId(ngram)] = 1

    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
        """
        Build one example for each token of the sentence
        """
        if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
            print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get( "origId")
            return 0 #[]
        #examples = []
        exampleIndex = 0
        # per-sentence caches for getTokenFeatures
        self.tokenFeatures = {}
        self.tokenFeatureWeights = {}
        # determine (manually or automatically) the setting for whether sentences with no given entities should be skipped
        buildForNameless = False
        if structureAnalyzer and not structureAnalyzer.hasGroupClass( "GIVEN", "ENTITY" ): # no given entities points to no separate NER program being used
            buildForNameless = True
        if self.styles["build_for_nameless"]: # manually force the setting
            buildForNameless = True
        if self.styles["skip_for_nameless"]: # manually force the setting
            buildForNameless = False
        # determine whether sentences with no given entities should be skipped
namedEntityHeadTokens = [] if not self.styles["names"]: namedEntityCount = 0 for entity in sentenceGraph.entities: if entity.get( "given" ) == "True": # known data which can be used for features namedEntityCount += 1 namedEntityCountFeature = "nameCount_" + str(namedEntityCount) # NOTE!!! This will change the number of examples and omit # all triggers (positive and negative) from sentences which # have no NE:s, possibly giving a too-optimistic performance # value. Such sentences can still have triggers from intersentence # interactions, but as such events cannot be recovered anyway, # looking for these triggers would be pointless. if namedEntityCount == 0 and not buildForNameless: # no names, no need for triggers return 0 #[] if self.styles["pos_pairs"]: namedEntityHeadTokens = self.getNamedEntityHeadTokens( sentenceGraph) else: for key in sentenceGraph.tokenIsName.keys(): sentenceGraph.tokenIsName[key] = False bagOfWords = {} for token in sentenceGraph.tokens: text = "bow_" + token.get("text") if not bagOfWords.has_key(text): bagOfWords[text] = 0 bagOfWords[text] += 1 if sentenceGraph.tokenIsName[token]: text = "ne_" + text if not bagOfWords.has_key(text): bagOfWords[text] = 0 bagOfWords[text] += 1 bowFeatures = {} for k in sorted(bagOfWords.keys()): bowFeatures[self.featureSet.getId(k)] = bagOfWords[k] self.inEdgesByToken = {} self.outEdgesByToken = {} self.edgeSetByToken = {} for token in sentenceGraph.tokens: #inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True) #fixedInEdges = [] #for edge in inEdges: # fixedInEdges.append( (edge[0], edge[1], edge[2]["element"]) ) #inEdges = fixedInEdges inEdges = sentenceGraph.dependencyGraph.getInEdges(token) #inEdges.sort(compareDependencyEdgesById) self.inEdgesByToken[token] = inEdges #outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True) #fixedOutEdges = [] #for edge in outEdges: # fixedOutEdges.append( (edge[0], edge[1], edge[2]["element"]) ) #outEdges = fixedOutEdges outEdges = 
sentenceGraph.dependencyGraph.getOutEdges(token) #outEdges.sort(compareDependencyEdgesById) self.outEdgesByToken[token] = outEdges self.edgeSetByToken[token] = set(inEdges + outEdges) for i in range(len(sentenceGraph.tokens)): token = sentenceGraph.tokens[i] # CLASS if len(sentenceGraph.tokenIsEntityHead[token]) > 0: categoryName, entityIds = self.getMergedEntityType( sentenceGraph.tokenIsEntityHead[token]) else: categoryName, entityIds = "neg", None self.exampleStats.beginExample(categoryName) # Recognize only non-named entities (i.e. interaction words) if sentenceGraph.tokenIsName[token] and not self.styles[ "names"] and not self.styles["all_tokens"]: self.exampleStats.filter("name") self.exampleStats.endExample() continue # if "selftrain_limits" in self.styles: # # any predicted entity not part of the self-training set causes example to be rejected # filtered = False # for entity in sentenceGraph.tokenIsEntityHead[token]: # if entity.get("selftrain") == "False": # self.exampleStats.filter("selftrain_limits") # self.exampleStats.endExample() # filtered = True # break # if filtered: # continue # if "selftrain_group" in self.styles: # # any predicted entity not part of the self-training set causes example to be rejected # filtered = False # for entity in sentenceGraph.tokenIsEntityHead[token]: # if entity.get("selftraingroup") not in self.selfTrainGroups: # self.exampleStats.filter("selftrain_group") # self.exampleStats.endExample() # filtered = True # break # if filtered: # continue if self.styles["pos_only"] and categoryName == "neg": self.exampleStats.filter("pos_only") self.exampleStats.endExample() continue category = self.classSet.getId(categoryName) if category == None: self.exampleStats.filter("undefined_class") self.exampleStats.endExample() continue tokenText = token.get("text").lower() # if "stem_gazetteer" in self.styles: # tokenText = PorterStemmer.stem(tokenText) # if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in 
self.gazetteer:
            # NOTE(review): the line above is the tail of a conditional that begins
            # before this chunk; the block below it was already commented out.
            #    features = {}
            #    features[self.featureSet.getId("exclude_gazetteer")] = 1
            #    extra = {"xtype":"token","t":token.get("id"),"excluded":"True"}
            #    if entityIds != None:
            #        extra["goldIds"] = entityIds
            #    #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
            #    ExampleUtils.appendExamples([(sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)], outfile)
            #    exampleIndex += 1
            #    continue
            # FEATURES
            features = {}
            if not self.styles["names"]:
                # count of named entities in the sentence (pre-computed by the caller)
                features[self.featureSet.getId(namedEntityCountFeature)] = 1
            #for k,v in bagOfWords.iteritems():
            #    features[self.featureSet.getId(k)] = v
            # pre-calculate bow _features_
            features.update(bowFeatures)
            #for j in range(len(sentenceGraph.tokens)):
            #    text = "bow_" + sentenceGraph.tokens[j].get("text")
            #    if j < i:
            #        features[self.featureSet.getId("bf_" + text)] = 1
            #    elif j > i:
            #        features[self.featureSet.getId("af_" + text)] = 1
            # Main features: surface form, POS, Porter stem and the non-stem suffix
            text = token.get("text")
            features[self.featureSet.getId("txt_" + text)] = 1
            features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
            stem = PorterStemmer.stem(text)
            features[self.featureSet.getId("stem_" + stem)] = 1
            features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1
            # Normalized versions of the string (if same as non-normalized, overlap without effect)
            normalizedText = text.replace("-", "").replace("/", "").replace(",", "").replace("\\", "").replace(" ", "").lower()
            if normalizedText == "bound":  # should be for all irregular verbs
                normalizedText = "bind"
            features[self.featureSet.getId("txt_" + normalizedText)] = 1
            norStem = PorterStemmer.stem(normalizedText)
            features[self.featureSet.getId("stem_" + norStem)] = 1
            features[self.featureSet.getId("nonstem_" + normalizedText[len(norStem):])] = 1
            ## Subspan features
            #textLower = text.lower()
            #for i in range(1, len(textLower)):
            #    features[self.featureSet.getId("subspanbegin"+str(i)+"_"+textLower[0:i])] = 1
            #    features[self.featureSet.getId("subspanend"+str(i)+"_"+textLower[-i:])] = 1
            # Substring features: hyphen-separated pieces of the token text
            for string in text.split("-"):
                stringLower = string.lower()
                features[self.featureSet.getId("substring_" + stringLower)] = 1
                features[self.featureSet.getId("substringstem_" + PorterStemmer.stem(stringLower))] = 1
            if not self.styles["no_context"]:
                # Linear order features: tokens within a +/-3 window of position i
                for index in [-3, -2, -1, 1, 2, 3]:
                    if i + index > 0 and i + index < len(sentenceGraph.tokens):
                        self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)
                # Linear n-grams
                if self.styles["linear_ngrams"]:
                    self.buildLinearNGram(max(0, i - 1), i, sentenceGraph, features)
                    self.buildLinearNGram(max(0, i - 2), i, sentenceGraph, features)
            if self.styles["phospho"]:
                # phosphorylation-specific cues ("hospho" matches both cases of the P)
                if text.find("hospho") != -1:
                    features[self.featureSet.getId("phospho_found")] = 1
                features[self.featureSet.getId("begin_" + text[0:2].lower())] = 1
                features[self.featureSet.getId("begin_" + text[0:3].lower())] = 1
            if self.styles["bb_features"]:
                # Bacteria Biotope: token appears in the LPSN bacteria name list
                if text.lower() in self.bacteriaTokens:
                    features[self.featureSet.getId("lpsnBacToken")] = 1
            # Content: character-level shape features
            if i > 0 and text[0].isalpha() and text[0].isupper():
                features[self.featureSet.getId("upper_case_start")] = 1
            for j in range(len(text)):
                if j > 0 and text[j].isalpha() and text[j].isupper():
                    features[self.featureSet.getId("upper_case_middle")] = 1
                # numbers and special characters
                if text[j].isdigit():
                    features[self.featureSet.getId("has_digits")] = 1
                    if j > 0 and text[j - 1] == "-":
                        features[self.featureSet.getId("has_hyphenated_digit")] = 1
                elif text[j] == "-":
                    features[self.featureSet.getId("has_hyphen")] = 1
                elif text[j] == "/":
                    features[self.featureSet.getId("has_fslash")] = 1
                elif text[j] == "\\":
                    features[self.featureSet.getId("has_bslash")] = 1
                # duplets
                if j > 0:
                    features[self.featureSet.getId("dt_" + text[j - 1:j + 1].lower())] = 1
                # triplets
                if j > 1:
                    features[self.featureSet.getId("tt_" + text[j - 2:j + 1].lower())] = 1
                # quadruplets (don't work, slight decrease (0.5 pp) on f-score
                #if j > 2:
                #    features[self.featureSet.getId("qt_"+text[j-3:j+1].lower())] = 1
            # Attached edges (Hanging in and out edges)
            if not self.styles["no_context"]:
                # dependency edges arriving at this token
                t1InEdges = self.inEdgesByToken[token]
                for edge in t1InEdges:
                    edgeType = edge[2].get("type")
                    features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
                    features[self.featureSet.getId("t1HIn_" + edge[0].get("POS"))] = 1
                    features[self.featureSet.getId("t1HIn_" + edgeType + "_" + edge[0].get("POS"))] = 1
                    tokenText = sentenceGraph.getTokenText(edge[0])
                    features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
                    features[self.featureSet.getId("t1HIn_" + edgeType + "_" + tokenText)] = 1
                    tokenStem = PorterStemmer.stem(tokenText)
                    features[self.featureSet.getId("t1HIn_" + tokenStem)] = 1
                    features[self.featureSet.getId("t1HIn_" + edgeType + "_" + tokenStem)] = 1
                    features[self.featureSet.getId("t1HIn_" + norStem + "_" + edgeType + "_" + tokenStem)] = 1
                # dependency edges leaving this token
                t1OutEdges = self.outEdgesByToken[token]
                for edge in t1OutEdges:
                    edgeType = edge[2].get("type")
                    features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
                    features[self.featureSet.getId("t1HOut_" + edge[1].get("POS"))] = 1
                    features[self.featureSet.getId("t1HOut_" + edgeType + "_" + edge[1].get("POS"))] = 1
                    tokenText = sentenceGraph.getTokenText(edge[1])
                    features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
                    features[self.featureSet.getId("t1HOut_" + edgeType + "_" + tokenText)] = 1
                    tokenStem = PorterStemmer.stem(tokenText)
                    features[self.featureSet.getId("t1HOut_" + tokenStem)] = 1
                    features[self.featureSet.getId("t1HOut_" + edgeType + "_" + tokenStem)] = 1
                    features[self.featureSet.getId("t1HOut_" + norStem + "_" + edgeType + "_" + tokenStem)] = 1
            # REL features
            if self.styles["rel_features"]:
                self.relFeatureBuilder.setFeatureVector(features)
                self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, i)
                self.relFeatureBuilder.setFeatureVector(None)
            # DDI13 features: every prefix and suffix of the normalized token text
            if self.styles["ddi13_features"]:
                for index in range(len(normalizedText)):
                    features[self.featureSet.getId("ddi13_fromstart" + str(index) + "_" + normalizedText[:index + 1])] = 1
                    features[self.featureSet.getId("ddi13_fromend" + str(index) + "_" + normalizedText[index:])] = 1
            if self.styles["drugbank_features"]:
                self.drugFeatureBuilder.setFeatureVector(features)
                self.drugFeatureBuilder.tag = "ddi_"
                self.drugFeatureBuilder.buildDrugFeatures(token)
                self.drugFeatureBuilder.setFeatureVector(None)
            #self.wordNetFeatureBuilder.getTokenFeatures("show", "VBP")
            #tokTxt = token.get("text")
            #tokPOS = token.get("POS")
            #wordNetFeatures = []
            #wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            #self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            if self.styles["wordnet"]:
                tokTxt = token.get("text")
                tokPOS = token.get("POS")
                wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
                for wordNetFeature in wordNetFeatures:
                    #print wordNetFeature,
                    features[self.featureSet.getId("WN_" + wordNetFeature)] = 1
                #print
            if self.styles["giuliano"]:
                self.giulianoFeatureBuilder.setFeatureVector(features)
                self.giulianoFeatureBuilder.buildTriggerFeatures(token, sentenceGraph)
                self.giulianoFeatureBuilder.setFeatureVector(None)
            if self.styles["ontobiotope_features"]:
                self.ontobiotopeFeatureBuilder.setFeatureVector(features)
                self.ontobiotopeFeatureBuilder.buildOBOFeaturesForToken(token)
                self.ontobiotopeFeatureBuilder.setFeatureVector(None)
            # Example metadata (read downstream by the ExampleWriter)
            extra = {"xtype": "token", "t": token.get("id")}
            if self.styles["bb_features"]:
                extra["trigex"] = "bb"  # Request trigger extension in ExampleWriter
            if self.styles["epi_merge_negated"]:
                extra["unmergeneg"] = "epi"  # Request trigger type unmerging
            if entityIds != None:
                extra["goldIds"] = entityIds  # The entities to which this example corresponds
            #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )
            if self.styles["bb_spans"]:
                # spans whose head offset matches this token
                for span in sentenceGraph.sentenceElement.iter("span"):
                    if span.get("headOffset") != token.get("charOffset"):
                        continue
                    #if span.get("source") != "spec":
                    #    continue
                    #print span.get("headOffset"), token.get("charOffset"), span.get("source"), token.get("id")
                    features[self.featureSet.getId("span_found")] = 1
                    features[self.featureSet.getId("span_count")] = 1 + features.get(self.featureSet.getId("span_count"), 0)
                    features[self.featureSet.getId("span_identifier" + span.get("identifier"))] = 1
                    features[self.featureSet.getId("span_type" + span.get("type"))] = 1
                    features[self.featureSet.getId("span_category" + span.get("category"))] = 1
                    features[self.featureSet.getId("span_source" + span.get("source"))] = 1
                    # keep the widest span offset seen so far in "define_offset"
                    if "define_offset" in extra:
                        prevOffset = [int(x) for x in extra["define_offset"].split("-")]
                        assert len(prevOffset) == 2
                        newOffset = [int(x) for x in span.get("charOffset").split("-")]
                        assert len(newOffset) == 2
                        prevOffsetRange = abs(prevOffset[0] - prevOffset[1])
                        newOffsetRange = abs(newOffset[0] - newOffset[1])
                        if newOffsetRange > prevOffsetRange:
                            extra["define_offset"] = span.get("charOffset")
                    else:
                        extra["define_offset"] = span.get("charOffset")
                features[self.featureSet.getId("span_count_" + str(features.get(self.featureSet.getId("span_count"), 0)))] = 1
            # chains
            if not self.styles["no_context"]:
                self.buildChains(token, sentenceGraph, features)
            if self.styles["pos_pairs"]:
                self.buildPOSPairs(token, namedEntityHeadTokens, features)
            if self.styles["wordvector"]:
                self.wordVectorFeatureBuilder.setFeatureVector(features)
                self.wordVectorFeatureBuilder.buildFeatures(token)
                self.wordVectorFeatureBuilder.setFeatureVector(None)
            example = (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)
            ExampleUtils.appendExamples([example], outfile)
            exampleIndex += 1
            self.exampleStats.endExample()
        #return examples
        return exampleIndex

    def buildChains(self, token, sentenceGraph, features, depthLeft=3, chain="", visited=None):
        """
        Recursively build dependency-chain features starting from token.

        token -- the token from which chains are expanded
        sentenceGraph -- the sentence analysis graph
        features -- the feature dictionary (mutated in place)
        depthLeft -- remaining recursion depth (chains up to 3 edges by default)
        chain -- string encoding of the edge-type path walked so far
        visited -- set of edges already used, to avoid walking an edge twice
        """
        if depthLeft == 0:
            return
        strDepthLeft = "dist_" + str(depthLeft)
        if visited == None:
            visited = set()
        inEdges = self.inEdgesByToken[token]
        outEdges = self.outEdgesByToken[token]
        # edges touching this token are added to the visited set for the recursion
        edgeSet = visited.union(self.edgeSetByToken[token])
        # follow incoming edges ("-frw_" direction)
        for edge in inEdges:
            if not edge in visited:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("dep_" + strDepthLeft + edgeType)] = 1
                nextToken = edge[0]
                tokenFeatures, tokenWeights = self.getTokenFeatures(nextToken, sentenceGraph)
                for tokenFeature in tokenFeatures:
                    features[self.featureSet.getId(strDepthLeft + tokenFeature)] = tokenWeights[tokenFeature]
                #for entity in sentenceGraph.tokenIsEntityHead[nextToken]:
                #    if entity.get("given") == "True":
                #        features[self.featureSet.getId("name_dist_"+strDepthLeft)] = 1
                #        features[self.featureSet.getId("name_dist_"+strDepthLeft+entity.get("type"))] = 1
                #features[self.featureSet.getId("POS_dist_"+strDepthLeft+nextToken.get("POS"))] = 1
                #tokenText = sentenceGraph.getTokenText(nextToken)
                #features[self.featureSet.getId("text_dist_"+strDepthLeft+tokenText)] = 1
                if sentenceGraph.tokenIsName[nextToken] and not self.styles["names"]:
                    features[self.featureSet.getId("name_chain_dist_" + strDepthLeft + chain + "-frw_" + edgeType)] = 1
                features[self.featureSet.getId("chain_dist_" + strDepthLeft + chain + "-frw_" + edgeType)] = 1
                self.buildChains(nextToken, sentenceGraph, features, depthLeft - 1, chain + "-frw_" + edgeType, edgeSet)
        # follow outgoing edges ("-rev_" direction)
        for edge in outEdges:
            if not edge in visited:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("dep_dist_" + strDepthLeft + edgeType)] = 1
                nextToken = edge[1]
                tokenFeatures, tokenWeights = self.getTokenFeatures(nextToken, sentenceGraph)
                for tokenFeature in tokenFeatures:
                    features[self.featureSet.getId(strDepthLeft + tokenFeature)] = tokenWeights[tokenFeature]
                #for entity in sentenceGraph.tokenIsEntityHead[nextToken]:
                #    if entity.get("given") == "True":
                #        features[self.featureSet.getId("name_dist_"+strDepthLeft)] = 1
                #        features[self.featureSet.getId("name_dist_"+strDepthLeft+entity.get("type"))] = 1
                #features[self.featureSet.getId("POS_dist_"+strDepthLeft+nextToken.get("POS"))] = 1
                #tokenText = sentenceGraph.getTokenText(nextToken)
                #features[self.featureSet.getId("text_dist_"+strDepthLeft+tokenText)] = 1
                if sentenceGraph.tokenIsName[nextToken] and not self.styles["names"]:
                    features[self.featureSet.getId("name_chain_dist_" + strDepthLeft + chain + "-rev_" + edgeType)] = 1
                features[self.featureSet.getId("chain_dist_" + strDepthLeft + chain + "-rev_" + edgeType)] = 1
                self.buildChains(nextToken, sentenceGraph, features, depthLeft - 1, chain + "-rev_" + edgeType, edgeSet)

    def getNamedEntityHeadTokens(self, sentenceGraph):
        """Return the head tokens of all given (known) entities in the sentence."""
        headTokens = []
        for entity in sentenceGraph.entities:
            if entity.get("given") == "True":  # known data which can be used for features
                headTokens.append(sentenceGraph.entityHeadTokenByEntity[entity])
        return headTokens

    def buildPOSPairs(self, token, namedEntityHeadTokens, features):
        """Add POS-pair features between token and every named entity head token."""
        tokenPOS = token.get("POS")
        assert tokenPOS != None
        for headToken in namedEntityHeadTokens:
            headPOS = headToken.get("POS")
            features[self.featureSet.getId("POS_pair_NE_" + tokenPOS + "-" + headPOS)] = 1
class MultiEdgeExampleBuilder(ExampleBuilder):
    """
    This example builder makes edge examples, i.e. examples describing
    the event arguments.
    """
    def __init__(self, style=None, length=None, types=None, featureSet=None, classSet=None):
        """
        Initialize the edge example builder.

        style -- parameter string/dict understood by getParameters; None
                 selects the default "typed", "directed", "headsOnly" settings.
        length -- must be None; path-length filtering is not supported.
        types -- optional list of interaction types to keep (None/[] = all).
        featureSet -- IdSet of feature names (a new one is created if None).
        classSet -- IdSet of class names; "neg" must map to id 1, or to -1 in
                    a two-class set.
        """
        if featureSet is None:
            featureSet = IdSet()
        if classSet is None:
            classSet = IdSet(1)
        # The negative class must have a fixed, known id
        assert classSet.getId("neg") == 1 or (len(classSet.Ids) == 2 and classSet.getId("neg") == -1)
        ExampleBuilder.__init__(self, classSet=classSet, featureSet=featureSet)

        # Valid parameter names (duplicates removed from the original list)
        self.styles = self.getParameters(style, [
            "typed", "directed", "headsOnly", "graph_kernel", "noAnnType",
            "noMasking", "maxFeatures", "genia_limits", "epi_limits",
            "id_limits", "rel_limits", "bb_limits", "bi_limits", "co_limits",
            "genia_task1", "ontology", "nodalida", "bacteria_renaming",
            "trigger_features", "rel_features", "ddi_features", "evex",
            "giuliano", "random", "themeOnly", "causeOnly", "no_path",
            "entities", "skip_extra_triggers", "no_task", "no_dependency",
            "disable_entity_features", "disable_terminus_features",
            "disable_single_element_features", "disable_ngram_features",
            "disable_path_edge_features", "no_linear", "subset", "binary",
            "pos_only", "entity_type"
        ])
        if style is None:  # no parameters given, enable the default settings
            # BUGFIX: the original assigned into the local "style" argument,
            # which is None here and would raise a TypeError; the defaults
            # belong in the parsed self.styles dictionary.
            self.styles["typed"] = self.styles["directed"] = self.styles["headsOnly"] = True

        self.multiEdgeFeatureBuilder = MultiEdgeFeatureBuilder(self.featureSet)
        # NOTE Temporarily re-enabling predicted range
        #self.multiEdgeFeatureBuilder.definePredictedValueRange([], None)
        if self.styles["graph_kernel"]:
            from FeatureBuilders.GraphKernelFeatureBuilder import GraphKernelFeatureBuilder
            self.graphKernelFeatureBuilder = GraphKernelFeatureBuilder(self.featureSet)
        if self.styles["noAnnType"]:
            self.multiEdgeFeatureBuilder.noAnnType = True
        if self.styles["noMasking"]:
            self.multiEdgeFeatureBuilder.maskNamedEntities = False
        if self.styles["maxFeatures"]:
            self.multiEdgeFeatureBuilder.maximum = True
        if self.styles["genia_task1"]:
            self.multiEdgeFeatureBuilder.filterAnnTypes.add("Entity")
        self.tokenFeatureBuilder = TokenFeatureBuilder(self.featureSet)
        # Optional task/feature specific builders, created only when requested
        if self.styles["ontology"]:
            self.multiEdgeFeatureBuilder.ontologyFeatureBuilder = BioInferOntologyFeatureBuilder(self.featureSet)
        if self.styles["nodalida"]:
            self.nodalidaFeatureBuilder = NodalidaFeatureBuilder(self.featureSet)
        if self.styles["bacteria_renaming"]:
            self.bacteriaRenamingFeatureBuilder = BacteriaRenamingFeatureBuilder(self.featureSet)
        if self.styles["trigger_features"]:
            self.triggerFeatureBuilder = TriggerFeatureBuilder(self.featureSet)
            self.triggerFeatureBuilder.useNonNameEntities = True
            if self.styles["genia_task1"]:
                self.triggerFeatureBuilder.filterAnnTypes.add("Entity")
        #self.bioinferOntologies = OntologyUtils.loadOntologies(OntologyUtils.g_bioInferFileName)
        if self.styles["rel_features"]:
            self.relFeatureBuilder = RELFeatureBuilder(featureSet)
        if self.styles["ddi_features"]:
            self.drugFeatureBuilder = DrugFeatureBuilder(featureSet)
        if self.styles["evex"]:
            self.evexFeatureBuilder = EVEXFeatureBuilder(featureSet)
        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder = GiulianoFeatureBuilder(featureSet)
        self.pathLengths = length
        assert self.pathLengths is None  # path-length filtering is not supported
        # None-sentinel instead of a shared mutable default argument
        self.types = types if types is not None else []
        if self.styles["random"]:
            from FeatureBuilders.RandomFeatureBuilder import RandomFeatureBuilder
            self.randomFeatureBuilder = RandomFeatureBuilder(self.featureSet)

    def definePredictedValueRange(self, sentences, elementName):
        """Delegate predicted-value-range definition to the edge feature builder."""
        self.multiEdgeFeatureBuilder.definePredictedValueRange(sentences, elementName)

    def getPredictedValueRange(self):
        """Return the predicted value range of the edge feature builder."""
        return self.multiEdgeFeatureBuilder.predictedRange

    def filterEdgesByType(self, edges, typesToInclude):
        """Return edges whose "type" attribute is in typesToInclude (all if empty)."""
        if len(typesToInclude) == 0:
            return edges
        return [edge for edge in edges if edge.get("type") in typesToInclude]

    def getCategoryNameFromTokens(self, sentenceGraph, t1, t2, directed=True):
        """
        Example class. Multiple overlapping edges create a merged type.
        Returns "neg" when no interaction edge connects the two tokens.
        """
        intEdges = sentenceGraph.interactionGraph.getEdges(t1, t2)
        if not directed:
            intEdges = intEdges + sentenceGraph.interactionGraph.getEdges(t2, t1)
        types = set()
        for intEdge in intEdges:
            types.add(intEdge[2].get("type"))
        # merge overlapping edge types into a single deterministic class name
        categoryName = "---".join(sorted(types))
        if categoryName != "":
            return categoryName
        else:
            return "neg"

    def getCategoryName(self, sentenceGraph, e1, e2, directed=True, duplicateEntities=None):
        """
        Example class. Multiple overlapping edges create a merged type.
        The "causeOnly"/"themeOnly" styles restrict which argument types
        contribute to the class name. Returns "neg" when nothing remains.
        """
        interactions = sentenceGraph.getInteractions(e1, e2, True)
        types = set()
        for interaction in interactions:
            types.add(interaction[2].get("type"))
        keptNames = []
        for name in sorted(types):
            if self.styles["causeOnly"] and name != "Cause":
                continue
            if self.styles["themeOnly"] and name != "Theme":
                continue
            keptNames.append(name)
        categoryName = "---".join(keptNames)
        if categoryName != "":
            return categoryName
        else:
            return "neg"

    def isPotentialRELInteraction(self, e1, e2):
        """A REL (entity relation) edge can only go from a Protein to an Entity."""
        return e1.get("type") == "Protein" and e2.get("type") == "Entity"

    def isPotentialBBInteraction(self, e1, e2, sentenceGraph):
        """Bacteria Biotope task: allowed argument type combinations."""
        #if e1.get("type") == "Bacterium" and e2.get("type") in ["Host", "HostPart", "Geographical", "Environmental", "Food", "Medical", "Soil", "Water"]:
        # Note: "Environment" type is misspelled as "Environmental" in the BB-task documentation
        if e1.get("type") == "Bacterium" and e2.get("type") in [
                "Host", "HostPart", "Geographical", "Environment", "Food",
                "Medical", "Soil", "Water"]:
            return True
        elif e1.get("type") == "Host" and e2.get("type") == "HostPart":
            return True
        else:
            return False

    def getBISuperType(self, eType):
        """Map a Bacteria Gene Interactions entity type to its supertype, or None."""
        if eType in ["GeneProduct", "Protein", "ProteinFamily", "PolymeraseComplex"]:
            return "ProteinEntity"
        elif eType in ["Gene", "GeneFamily", "GeneComplex", "Regulon", "Site", "Promoter"]:
            return "GeneEntity"
        else:
            return None

    def isPotentialBIInteraction(self, e1, e2, sentenceGraph, stats):
        """
        Bacteria Gene Interactions task: allowed argument type combinations.
        Records a "bi_limits" filter event in stats when the pair is rejected.
        """
        e1Type = e1.get("type")
        e1SuperType = self.getBISuperType(e1Type)
        e2Type = e2.get("type")
        e2SuperType = self.getBISuperType(e2Type)
        if e1Type == "Regulon":
            if e2SuperType in ["GeneEntity", "ProteinEntity"]:
                return True
        if e1SuperType == "ProteinEntity":
            if e2Type in ["Site", "Promoter", "Gene", "GeneComplex"]:
                return True
        if e1Type in ["Action", "Transcription", "Expression"]:
            return True
        if e1Type == "Site":
            if e2SuperType == "GeneEntity":
                return True
        if e1Type == "Promoter":
            if e2SuperType in ["GeneEntity", "ProteinEntity"]:
                return True
        if e1SuperType in ["GeneEntity", "ProteinEntity"]:
            if e2SuperType in ["GeneEntity", "ProteinEntity"]:
                return True
        stats.filter("bi_limits")
        return False

    def isPotentialEPIInteraction(self, e1, e2, sentenceGraph):
        """Epigenetics task: allowed argument type combinations."""
        if e1.get("type") != "Catalysis":
            if e1.get("type") in ["Protein", "Entity"]:
                return False  # plain entities cannot be event triggers
            elif e2.get("type") in ["Protein", "Entity"]:
                return True
            else:
                return False
        else:  # Catalysis: any non-Entity argument is allowed
            return e2.get("type") != "Entity"

    def isPotentialIDInteraction(self, e1, e2, sentenceGraph):
        """Infectious Diseases task: allowed argument type combinations."""
        e1Type = e1.get("type")
        e2Type = e2.get("type")
        coreTypes = ["Protein", "Regulon-operon", "Two-component-system", "Chemical", "Organism"]
        e1IsCore = e1Type in coreTypes
        e2IsCore = e2Type in coreTypes
        if e1IsCore:
            return False  # core entities cannot act as event triggers
        elif e1Type in ["Gene_expression", "Transcription"]:
            return e2Type in ["Protein", "Regulon-operon"]
        elif e1Type in ["Protein_catabolism", "Phosphorylation"]:
            return e2Type == "Protein"
        elif e1Type == "Localization":
            return e2IsCore or e2Type == "Entity"
        elif e1Type in ["Binding", "Process"]:
            return e2IsCore
        elif "egulation" in e1Type:  # matches both "Regulation" and "...regulation"
            return e2Type != "Entity"
        elif e1Type == "Entity":
            return e2IsCore
        assert False, (e1Type, e2Type)
    def isPotentialCOInteraction(self, e1, e2, sentenceGraph):
        """
        Coreference task: an Exp (anaphora) may point to an Exp antecedent that
        precedes it in token order, or to a Protein.
        """
        if e1.get("type") == "Exp" and e2.get("type") == "Exp":
            anaphoraTok = sentenceGraph.entityHeadTokenByEntity[e1]
            antecedentTok = sentenceGraph.entityHeadTokenByEntity[e2]
            antecedentTokenFound = False
            for token in sentenceGraph.tokens:
                if token == antecedentTok:
                    antecedentTokenFound = True
                if token == anaphoraTok:  # if, not elif, to take into account cases where e1Tok == e2Tok
                    if antecedentTokenFound:
                        return True
                    else:
                        return False
            assert False
        elif e1.get("type") == "Exp" and e2.get("type") == "Protein":
            return True
        else:
            return False

    def isPotentialGeniaInteraction(self, e1, e2):
        """GENIA task: allowed trigger/argument type combinations."""
        e1Type = e1.get("type")
        e2Type = e2.get("type")
        if e1Type == "Protein":
            return False
        elif e1Type in ["Entity", "Gene_expression", "Transcription", "Protein_catabolism", "Phosphorylation", "Binding"]:
            if e2Type == "Protein":
                return True
            else:
                return False
        elif e1Type == "Localization":
            if e2Type in ["Protein", "Entity"]:
                return True
            else:
                return False
        elif "egulation" in e1Type:
            if e2Type != "Entity":
                return True
            else:
                return False
        assert False, (e1Type, e2Type)

    def getGoldCategoryName(self, goldGraph, entityToGold, e1, e2, directed=True):
        """
        Return the class name from the gold graph for the entities mapped to
        e1 and e2, or "neg" if either entity has no gold counterpart.
        """
        if len(entityToGold[e1]) > 0 and len(entityToGold[e2]) > 0:
            return self.getCategoryName(goldGraph, entityToGold[e1][0], entityToGold[e2][0], directed=directed)
        else:
            return "neg"

    def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None):
        """
        Build examples for a single sentence. Returns a list of examples.
        See Core/ExampleUtils for example format.
        """
        #examples = []
        exampleIndex = 0
        if self.styles["trigger_features"]:
            self.triggerFeatureBuilder.initSentence(sentenceGraph)
        if self.styles["evex"]:
            self.evexFeatureBuilder.initSentence(sentenceGraph)
        # Filter entities, if needed
        #mergedIds = None
        #duplicateEntities = None
        #entities = sentenceGraph.entities
        #entities, mergedIds, duplicateEntities = self.mergeEntities(sentenceGraph, False) # "no_duplicates" in self.styles)
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        entityToDuplicates = sentenceGraph.mergedEntityToDuplicates
        self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))
        # Connect to optional gold graph
        if goldGraph != None:
            entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)
        paths = None
        if not self.styles["no_path"]:
            ##undirected = sentenceGraph.getUndirectedDependencyGraph()
            #undirected = self.nxMultiDiGraphToUndirected(sentenceGraph.dependencyGraph)
            ###undirected = sentenceGraph.dependencyGraph.to_undirected()
            ####undirected = NX10.MultiGraph(sentenceGraph.dependencyGraph) This didn't work
            undirected = sentenceGraph.dependencyGraph.toUndirected()
            #paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
            # the undirected graph itself serves as the path source (getPaths)
            paths = undirected
        #for edge in sentenceGraph.dependencyGraph.edges:
        #    assert edge[2] != None
        #for edge in undirected.edges:
        #    assert edge[2] != None
        #if sentenceGraph.sentenceElement.get("id") == "GENIA.d70.s5":
        #    print [(x[0].get("id"), x[1].get("id"), x[2].get("id")) for x in sentenceGraph.dependencyGraph.edges]
        # Generate examples based on interactions between entities or interactions between tokens
        if self.styles["entities"]:
            loopRange = len(entities)
        else:
            loopRange = len(sentenceGraph.tokens)
        # every unordered pair (i, j) is considered once
        for i in range(loopRange - 1):
            for j in range(i + 1, loopRange):
                eI = None
                eJ = None
                if self.styles["entities"]:
                    eI = entities[i]
                    eJ = entities[j]
                    tI = sentenceGraph.entityHeadTokenByEntity[eI]
                    tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                    #if "no_ne_interactions" in self.styles and eI.get("isName") == "True" and eJ.get("isName") == "True":
                    #    continue
                    if eI.get("type") == "neg" or eJ.get("type") == "neg":
                        continue
                    if self.styles["skip_extra_triggers"]:
                        if eI.get("source") != None or eJ.get("source") != None:
                            continue
                else:
                    tI = sentenceGraph.tokens[i]
                    tJ = sentenceGraph.tokens[j]
                # only consider paths between entities (NOTE! entities, not only named entities)
                if self.styles["headsOnly"]:
                    if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                        continue
                if self.styles["directed"]:
                    # define forward
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(sentenceGraph, eI, eJ, True)
                        if goldGraph != None:
                            categoryName = self.getGoldCategoryName(goldGraph, entityToGold, eI, eJ, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, True)
                    # make forward
                    self.exampleStats.beginExample(categoryName)
                    makeExample = True
                    # task-specific filters: each can veto the example and records why
                    if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(eI, eJ):
                        makeExample = False
                        self.exampleStats.filter("genia_limits")
                    if self.styles["genia_task1"] and (eI.get("type") == "Entity" or eJ.get("type") == "Entity"):
                        makeExample = False
                        self.exampleStats.filter("genia_task1")
                    if self.styles["rel_limits"] and not self.isPotentialRELInteraction(eI, eJ):
                        makeExample = False
                        self.exampleStats.filter("rel_limits")
                    if self.styles["co_limits"] and not self.isPotentialCOInteraction(eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("co_limits")
                    if self.styles["bb_limits"] and not self.isPotentialBBInteraction(eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("bb_limits")
                        if categoryName != "neg":
                            self.exampleStats.filter("bb_limits(" + categoryName + ":" + eI.get("type") + "/" + eJ.get("type") + ")")
                    if self.styles["bi_limits"] and not self.isPotentialBIInteraction(eI, eJ, sentenceGraph, self.exampleStats):
                        makeExample = False
                        #self.exampleStats.filter("bi_limits")
                    if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("epi_limits")
                    if self.styles["id_limits"] and not self.isPotentialIDInteraction(eI, eJ, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("id_limits")
                    #if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
                    #    makeExample = False
                    #    self.exampleStats.filter("selftrain_limits")
                    #if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
                    #    makeExample = False
                    #    self.exampleStats.filter("selftrain_group")
                    if self.styles["pos_only"] and categoryName == "neg":
                        makeExample = False
                        self.exampleStats.filter("pos_only")
                    if makeExample:
                        #examples.append( self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ) )
                        ExampleUtils.appendExamples([self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ)], outfile)
                        exampleIndex += 1
                    self.exampleStats.endExample()
                    # define reverse
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(sentenceGraph, eJ, eI, True)
                        if goldGraph != None:
                            categoryName = self.getGoldCategoryName(goldGraph, entityToGold, eJ, eI, True)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tJ, tI, True)
                    # make reverse (same filters as forward, with arguments swapped)
                    self.exampleStats.beginExample(categoryName)
                    makeExample = True
                    if self.styles["genia_limits"] and not self.isPotentialGeniaInteraction(eJ, eI):
                        makeExample = False
                        self.exampleStats.filter("genia_limits")
                    if self.styles["genia_task1"] and (eI.get("type") == "Entity" or eJ.get("type") == "Entity"):
                        makeExample = False
                        self.exampleStats.filter("genia_task1")
                    if self.styles["rel_limits"] and not self.isPotentialRELInteraction(eJ, eI):
                        makeExample = False
                        self.exampleStats.filter("rel_limits")
                    if self.styles["co_limits"] and not self.isPotentialCOInteraction(eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("co_limits")
                    if self.styles["bb_limits"] and not self.isPotentialBBInteraction(eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("bb_limits")
                        if categoryName != "neg":
                            self.exampleStats.filter("bb_limits(" + categoryName + ":" + eJ.get("type") + "/" + eI.get("type") + ")")
                    if self.styles["bi_limits"] and not self.isPotentialBIInteraction(eJ, eI, sentenceGraph, self.exampleStats):
                        makeExample = False
                        #self.exampleStats.filter("bi_limits")
                    if self.styles["epi_limits"] and not self.isPotentialEPIInteraction(eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("epi_limits")
                    if self.styles["id_limits"] and not self.isPotentialIDInteraction(eJ, eI, sentenceGraph):
                        makeExample = False
                        self.exampleStats.filter("id_limits")
                    #if self.styles["selftrain_limits"] and (eI.get("selftrain") == "False" or eJ.get("selftrain") == "False"):
                    #    makeExample = False
                    #    self.exampleStats.filter("selftrain_limits")
                    #if self.styles["selftrain_group"] and (eI.get("selftraingroup") not in self.selfTrainGroups or eJ.get("selftraingroup") not in self.selfTrainGroups):
                    #    makeExample = False
                    #    self.exampleStats.filter("selftrain_group")
                    if self.styles["pos_only"] and categoryName == "neg":
                        makeExample = False
                        self.exampleStats.filter("pos_only")
                    if makeExample:
                        #examples.append( self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI) )
                        ExampleUtils.appendExamples([self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI)], outfile)
                        exampleIndex += 1
                    self.exampleStats.endExample()
                else:
                    # undirected: a single example whose features merge both directions
                    if self.styles["entities"]:
                        categoryName = self.getCategoryName(sentenceGraph, eI, eJ, False)
                    else:
                        categoryName = self.getCategoryNameFromTokens(sentenceGraph, tI, tJ, False)
                    self.exampleStats.beginExample(categoryName)
                    forwardExample = self.buildExample(tI, tJ, paths, sentenceGraph, categoryName, exampleIndex, eI, eJ)
                    if not self.styles["graph_kernel"]:
                        reverseExample = self.buildExample(tJ, tI, paths, sentenceGraph, categoryName, exampleIndex, eJ, eI)
                        forwardExample[2].update(reverseExample[2])
                    #examples.append(forwardExample)
                    ExampleUtils.appendExamples([forwardExample], outfile)
                    exampleIndex += 1
                    self.exampleStats.endExample()
        #return examples
        return exampleIndex

    def buildExample(self, token1, token2, paths, sentenceGraph, categoryName, exampleIndex, entity1=None, entity2=None):
        """
        Build a single directed example for the potential edge between token1 and token2
        """
        # dummy return for speed testing
        #return (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),1,{},{})
        # define features
        features = {}
        if True: #token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2):
            #if token1 != token2 and paths.has_key(token1) and paths[token1].has_key(token2):
            #    path = paths[token1][token2]
            #else:
            #    path = [token1, token2]
            if not self.styles["no_path"]:
                # directedPath reduces performance by 0.01 pp
                #directedPath = sentenceGraph.dependencyGraph.getPaths(token1, token2)
                #if len(directedPath) == 0:
                #    directedPath = sentenceGraph.dependencyGraph.getPaths(token2, token1)
                #    for dp in directedPath:
                #        dp.reverse()
                #if len(directedPath) == 0:
                #    path = paths.getPaths(token1, token2)
                #else:
                #    path = directedPath
                path = paths.getPaths(token1, token2)
                if len(path) > 0:
                    #if len(path) > 1:
                    #    print len(path)
                    # use the first shortest path only
                    path = path[0]
                    pathExists = True
                else:
                    # no dependency path: fall back to the token pair itself
                    path = [token1, token2]
                    pathExists = False
            else:
                path = [token1, token2]
                pathExists = False
            #print token1.get("id"), token2.get("id")
            assert (self.pathLengths == None)
            if self.pathLengths == None or len(path) - 1 in self.pathLengths:
                #if not "no_ontology" in self.styles:
                #    self.ontologyFeatureBuilder.setFeatureVector(features)
                #    self.ontologyFeatureBuilder.buildOntologyFeaturesForPath(sentenceGraph, path)
                #    self.ontologyFeatureBuilder.setFeatureVector(None)
                if self.styles["trigger_features"]: # F 85.52 -> 85.55
self.triggerFeatureBuilder.setFeatureVector(features) self.triggerFeatureBuilder.tag = "trg1_" self.triggerFeatureBuilder.buildFeatures(token1) self.triggerFeatureBuilder.tag = "trg2_" self.triggerFeatureBuilder.buildFeatures(token2) self.triggerFeatureBuilder.setFeatureVector(None) # REL features if self.styles["rel_features"] and not self.styles["no_task"]: self.relFeatureBuilder.setFeatureVector(features) self.relFeatureBuilder.tag = "rel1_" self.relFeatureBuilder.buildAllFeatures( sentenceGraph.tokens, sentenceGraph.tokens.index(token1)) self.relFeatureBuilder.tag = "rel2_" self.relFeatureBuilder.buildAllFeatures( sentenceGraph.tokens, sentenceGraph.tokens.index(token2)) self.relFeatureBuilder.setFeatureVector(None) if self.styles[ "bacteria_renaming"] and not self.styles["no_task"]: self.bacteriaRenamingFeatureBuilder.setFeatureVector( features) self.bacteriaRenamingFeatureBuilder.buildPairFeatures( entity1, entity2) #self.bacteriaRenamingFeatureBuilder.buildSubstringFeatures(entity1, entity2) # decreases perf. 
74.76 -> 72.41 self.bacteriaRenamingFeatureBuilder.setFeatureVector(None) if self.styles["co_limits"] and not self.styles["no_task"]: e1Offset = Range.charOffsetToSingleTuple( entity1.get("charOffset")) e2Offset = Range.charOffsetToSingleTuple( entity2.get("charOffset")) if Range.contains(e1Offset, e2Offset): features[self.featureSet.getId("e1_contains_e2")] = 1 if entity2.get("isName") == "True": features[self.featureSet.getId( "e1_contains_e2name")] = 1 if Range.contains(e2Offset, e1Offset): features[self.featureSet.getId("e2_contains_e1")] = 1 if entity1.get("isName") == "True": features[self.featureSet.getId( "e2_contains_e1name")] = 1 if self.styles["ddi_features"]: self.drugFeatureBuilder.setFeatureVector(features) self.drugFeatureBuilder.tag = "ddi_" self.drugFeatureBuilder.buildPairFeatures(entity1, entity2) if self.styles["ddi_mtmx"]: self.drugFeatureBuilder.buildMTMXFeatures( entity1, entity2) self.drugFeatureBuilder.setFeatureVector(None) #if "graph_kernel" in self.styles or not "no_dependency" in self.styles: # #print "Getting edges" # if token1 != token2 and pathExists: # #print "g1" # edges = self.multiEdgeFeatureBuilder.getEdges(sentenceGraph.dependencyGraph, path) # #print "g2" # else: # edges = None if self.styles["graph_kernel"]: self.graphKernelFeatureBuilder.setFeatureVector( features, entity1, entity2) self.graphKernelFeatureBuilder.buildGraphKernelFeatures( sentenceGraph, path) self.graphKernelFeatureBuilder.setFeatureVector(None) if self.styles["entity_type"]: features[self.featureSet.getId("e1_" + entity1.get("type"))] = 1 features[self.featureSet.getId("e2_" + entity2.get("type"))] = 1 features[self.featureSet.getId("distance_" + str(len(path)))] = 1 if not self.styles["no_dependency"]: #print "Dep features" self.multiEdgeFeatureBuilder.setFeatureVector( features, entity1, entity2) #self.multiEdgeFeatureBuilder.buildStructureFeatures(sentenceGraph, paths) # remove for fast if not self.styles["disable_entity_features"]: 
self.multiEdgeFeatureBuilder.buildEntityFeatures( sentenceGraph) self.multiEdgeFeatureBuilder.buildPathLengthFeatures(path) if not self.styles["disable_terminus_features"]: self.multiEdgeFeatureBuilder.buildTerminusTokenFeatures( path, sentenceGraph) # remove for fast if not self.styles["disable_single_element_features"]: self.multiEdgeFeatureBuilder.buildSingleElementFeatures( path, sentenceGraph) if not self.styles["disable_ngram_features"]: #print "NGrams" self.multiEdgeFeatureBuilder.buildPathGrams( 2, path, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams( 3, path, sentenceGraph) # remove for fast self.multiEdgeFeatureBuilder.buildPathGrams( 4, path, sentenceGraph) # remove for fast #self.buildEdgeCombinations(path, edges, sentenceGraph, features) # remove for fast #if edges != None: # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[0], edges[0][1]+edges[1][0], "t1", sentenceGraph) # remove for fast # self.multiEdgeFeatureBuilder.buildTerminusFeatures(path[-1], edges[len(path)-1][len(path)-2]+edges[len(path)-2][len(path)-1], "t2", sentenceGraph) # remove for fast if not self.styles["disable_path_edge_features"]: self.multiEdgeFeatureBuilder.buildPathEdgeFeatures( path, sentenceGraph) self.multiEdgeFeatureBuilder.buildSentenceFeatures( sentenceGraph) self.multiEdgeFeatureBuilder.setFeatureVector(None) if self.styles["nodalida"]: self.nodalidaFeatureBuilder.setFeatureVector( features, entity1, entity2) shortestPaths = self.nodalidaFeatureBuilder.buildShortestPaths( sentenceGraph.dependencyGraph, path) print shortestPaths if len(shortestPaths) > 0: self.nodalidaFeatureBuilder.buildNGrams( shortestPaths, sentenceGraph) self.nodalidaFeatureBuilder.setFeatureVector(None) if not self.styles["no_linear"]: self.tokenFeatureBuilder.setFeatureVector(features) for i in range(len(sentenceGraph.tokens)): if sentenceGraph.tokens[i] == token1: token1Index = i if sentenceGraph.tokens[i] == token2: token2Index = i linearPreTag = "linfw_" if 
token1Index > token2Index: token1Index, token2Index = token2Index, token1Index linearPreTag = "linrv_" self.tokenFeatureBuilder.buildLinearOrderFeatures( token1Index, sentenceGraph, 2, 2, preTag="linTok1") self.tokenFeatureBuilder.buildLinearOrderFeatures( token2Index, sentenceGraph, 2, 2, preTag="linTok2") # Before, middle, after # self.tokenFeatureBuilder.buildTokenGrams(0, token1Index-1, sentenceGraph, "bf") # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, "bw") # self.tokenFeatureBuilder.buildTokenGrams(token2Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, "af") # before-middle, middle, middle-after # self.tokenFeatureBuilder.buildTokenGrams(0, token2Index-1, sentenceGraph, linearPreTag+"bf", max=2) # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, token2Index-1, sentenceGraph, linearPreTag+"bw", max=2) # self.tokenFeatureBuilder.buildTokenGrams(token1Index+1, len(sentenceGraph.tokens)-1, sentenceGraph, linearPreTag+"af", max=2) self.tokenFeatureBuilder.setFeatureVector(None) if self.styles["random"]: self.randomFeatureBuilder.setFeatureVector(features) self.randomFeatureBuilder.buildRandomFeatures(100, 0.01) self.randomFeatureBuilder.setFeatureVector(None) if self.styles["genia_limits"] and not self.styles["no_task"]: e1Type = entity1.get("type") e2Type = entity2.get("type") assert (entity1.get("isName") == "False") if entity2.get("isName") == "True": features[self.featureSet.getId( "GENIA_target_protein")] = 1 else: features[self.featureSet.getId( "GENIA_nested_event")] = 1 if e1Type.find( "egulation" ) != -1: # leave r out to avoid problems with capitalization if entity2.get("isName") == "True": features[self.featureSet.getId( "GENIA_regulation_of_protein")] = 1 else: features[self.featureSet.getId( "GENIA_regulation_of_event")] = 1 if self.styles["bi_limits"]: # Make features based on entity types e1Type = entity1.get("type") e2Type = entity2.get("type") e1SuperType = str(self.getBISuperType(e1Type)) 
e2SuperType = str(self.getBISuperType(e2Type)) features[self.featureSet.getId("BI_e1_" + e1Type)] = 1 features[self.featureSet.getId("BI_e2_" + e2Type)] = 1 features[self.featureSet.getId("BI_e1sup_" + e1SuperType)] = 1 features[self.featureSet.getId("BI_e2sup_" + e2SuperType)] = 1 features[self.featureSet.getId("BI_e1e2_" + e1Type + "_" + e2Type)] = 1 features[self.featureSet.getId("BI_e1e2sup_" + e1SuperType + "_" + e2SuperType)] = 1 if self.styles["evex"]: self.evexFeatureBuilder.setFeatureVector( features, entity1, entity2) self.evexFeatureBuilder.buildEdgeFeatures( entity1, entity2, token1, token2, path, sentenceGraph) self.evexFeatureBuilder.setFeatureVector(None) if self.styles["giuliano"]: self.giulianoFeatureBuilder.setFeatureVector( features, entity1, entity2) self.giulianoFeatureBuilder.buildEdgeFeatures( entity1, entity2, token1, token2, path, sentenceGraph) self.giulianoFeatureBuilder.setFeatureVector(None) else: features[self.featureSet.getId("always_negative")] = 1 if self.styles["subset"]: features[self.featureSet.getId("out_of_scope")] = 1 else: features[self.featureSet.getId("always_negative")] = 1 if self.styles["subset"]: features[self.featureSet.getId("out_of_scope")] = 1 path = [token1, token2] # define extra attributes #if int(path[0].get("id").split("_")[-1]) < int(path[-1].get("id").split("_")[-1]): if int(path[0].get("charOffset").split("-")[0]) < int( path[-1].get("charOffset").split("-")[0]): #extra = {"xtype":"edge","type":"i","t1":path[0],"t2":path[-1]} extra = { "xtype": "edge", "type": "i", "t1": path[0].get("id"), "t2": path[-1].get("id") } extra["deprev"] = False else: #extra = {"xtype":"edge","type":"i","t1":path[-1],"t2":path[0]} extra = { "xtype": "edge", "type": "i", "t1": path[-1].get("id"), "t2": path[0].get("id") } extra["deprev"] = True if entity1 != None: #extra["e1"] = entity1 extra["e1"] = entity1.get("id") if sentenceGraph.mergedEntityToDuplicates != None: #extra["e1GoldIds"] = mergedEntityIds[entity1] 
extra["e1DuplicateIds"] = ",".join([ x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity1] ]) if entity2 != None: #extra["e2"] = entity2 extra["e2"] = entity2.get("id") if sentenceGraph.mergedEntityToDuplicates != None: extra["e2DuplicateIds"] = ",".join([ x.get("id") for x in sentenceGraph.mergedEntityToDuplicates[entity2] ]) #extra["e2GoldIds"] = mergedEntityIds[entity2] extra["categoryName"] = categoryName if self.styles["bacteria_renaming"]: if entity1.get("text") != None and entity1.get("text") != "": extra["e1t"] = entity1.get("text").replace(" ", "---").replace( ":", "-COL-") if entity2.get("text") != None and entity2.get("text") != "": extra["e2t"] = entity2.get("text").replace(" ", "---").replace( ":", "-COL-") sentenceOrigId = sentenceGraph.sentenceElement.get("origId") if sentenceOrigId != None: extra["SOID"] = sentenceOrigId # make example if self.styles["binary"]: if categoryName != "neg": category = 1 else: category = -1 categoryName = "i" else: category = self.classSet.getId(categoryName) # NOTE: temporarily disable for replicating 110310 experiment #features[self.featureSet.getId("extra_constant")] = 1 return (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)