def buildAdjacencyMatrixWithShortestPaths(tokenElements, dependencyElements, entityElements, pairElement, directed=True, weight_by_distance=False):
    """
    Build an adjacency matrix for one entity pair, emphasising the shortest
    dependency paths between the two entities.

    Parameters
    ----------
    tokenElements :
        token elements of the sentence, handed to ParseGraph.ParseGraph.
    dependencyElements :
        dependency elements of the sentence; "punct" dependencies are
        removed before the graph is built.
    entityElements :
        entity elements, handed to parseGraph.markNamedEntities.
    pairElement :
        element whose "e1" and "e2" attributes name the two entities and
        whose "interaction" attribute holds the truth value as a string.
    directed : bool
        forwarded to parseGraph.buildAdjacencyMatrix as its second
        positional argument.
    weight_by_distance : bool
        if true, parseGraph.reduceWeightByDistance(0.9, 0.5) is applied
        after the path weights have been set.

    Returns
    -------
    matrix_tuple : tuple of (adjMatrix, labels, output)
        adjMatrix and labels are the two values returned by
        parseGraph.buildAdjacencyMatrix; output is 1.0 if the pair's
        "interaction" attribute equals the string "True", otherwise -1.0.
    """
    baseWeight = 0.3          # weight given to every dependency first
    shortestPathWeight = 0.9  # weight of dependencies on a shortest path
    reductionFactor = 0.5     # second argument of reduceWeightByDistance

    # Punctuation dependencies are mostly junk
    dependencyElements = removeDependencies(dependencyElements, ["punct"])
    parseGraph = ParseGraph.ParseGraph(tokenElements, dependencyElements)
    parseGraph.markNamedEntities(entityElements)

    e1Id = pairElement.get("e1")
    e2Id = pairElement.get("e2")
    entity1TokenIds = parseGraph.getNamedEntityTokenIds([e1Id])
    entity2TokenIds = parseGraph.getNamedEntityTokenIds([e2Id])

    binaryPaths = parseGraph.buildBinaryPaths(entity1TokenIds, entity2TokenIds)
    shortestPaths = ParseGraph.getShortestPaths(binaryPaths)

    parseGraph.setAllDependencyWeights(baseWeight)
    parseGraph.setDependencyWeightsByPath(shortestPaths, shortestPathWeight)
    if weight_by_distance:
        parseGraph.reduceWeightByDistance(shortestPathWeight, reductionFactor)
    parseGraph.setPPIPrefixForDependencies("sp", shortestPathWeight)  # shortest path prefix

    parseGraph.maskNames(e1Id, e2Id)
    parseGraph.addPositionTags(entity1TokenIds, entity2TokenIds)

    output = 1. if pairElement.get("interaction") == "True" else -1.

    # `directed` is passed positionally as the second argument, in the same
    # position buildAdjacencyMatrix() in this module passes m.directed.
    adjMatrix, labels = parseGraph.buildAdjacencyMatrix(floattype, directed)
    return adjMatrix, labels, output
def buildAdjacencyMatrixWithShortestPaths(tokenElements, dependencyElements, entityElements, pairElement, directed = True, weight_by_distance = False):
    """
    Build an adjacency matrix for one entity pair, emphasising the shortest
    dependency paths between the two entities.

    Parameters
    ----------
    tokenElements :
        token elements of the sentence, handed to ParseGraph.ParseGraph.
    dependencyElements :
        dependency elements of the sentence; "punct" dependencies are
        removed before the graph is built.
    entityElements :
        entity elements, handed to parseGraph.markNamedEntities.
    pairElement :
        element whose "e1" and "e2" attributes name the two entities and
        whose "interaction" attribute holds the truth value as a string.
    directed : bool
        forwarded to parseGraph.buildAdjacencyMatrix as its second
        positional argument.
    weight_by_distance : bool
        if true, parseGraph.reduceWeightByDistance(0.9, 0.5) is applied
        after the path weights have been set.

    Returns
    -------
    matrix_tuple : tuple of (adjMatrix, labels, output)
        adjMatrix and labels are the two values returned by
        parseGraph.buildAdjacencyMatrix; output is 1.0 if the pair's
        "interaction" attribute equals the string "True", otherwise -1.0.
    """
    baseWeight = 0.3          # weight given to every dependency first
    shortestPathWeight = 0.9  # weight of dependencies on a shortest path
    reductionFactor = 0.5     # second argument of reduceWeightByDistance

    # Punctuation dependencies are mostly junk
    dependencyElements = removeDependencies(dependencyElements, ["punct"])
    parseGraph = ParseGraph.ParseGraph(tokenElements, dependencyElements)
    parseGraph.markNamedEntities(entityElements)

    e1Id = pairElement.get("e1")
    e2Id = pairElement.get("e2")
    entity1TokenIds = parseGraph.getNamedEntityTokenIds([e1Id])
    entity2TokenIds = parseGraph.getNamedEntityTokenIds([e2Id])

    binaryPaths = parseGraph.buildBinaryPaths(entity1TokenIds, entity2TokenIds)
    shortestPaths = ParseGraph.getShortestPaths(binaryPaths)

    parseGraph.setAllDependencyWeights(baseWeight)
    parseGraph.setDependencyWeightsByPath(shortestPaths, shortestPathWeight)
    if weight_by_distance:
        parseGraph.reduceWeightByDistance(shortestPathWeight, reductionFactor)
    parseGraph.setPPIPrefixForDependencies("sp", shortestPathWeight)  # shortest path prefix

    parseGraph.maskNames(e1Id, e2Id)
    parseGraph.addPositionTags(entity1TokenIds, entity2TokenIds)

    output = 1. if pairElement.get("interaction") == "True" else -1.

    # `directed` is passed positionally as the second argument, in the same
    # position buildAdjacencyMatrix() in this module passes m.directed.
    adjMatrix, labels = parseGraph.buildAdjacencyMatrix(floattype, directed)
    return adjMatrix, labels, output
def buildAdjacencyMatrix(tokenElements, dependencyElements, entityElements, metamapElements, pairElement, matrixSettings):
    """
    Build the weighted adjacency matrix and the class label for one
    candidate entity pair, driven by the given matrix settings.

    Parameters
    ----------
    tokenElements : cElementTree.Element
        List of <token> elements, which represent all the tokens of the
        sentence. Each <token> contains an ID, the text, POS tag and
        character offset of the token.
    dependencyElements : cElementTree.Element
        List of <dependency> elements, which represent all the
        dependencies of the (dependency-parsed) sentence. Each
        <dependency> contains an ID, a source, a target and dependency
        type.
    entityElements : cElementTree.Element
        List of <entity> elements, which represent protein mentions of
        the sentence. Each <entity> contains an ID, the text and the
        character offsets of the mention.
    metamapElements : iterable of elements, or None
        Each element must provide a "tokenid" attribute and a
        comma-separated "basecodes" attribute. There is code to extract
        metamappings from an analysis XML file (in
        GraphMatrices.build_sentence_dict), but there are no example
        corpora which have these attributes!
    pairElement : cElementTree.Element
        a <pair> element, which contains an ID, the IDs of both entities
        and truth value stating whether there's an interaction between
        the two.
    matrixSettings : MatrixSettings
        contains settings for creating an adjacency matrix

    Returns
    -------
    matrix_tuple : tuple of (adjMatrix, labels, output)
        adjMatrix, labels :
            the two values returned by parseGraph.buildAdjacencyMatrix(
            floattype, m.directed, m.linearOrderWeight).
        output : float
            1.0 if the pair's "interaction" attribute equals the string
            "True", otherwise -1.0.

    Notes
    -----
    Writes a message to stderr and terminates the process with
    sys.exit(1) when matrixSettings.tokenPPIText is neither
    MatrixSettings.ppiTexts.full nor MatrixSettings.ppiTexts.stem.
    """
    m = matrixSettings

    # Punctuation dependencies are mostly junk
    dependencyElements = removeDependencies(dependencyElements, m.removeDependencies)
    parseGraph = ParseGraph.ParseGraph(tokenElements, dependencyElements, m.mergeDependencies)
    parseGraph.shortestPathMethod = "dijkstra"
    parseGraph.markNamedEntities(entityElements)

    e1Id = pairElement.get("e1")
    e2Id = pairElement.get("e2")
    entity1TokenIds = parseGraph.getNamedEntityTokenIds([e1Id])
    entity2TokenIds = parseGraph.getNamedEntityTokenIds([e2Id])
    interactionWordTokenIds = parseGraph.getTokenIdsByText(m.interactionWords, False)

    # Give dependencies base weights
    parseGraph.setAllDependencyWeights(m.depBaseWeight)

    # Set dependencies' weights based on paths
    for style in ParseGraph.splitPathStyles(m.paths):
        isDirected = style["direction"] == "directed"
        paths = []
        if style["type"] == "binary":
            paths = parseGraph.buildBinaryPaths(
                entity1TokenIds, entity2TokenIds,
                style["length"], isDirected, m.pathTimeout)
        elif "tertiary" in style["type"]:
            paths = parseGraph.buildTertiaryPaths(
                entity1TokenIds, interactionWordTokenIds, entity2TokenIds,
                style["type"] == "closest_tertiary",
                style["length"], isDirected, m.pathTimeout)
            # Guard before touching the result: the None check below shows
            # that path building may yield None, and len(None) would raise.
            if paths is not None:
                if m.markInteractionWords == MatrixSettings.markedInteractionWords.fromTertiaryPaths:
                    parseGraph.setPPIInteractionWords(paths)
                # Keep only element 0 of every entry (presumably the path
                # itself -- TODO confirm). Done in place, as before.
                paths[:] = [entry[0] for entry in paths]
        if paths is not None:
            parseGraph.setDependencyWeightsByPath(paths, style["weight"])

    # Reduce dependencies' weights by distance from threshold
    if m.weightByDistance:
        parseGraph.reduceWeightByDistance(m.depWeightReductionThreshold, m.depWeightReductionFactor)

    # Set dependency prefixes
    if m.depPrefixThreshold > 0.0:
        parseGraph.setPPIPrefixForDependencies(m.depPrefix, m.depPrefixThreshold)  # f.e. shortest path prefix

    # Set token texts
    if m.tokenPPIText == MatrixSettings.ppiTexts.full:
        parseGraph.ppiTextFromOriginalText()
    elif m.tokenPPIText == MatrixSettings.ppiTexts.stem:
        parseGraph.ppiTextFromStems()  # FIXME: not implemented!
    else:
        # Same bytes as the former print statement, but valid on both
        # Python 2 and Python 3.
        sys.stderr.write("Illegal ppiText setting %s\n" % (m.tokenPPIText,))
        sys.exit(1)

    # Add metamap codes
    if metamapElements is not None:
        metamapDict = {}
        for metamapElement in metamapElements:
            metamapDict[metamapElement.get("tokenid")] = metamapElement.get("basecodes").split(",")
        parseGraph.addMetamapCodes(metamapDict)

    if m.maskPPIText:
        parseGraph.maskNames(e1Id, e2Id)
    if m.tokenPositionTags:
        parseGraph.addPositionTags(entity1TokenIds, entity2TokenIds)

    output = 1. if pairElement.get("interaction") == "True" else -1.
    adjMatrix, labels = parseGraph.buildAdjacencyMatrix(floattype, m.directed, m.linearOrderWeight)
    return adjMatrix, labels, output
def buildAdjacencyMatrix(tokenElements, dependencyElements, entityElements, metamapElements, pairElement, matrixSettings):
    """
    Build the weighted adjacency matrix and the class label for one
    candidate entity pair, driven by the given matrix settings.

    Parameters
    ----------
    tokenElements : cElementTree.Element
        List of <token> elements, which represent all the tokens of the
        sentence. Each <token> contains an ID, the text, POS tag and
        character offset of the token.
    dependencyElements : cElementTree.Element
        List of <dependency> elements, which represent all the
        dependencies of the (dependency-parsed) sentence. Each
        <dependency> contains an ID, a source, a target and dependency
        type.
    entityElements : cElementTree.Element
        List of <entity> elements, which represent protein mentions of
        the sentence. Each <entity> contains an ID, the text and the
        character offsets of the mention.
    metamapElements : iterable of elements, or None
        Each element must provide a "tokenid" attribute and a
        comma-separated "basecodes" attribute. There is code to extract
        metamappings from an analysis XML file (in
        GraphMatrices.build_sentence_dict), but there are no example
        corpora which have these attributes!
    pairElement : cElementTree.Element
        a <pair> element, which contains an ID, the IDs of both entities
        and truth value stating whether there's an interaction between
        the two.
    matrixSettings : MatrixSettings
        contains settings for creating an adjacency matrix

    Returns
    -------
    matrix_tuple : tuple of (adjMatrix, labels, output)
        adjMatrix, labels :
            the two values returned by parseGraph.buildAdjacencyMatrix(
            floattype, m.directed, m.linearOrderWeight).
        output : float
            1.0 if the pair's "interaction" attribute equals the string
            "True", otherwise -1.0.

    Notes
    -----
    Writes a message to stderr and terminates the process with
    sys.exit(1) when matrixSettings.tokenPPIText is neither
    MatrixSettings.ppiTexts.full nor MatrixSettings.ppiTexts.stem.
    """
    m = matrixSettings

    # Punctuation dependencies are mostly junk
    dependencyElements = removeDependencies(dependencyElements, m.removeDependencies)
    parseGraph = ParseGraph.ParseGraph(tokenElements, dependencyElements, m.mergeDependencies)
    parseGraph.shortestPathMethod = "dijkstra"
    parseGraph.markNamedEntities(entityElements)

    e1Id = pairElement.get("e1")
    e2Id = pairElement.get("e2")
    entity1TokenIds = parseGraph.getNamedEntityTokenIds([e1Id])
    entity2TokenIds = parseGraph.getNamedEntityTokenIds([e2Id])
    interactionWordTokenIds = parseGraph.getTokenIdsByText(m.interactionWords, False)

    # Give dependencies base weights
    parseGraph.setAllDependencyWeights(m.depBaseWeight)

    # Set dependencies' weights based on paths
    for style in ParseGraph.splitPathStyles(m.paths):
        isDirected = style["direction"] == "directed"
        paths = []
        if style["type"] == "binary":
            paths = parseGraph.buildBinaryPaths(
                entity1TokenIds, entity2TokenIds,
                style["length"], isDirected, m.pathTimeout)
        elif "tertiary" in style["type"]:
            paths = parseGraph.buildTertiaryPaths(
                entity1TokenIds, interactionWordTokenIds, entity2TokenIds,
                style["type"] == "closest_tertiary",
                style["length"], isDirected, m.pathTimeout)
            # Guard before touching the result: the None check below shows
            # that path building may yield None, and len(None) would raise.
            if paths is not None:
                if m.markInteractionWords == MatrixSettings.markedInteractionWords.fromTertiaryPaths:
                    parseGraph.setPPIInteractionWords(paths)
                # Keep only element 0 of every entry (presumably the path
                # itself -- TODO confirm). Done in place, as before.
                paths[:] = [entry[0] for entry in paths]
        if paths is not None:
            parseGraph.setDependencyWeightsByPath(paths, style["weight"])

    # Reduce dependencies' weights by distance from threshold
    if m.weightByDistance:
        parseGraph.reduceWeightByDistance(m.depWeightReductionThreshold, m.depWeightReductionFactor)

    # Set dependency prefixes
    if m.depPrefixThreshold > 0.0:
        parseGraph.setPPIPrefixForDependencies(m.depPrefix, m.depPrefixThreshold)  # f.e. shortest path prefix

    # Set token texts
    if m.tokenPPIText == MatrixSettings.ppiTexts.full:
        parseGraph.ppiTextFromOriginalText()
    elif m.tokenPPIText == MatrixSettings.ppiTexts.stem:
        parseGraph.ppiTextFromStems()  # FIXME: not implemented!
    else:
        # Same bytes as the former print statement, but valid on both
        # Python 2 and Python 3.
        sys.stderr.write("Illegal ppiText setting %s\n" % (m.tokenPPIText,))
        sys.exit(1)

    # Add metamap codes
    if metamapElements is not None:
        metamapDict = {}
        for metamapElement in metamapElements:
            metamapDict[metamapElement.get("tokenid")] = metamapElement.get("basecodes").split(",")
        parseGraph.addMetamapCodes(metamapDict)

    if m.maskPPIText:
        parseGraph.maskNames(e1Id, e2Id)
    if m.tokenPositionTags:
        parseGraph.addPositionTags(entity1TokenIds, entity2TokenIds)

    output = 1. if pairElement.get("interaction") == "True" else -1.
    adjMatrix, labels = parseGraph.buildAdjacencyMatrix(floattype, m.directed, m.linearOrderWeight)
    return adjMatrix, labels, output