예제 #1
0
def buildAdjacencyMatrixWithShortestPaths(tokenElements, dependencyElements,
                                          entityElements, pairElement,
                                          directed=True,
                                          weight_by_distance=False):
    """Build the adjacency matrix, its labels and the class value of a pair.

    A ParseGraph is created from the tokens and from the dependencies
    that remain after removeDependencies(..., ["punct"]). The graph is
    then prepared with fixed settings (see the calls below) before its
    adjacency matrix is built.

    Parameters
    ----------
    tokenElements
        Handed unchanged to the ParseGraph constructor.
    dependencyElements
        Filtered through removeDependencies with ["punct"], then handed
        to the ParseGraph constructor.
    entityElements
        Handed to parseGraph.markNamedEntities.
    pairElement
        Object offering get(); its "e1", "e2" and "interaction" values
        are read.
    directed
        Accepted for interface compatibility; it is not used anywhere in
        this function.
    weight_by_distance
        When true, reduceWeightByDistance(0.9, 0.5) is called on the
        graph after the path weights have been set.

    Returns
    -------
    tuple
        (adjMatrix, labels, output). adjMatrix and labels come from
        parseGraph.buildAdjacencyMatrix(floattype); output is 1.0 when
        the pair's "interaction" value equals the string "True" and
        -1.0 otherwise.
    """
    # Punctuation dependencies are mostly junk
    keptDependencies = removeDependencies(dependencyElements, ["punct"])

    graph = ParseGraph.ParseGraph(tokenElements, keptDependencies)
    graph.markNamedEntities(entityElements)

    firstId, secondId = pairElement.get("e1"), pairElement.get("e2")
    firstTokenIds = graph.getNamedEntityTokenIds([firstId])
    secondTokenIds = graph.getNamedEntityTokenIds([secondId])

    # Paths between the two entities; the shortest ones get the high weight.
    entityPaths = graph.buildBinaryPaths(firstTokenIds, secondTokenIds)
    shortest = ParseGraph.getShortestPaths(entityPaths)
    graph.setAllDependencyWeights(0.3)
    graph.setDependencyWeightsByPath(shortest, 0.9)
    if weight_by_distance:
        graph.reduceWeightByDistance(0.9, 0.5)

    graph.setPPIPrefixForDependencies("sp", 0.9)  # shortest path prefix
    graph.maskNames(firstId, secondId)
    graph.addPositionTags(firstTokenIds, secondTokenIds)

    output = 1. if pairElement.get("interaction") == "True" else -1.

    matrix, matrixLabels = graph.buildAdjacencyMatrix(floattype)
    return matrix, matrixLabels, output
예제 #2
0
def buildAdjacencyMatrixWithShortestPaths(tokenElements, dependencyElements, entityElements, pairElement, directed = True, weight_by_distance = False):
    """
    Return (adjMatrix, labels, output) for one entity pair of a sentence.

    The dependencies are first passed through removeDependencies with
    ["punct"]; a ParseGraph built from the tokens and the remaining
    dependencies is then configured with fixed weights and asked for its
    adjacency matrix.

    tokenElements and dependencyElements feed the ParseGraph constructor,
    entityElements is given to markNamedEntities, and pairElement is
    queried via get() for "e1", "e2" and "interaction".

    directed is accepted but never read in this function.
    weight_by_distance, when true, triggers
    reduceWeightByDistance(0.9, 0.5) on the graph.

    output is 1.0 if pairElement.get("interaction") equals the string
    "True", otherwise -1.0.
    """
    baseWeight = 0.3
    pathWeight = 0.9

    # Punctuation dependencies are mostly junk
    filteredDependencies = removeDependencies(dependencyElements, ["punct"])

    pg = ParseGraph.ParseGraph(tokenElements, filteredDependencies)
    pg.markNamedEntities(entityElements)

    idOfE1 = pairElement.get("e1")
    idOfE2 = pairElement.get("e2")
    tokensOfE1 = pg.getNamedEntityTokenIds([idOfE1])
    tokensOfE2 = pg.getNamedEntityTokenIds([idOfE2])

    # Weight everything with the base value, then raise the shortest paths.
    shortestPaths = ParseGraph.getShortestPaths(pg.buildBinaryPaths(tokensOfE1, tokensOfE2))
    pg.setAllDependencyWeights(baseWeight)
    pg.setDependencyWeightsByPath(shortestPaths, pathWeight)
    if weight_by_distance:
        pg.reduceWeightByDistance(pathWeight, 0.5)
    pg.setPPIPrefixForDependencies("sp", pathWeight) # shortest path prefix
    pg.maskNames(idOfE1, idOfE2)
    pg.addPositionTags(tokensOfE1, tokensOfE2)

    isInteraction = pairElement.get("interaction") == "True"
    output = 1. if isInteraction else -1.

    adjMatrix, labels = pg.buildAdjacencyMatrix(floattype)
    return adjMatrix, labels, output
예제 #3
0
def buildAdjacencyMatrix(tokenElements, dependencyElements,
        entityElements, metamapElements, pairElement, matrixSettings):
    """
    Build the adjacency matrix, its labels and the class value of one pair.

    Parameters
    ----------
    tokenElements : cElementTree.Element
        List of <token> elements, which represent all the tokens of the
        sentence. Each <token> contains an ID, the text, POS tag and
        character offset of the token.
    dependencyElements : cElementTree.Element
        List of <dependency> elements, which represent all the
        dependencies of the (dependency-parsed) sentence. Each
        <dependency> contains an ID, a source, a target and dependency
        type.
    entityElements : cElementTree.Element
        List of <entity> elements, which represent protein mentions of
        the sentence. Each <entity> contains an ID, the text and the
        character offsets of the mention.
    metamapElements : iterable of elements, or None
        When not None, the "tokenid" and the comma-separated "basecodes"
        attribute of every element are collected and handed to
        parseGraph.addMetamapCodes.
        There is code to extract metamappings from an analysis XML file
        (in GraphMatrices.build_sentence_dict), but there are no example
        corpora which have these attributes!
    pairElement : cElementTree.Element
        a <pair> element, which contains an ID, the IDs of both
        entities and truth value stating whether there's an interaction
        between the two.
    matrixSettings : MatrixSettings
        contains settings for creating an adjacency matrix

    Returns
    -------
    matrix_tuple : tuple of (adjMatrix, labels, output)
        adjMatrix and labels are the two values returned by
        parseGraph.buildAdjacencyMatrix; output is 1.0 if the pair's
        "interaction" attribute equals the string "True", else -1.0.

    Notes
    -----
    If matrixSettings.tokenPPIText is neither ppiTexts.full nor
    ppiTexts.stem, a message is written to stderr and sys.exit(1) is
    called.
    """
    m = matrixSettings

    # Drop the dependencies selected by the settings
    # (punctuation dependencies are mostly junk)
    dependencyElements = removeDependencies(dependencyElements, m.removeDependencies)

    parseGraph = ParseGraph.ParseGraph(tokenElements, dependencyElements, m.mergeDependencies)
    parseGraph.shortestPathMethod = "dijkstra"
    parseGraph.markNamedEntities(entityElements)
    e1Id = pairElement.get("e1")
    e2Id = pairElement.get("e2")
    entity1TokenIds = parseGraph.getNamedEntityTokenIds( [e1Id] )
    entity2TokenIds = parseGraph.getNamedEntityTokenIds( [e2Id] )
    interactionWordTokenIds = parseGraph.getTokenIdsByText(m.interactionWords, False)

    # Give dependencies base weights
    parseGraph.setAllDependencyWeights(m.depBaseWeight)

    # Set dependencies' weights based on paths
    pathStyles = ParseGraph.splitPathStyles(m.paths)
    for style in pathStyles:
        paths = []
        directed = style["direction"] == "directed"
        if style["type"] == "binary":
            paths = parseGraph.buildBinaryPaths(entity1TokenIds, entity2TokenIds, style["length"], directed, m.pathTimeout)
        elif "tertiary" in style["type"]:
            paths = parseGraph.buildTertiaryPaths(entity1TokenIds, interactionWordTokenIds, entity2TokenIds, style["type"]=="closest_tertiary", style["length"], directed, m.pathTimeout)
            # The path builders may hand back None (the check further down
            # allows for it; presumably on timeout - TODO confirm).
            # Post-process only real results, so that a None result is
            # skipped instead of raising a TypeError in len().
            if paths is not None:
                if m.markInteractionWords == MatrixSettings.markedInteractionWords.fromTertiaryPaths:
                    parseGraph.setPPIInteractionWords(paths)
                # Keep only the first component of every tertiary path
                # entry; the list is updated in place.
                for i, path in enumerate(paths):
                    paths[i] = path[0]
        if paths is not None:
            parseGraph.setDependencyWeightsByPath(paths, style["weight"])

    # Reduce dependencies' weights by distance from threshold
    if m.weightByDistance:
        parseGraph.reduceWeightByDistance(m.depWeightReductionThreshold, m.depWeightReductionFactor)
    # Set dependency prefixes
    if m.depPrefixThreshold > 0.0:
        parseGraph.setPPIPrefixForDependencies(m.depPrefix, m.depPrefixThreshold) # f.e. shortest path prefix

    # Set token texts
    if m.tokenPPIText == MatrixSettings.ppiTexts.full:
        parseGraph.ppiTextFromOriginalText()
    elif m.tokenPPIText == MatrixSettings.ppiTexts.stem:
        parseGraph.ppiTextFromStems() # FIXME: not implemented!
    else:
        # sys.stderr.write behaves the same on Python 2 and 3, unlike the
        # "print >>" statement form.
        sys.stderr.write("Illegal ppiText setting %s\n" % (m.tokenPPIText,))
        sys.exit(1)

    # Add metamap codes
    if metamapElements is not None:
        metamapDict = {}
        for metamapElement in metamapElements:
            metamapDict[metamapElement.get("tokenid")] = metamapElement.get("basecodes").split(",")
        parseGraph.addMetamapCodes(metamapDict)

    if m.maskPPIText:
        parseGraph.maskNames(e1Id, e2Id)
    if m.tokenPositionTags:
        parseGraph.addPositionTags(entity1TokenIds, entity2TokenIds)

    output = 1. if pairElement.get("interaction") == "True" else -1.

    adjMatrix, labels = parseGraph.buildAdjacencyMatrix(floattype, m.directed, m.linearOrderWeight)
    return adjMatrix, labels, output
예제 #4
0
def buildAdjacencyMatrix(tokenElements, dependencyElements, entityElements,
                         metamapElements, pairElement, matrixSettings):
    """
    Build the adjacency matrix, its labels and the class value of one pair.

    Parameters
    ----------
    tokenElements : cElementTree.Element
        List of <token> elements, which represent all the tokens of the
        sentence. Each <token> contains an ID, the text, POS tag and
        character offset of the token.
    dependencyElements : cElementTree.Element
        List of <dependency> elements, which represent all the
        dependencies of the (dependency-parsed) sentence. Each
        <dependency> contains an ID, a source, a target and dependency
        type.
    entityElements : cElementTree.Element
        List of <entity> elements, which represent protein mentions of
        the sentence. Each <entity> contains an ID, the text and the
        character offsets of the mention.
    metamapElements : iterable of elements, or None
        When not None, the "tokenid" and the comma-separated "basecodes"
        attribute of every element are collected and handed to
        parseGraph.addMetamapCodes.
        There is code to extract metamappings from an analysis XML file
        (in GraphMatrices.build_sentence_dict), but there are no example
        corpora which have these attributes!
    pairElement : cElementTree.Element
        a <pair> element, which contains an ID, the IDs of both
        entities and truth value stating whether there's an interaction
        between the two.
    matrixSettings : MatrixSettings
        contains settings for creating an adjacency matrix

    Returns
    -------
    matrix_tuple : tuple of (adjMatrix, labels, output)
        adjMatrix and labels are the two values returned by
        parseGraph.buildAdjacencyMatrix; output is 1.0 if the pair's
        "interaction" attribute equals the string "True", else -1.0.

    Notes
    -----
    If matrixSettings.tokenPPIText is neither ppiTexts.full nor
    ppiTexts.stem, a message is written to stderr and sys.exit(1) is
    called.
    """
    m = matrixSettings

    # Drop the dependencies selected by the settings
    # (punctuation dependencies are mostly junk)
    dependencyElements = removeDependencies(dependencyElements,
                                            m.removeDependencies)

    parseGraph = ParseGraph.ParseGraph(tokenElements, dependencyElements,
                                       m.mergeDependencies)
    parseGraph.shortestPathMethod = "dijkstra"
    parseGraph.markNamedEntities(entityElements)
    e1Id = pairElement.get("e1")
    e2Id = pairElement.get("e2")
    entity1TokenIds = parseGraph.getNamedEntityTokenIds([e1Id])
    entity2TokenIds = parseGraph.getNamedEntityTokenIds([e2Id])
    interactionWordTokenIds = parseGraph.getTokenIdsByText(
        m.interactionWords, False)

    # Give dependencies base weights
    parseGraph.setAllDependencyWeights(m.depBaseWeight)

    # Set dependencies' weights based on paths
    pathStyles = ParseGraph.splitPathStyles(m.paths)
    for style in pathStyles:
        paths = []
        if style["type"] == "binary":
            paths = parseGraph.buildBinaryPaths(
                entity1TokenIds, entity2TokenIds, style["length"],
                style["direction"] == "directed", m.pathTimeout)
        elif "tertiary" in style["type"]:
            paths = parseGraph.buildTertiaryPaths(
                entity1TokenIds, interactionWordTokenIds, entity2TokenIds,
                style["type"] == "closest_tertiary", style["length"],
                style["direction"] == "directed", m.pathTimeout)
            # A None result is tolerated by the check below (presumably
            # it signals a timeout - TODO confirm), so it must also be
            # tolerated here: without this guard len(None) raises a
            # TypeError before that check is ever reached.
            if paths is not None:
                markFromPaths = (
                    m.markInteractionWords ==
                    MatrixSettings.markedInteractionWords.fromTertiaryPaths)
                if markFromPaths:
                    parseGraph.setPPIInteractionWords(paths)
                # Replace every entry by its first component, in place.
                for i, path in enumerate(paths):
                    paths[i] = path[0]
        if paths is not None:
            parseGraph.setDependencyWeightsByPath(paths, style["weight"])

    # Reduce dependencies' weights by distance from threshold
    if m.weightByDistance:
        parseGraph.reduceWeightByDistance(m.depWeightReductionThreshold,
                                          m.depWeightReductionFactor)
    # Set dependency prefixes
    if m.depPrefixThreshold > 0.0:
        parseGraph.setPPIPrefixForDependencies(
            m.depPrefix, m.depPrefixThreshold)  # f.e. shortest path prefix

    # Set token texts
    if m.tokenPPIText == MatrixSettings.ppiTexts.full:
        parseGraph.ppiTextFromOriginalText()
    elif m.tokenPPIText == MatrixSettings.ppiTexts.stem:
        parseGraph.ppiTextFromStems()  # FIXME: not implemented!
    else:
        # Written with sys.stderr.write so the line works on Python 2 and
        # Python 3 alike; the emitted text is unchanged.
        sys.stderr.write(
            "Illegal ppiText setting %s\n" % (m.tokenPPIText,))
        sys.exit(1)

    # Add metamap codes
    if metamapElements is not None:
        metamapDict = {}
        for metamapElement in metamapElements:
            metamapDict[metamapElement.get("tokenid")] = metamapElement.get(
                "basecodes").split(",")
        parseGraph.addMetamapCodes(metamapDict)

    if m.maskPPIText:
        parseGraph.maskNames(e1Id, e2Id)
    if m.tokenPositionTags:
        parseGraph.addPositionTags(entity1TokenIds, entity2TokenIds)

    output = 1. if pairElement.get("interaction") == "True" else -1.

    adjMatrix, labels = parseGraph.buildAdjacencyMatrix(
        floattype, m.directed, m.linearOrderWeight)
    return adjMatrix, labels, output