Exemplo n.º 1
0
 def __init__(self):
     """Set up an empty PhyloTree and the phyloxml-tag lookup table."""
     super(Phyloxml_Parser, self).__init__()
     self.phyloTree = PhyloTree()
     # (phyloxml tag, keyword name used when making a node) pairs; "clade"
     # is a known tag but maps to an empty keyword name.
     tagPairs = (
         ("clade", ""),
         ("name", "name"),
         ("branch_length", "length"),
         ("confidence", "bootstrap"),
         ("events", "events"),
     )
     self.tagsOfInterest = dict(tagPairs)
Exemplo n.º 2
0
    def _parseNewickToJson(self, newickString, treeName=None, nameMap=None):
        """Turn a newick string into a fresh PhyloTree and return its jsonable dict.

        The string is cleaned first; when nameMap is truthy, the terms in the
        cleaned string are replaced through it before parsing.
        """
        self.phyloTree = PhyloTree()
        cleaned = self.cleanNewickString(newickString)
        toParse = self._mapName(cleaned, nameMap) if nameMap else cleaned

        self.phyloTree.root = self.parseNode(toParse, 0)
        # NOTE(review): the tree name is attached only when a nameMap was
        # supplied, not whenever treeName is given -- confirm this is intended.
        if nameMap:
            rootAttributes = {"treeName": treeName}
            self.phyloTree.addAttributesToRoot(rootAttributes)

        jsonable = self.phyloTree.generateJsonableDict()
        return jsonable
Exemplo n.º 3
0
 def __init__(self):
     """Set up an empty PhyloTree and the phyloxml-tag lookup table."""
     super(Phyloxml_Parser, self).__init__()
     self.phyloTree = PhyloTree()
     # (phyloxml tag, keyword name used when making a node) pairs; "clade"
     # is a known tag but maps to an empty keyword name.
     tagPairs = (
         ("clade", ""),
         ("name", "name"),
         ("branch_length", "length"),
         ("confidence", "bootstrap"),
         ("events", "events"),
     )
     self.tagsOfInterest = dict(tagPairs)
Exemplo n.º 4
0
    def _parseNewickToJson(self, newickString, treeName=None, nameMap=None):
        """Turn a newick string into a fresh PhyloTree and return its jsonable dict.

        The string is cleaned first; when nameMap is truthy, the terms in the
        cleaned string are replaced through it before parsing.
        """
        self.phyloTree = PhyloTree()
        cleaned = self.cleanNewickString(newickString)
        toParse = self._mapName(cleaned, nameMap) if nameMap else cleaned

        self.phyloTree.root = self.parseNode(toParse, 0)
        # NOTE(review): the tree name is attached only when a nameMap was
        # supplied, not whenever treeName is given -- confirm this is intended.
        if nameMap:
            rootAttributes = {"treeName": treeName}
            self.phyloTree.addAttributesToRoot(rootAttributes)

        jsonable = self.phyloTree.generateJsonableDict()
        return jsonable
Exemplo n.º 5
0
class Newick_Parser(Base_Parser):
    """Parser for trees stored in the newick format (.nhx).

    It is more complex than a plain newick parser needs to be because it is
    later extended by the Nexus parser, which reuses it for newick strings.
    """

    def __init__(self):
        super(Newick_Parser, self).__init__()

    def parseFile(self, filePath):
        """Read the newick file at filePath and parse the string inside.

        Returns a tuple ([jsonableDict], "Success").
        """
        with open(filePath, "r") as newickFile:
            newickString = newickFile.read()
        newickString = newickString.replace("\n", "").replace("\r", "")
        return [self.parseData(newickString)], "Success"

    def parseData(self, newickString):
        """Parse a newick string directly. Returns: jsonableDict"""
        return self._parseNewickToJson(newickString)

    def _parseNewickToJson(self, newickString, treeName=None, nameMap=None):
        """Parse a newick representation of a tree into a PhyloTree data
        structure and return the dict produced by generateJsonableDict().

        When nameMap is truthy, the terms of the cleaned string are replaced
        through it before parsing.
        """
        self.phyloTree = PhyloTree()
        newickString = self.cleanNewickString(newickString)
        if nameMap:
            newickString = self._mapName(newickString, nameMap)

        self.phyloTree.root = self.parseNode(newickString, 0)
        # NOTE(review): treeName is attached only when a nameMap was supplied,
        # not whenever treeName is given -- confirm this is intended.
        if nameMap:
            self.phyloTree.addAttributesToRoot({"treeName": treeName})

        return self.phyloTree.generateJsonableDict()

    def cleanNewickString(self, rawNewick):
        """Return rawNewick without whitespace, semicolons, single or double
        quotes and backslashes."""
        return re.sub(r'\s|;|\"|\'|\\', '', rawNewick)

    def _makeNodesFromString(self, string, depth):
        """Make one node per non-empty, comma-separated element of string.

        Each element is either "label" or "label:length". A label that parses
        as a number is treated as a bootstrap value (Nexus may put bootstrap
        values in the name position): values in [0, 1] are used as is, values
        in (1, 100] are divided by 100, and the node name becomes "".

        Returns the list of created nodes (empty elements are skipped).
        Raises Exception if string still contains an opening bracket.
        """
        if "(" in string:
            raise Exception("Tree is not well form, location: " + string)

        childrenNodes = []
        for childString in string.split(","):
            if not childString:
                continue
            nodeInfo = childString.split(":")
            name, length, bootstrap = nodeInfo[0], None, -1
            if len(nodeInfo) == 2:  # has length info
                length = nodeInfo[1]
                try:
                    value = float(name)
                except ValueError:
                    pass  # an ordinary, non-numeric label: keep it as the name
                else:
                    if 0 <= value <= 1:
                        bootstrap = value
                    elif 1 <= value <= 100:
                        bootstrap = value / 100
                    name = ""
            node = self.phyloTree.makeNode(name,
                                           length=length,
                                           depth=depth,
                                           bootstrap=bootstrap)
            childrenNodes.append(node)
        return childrenNodes

    def _mapName(self, newickString, nameMap):
        """Replace the terms of a newick string with their nameMap values.

        A term starts right after a "(" or "," (unless the next symbol is
        again "," or "(") and ends at the first ")", ":" or ",". Necessary to
        replace names of terms inside a nexus representation. Per the original
        note, it lives here because "Mailaud's" (presumably an upstream
        parser -- unverified) does not deal with id strings outside of quotes.

        Raises KeyError if a term is missing from nameMap.
        """
        termEnd = re.compile(r"[):,]")
        total = len(newickString)
        pieces = []  # joined once at the end instead of repeated concatenation
        start = 0

        for i, symbol in enumerate(newickString):
            if symbol != "(" and symbol != ",":
                continue
            termStart = i + 1
            if termStart < total and newickString[termStart] in ",(":
                continue  # no term directly follows this symbol
            # termStart is where the term to be replaced begins; the first
            # closing symbol at or after it marks where the term ends.
            match = termEnd.search(newickString, termStart)
            if match:
                j = match.start()
                pieces.append(newickString[start:termStart])
                pieces.append(nameMap[newickString[termStart:j]])
                start = j

        pieces.append(newickString[start:])
        return "".join(pieces)

    def parseNode(self, string, depth):
        """
        Recursive method for parsing a newick string. It works by stripping
        the string down into the substrings contained within brackets, which
        are used to call itself.

        Eg ... ( A, B, (D, E)C, F, G ) ...

        We make the preceding nodes A, B first, then the internal node C with
        its children D, E, and finally the succeeding nodes F, G.

        Returns None for an empty string, a single node when depth is 0 and
        the string contains brackets, and a list of nodes otherwise.
        """
        # Base case where there is only an empty string
        if string == "":
            return
        # Base case where there is no internal clade
        if "(" not in string:
            return self._makeNodesFromString(string, depth)

        nodes = []  # the nodes on this level, in order
        start = 0
        lenOfPreceedingInternalNodeString = 0
        bracketStack = []  # positions of the brackets that are still open

        for j, symbol in enumerate(string):
            if symbol == "(":
                bracketStack.append(j)
                continue
            if symbol != ")":
                continue

            i = bracketStack.pop()
            if bracketStack:
                continue  # nested pair; handled by the recursive call

            # string[i:j + 1] is a clade that is a child of the current node
            internalNode = None

            # First a flat call to make the nodes of the same depth that
            # precede the clade.
            startSubstring = string[
                start + lenOfPreceedingInternalNodeString:i]
            nodes += self._makeNodesFromString(startSubstring, depth)

            # Then look for the label of the internal node right after the
            # closing bracket.
            if j + 1 < len(string):
                # Eg. '(b:0.4,a:0.3)c:0.3', stringRightOfBracket = 'c:0.3'
                stringRightOfBracket = string[j + 1:]
                match = re.search(r"[\)\,\(]", stringRightOfBracket)
                if match:
                    stringRepOfInternalNode = stringRightOfBracket[
                        :match.start()]
                    internalNodes = self._makeNodesFromString(
                        stringRepOfInternalNode, depth)
                    if internalNodes:
                        internalNode = internalNodes[0]
                    lenOfPreceedingInternalNodeString = len(
                        stringRepOfInternalNode)
                else:  # the label can be the last element of the string
                    internalNode = self._makeNodesFromString(
                        stringRightOfBracket, depth)[0]
                    lenOfPreceedingInternalNodeString = len(string) - j
            if internalNode is None:  # generic node for an unnamed clade
                internalNode = self.phyloTree.makeNode(
                    "", depth=depth, isInternal=True)
                lenOfPreceedingInternalNodeString = 0

            # recursive call to make the content of the internal clade
            childSubString = string[i + 1:j]
            internalNode.addChildNode(
                self.parseNode(childSubString, depth + 1))

            # the internal node is appended last to preserve order
            nodes.append(internalNode)

            start = j + 1

        if depth == 0:  # the root node: return it as is
            return nodes[0]

        # Adding the last set of children
        endString = string[start:]
        # If the text right after the last clade is the label of the internal
        # node created above, skip it so it is not parsed a second time.
        if string[start - 1] == ")":
            match = re.search(r"[\)\,\(]", endString)
            if match:
                endOfNodeName = start + match.start() + 1
                endString = string[endOfNodeName:]
                nodes += self._makeNodesFromString(endString, depth)

        return nodes
Exemplo n.º 6
0
class Phyloxml_Parser(Base_Parser):
    """Parses a phyloxml file into a jsonable dict that will be passed to PhyloViz for display"""

    def __init__(self):
        super(Phyloxml_Parser, self).__init__()
        self.phyloTree = PhyloTree()
        # Maps the phyloxml tags we care about to the keyword names handed to
        # PhyloTree.makeNode.
        self.tagsOfInterest = {
            "clade": "",
            "name": "name",
            "branch_length": "length",
            "confidence": "bootstrap",
            "events": "events"
        }

    def parseFile(self, filePath):
        """Parse the phyloxml file at filePath and extract its phylogeny tree.

        The first child of the xml root is searched for a "name" element
        (stored as the tree title) and a "clade" element (the tree root; the
        last one wins if there are several).

        Returns a tuple ([jsonableDict], "Success").
        Raises AttributeError if no top-level clade element is found.
        """
        # The context manager closes the file even if parsing raises.
        with open(filePath, "r") as phyloXmlFile:
            xmlTree = ElementTree.parse(phyloXmlFile)
        xmlRoot = xmlTree.getroot()[0]
        # used later by cleanTag to remove the name space in every element.tag
        self.nameSpaceIndex = xmlRoot.tag.rfind("}") + 1

        phyloRoot = None
        for child in xmlRoot:
            childTag = self.cleanTag(child.tag)
            if childTag == "clade":
                phyloRoot = child
            elif childTag == "name":
                self.phyloTree.title = child.text

        self.phyloTree.root = self.parseNode(phyloRoot, 0)
        jsonDict = self.phyloTree.generateJsonableDict()
        return [jsonDict], "Success"

    def parseNode(self, node, depth):
        """Parse any element within a phyloxml tree, looking out for clades,
        which signal the creation of nodes - internal OR leaf.

        Returns None when the element is not a clade, otherwise the node made
        for it (with its child clades parsed recursively).
        """
        tag = self.cleanTag(node.tag)
        if tag != "clade":
            return None

        # A clade that directly contains another clade is an internal node.
        hasInnerClade = any(
            self.cleanTag(child.tag) == "clade" for child in node)

        if hasInnerClade:  # this node is an internal node
            currentNode = self._makeInternalNode(node, depth=depth)
            for child in node:
                childNode = self.parseNode(child, depth + 1)
                # non-clade children come back as None and are skipped
                if isinstance(childNode, Node):
                    currentNode.addChildNode(childNode)
        else:  # this node is a leaf node
            currentNode = self._makeLeafNode(node, depth=depth + 1)

        return currentNode

    def _makeLeafNode(self, leafNode, depth=0):
        """Make a leaf node by calling PhyloTree.makeNode.

        The text of every child whose tag is in tagsOfInterest is passed as a
        keyword argument, together with depth.
        """
        node = {}
        for child in leafNode:
            childTag = self.cleanTag(child.tag)
            if childTag in self.tagsOfInterest:
                # need to map phyloxml terms to ours
                key = self.tagsOfInterest[childTag]
                node[key] = child.text

        node["depth"] = depth
        return self.phyloTree.makeNode(self._getNodeName(leafNode), **node)

    def _getNodeName(self, node, depth=-1):
        """Get the name of a clade, handling the case where a taxonomy
        element is involved. Returns "" when neither a name nor a taxonomy
        child is present. The depth parameter is accepted but not used.
        """

        def getTagFromTaxonomyNode(node):
            """Return the name of a taxonomy element. It has to be treated
            differently because the name is embedded one level deeper."""
            phyloxmlTaxoNames = ("common_name", "scientific_name", "code")
            for child in node:
                childTag = self.cleanTag(child.tag)
                if childTag in phyloxmlTaxoNames:
                    return child.text
            return ""

        nodeName = ""
        # Only the first name/taxonomy child is considered.
        for child in node:
            childTag = self.cleanTag(child.tag)
            if childTag == "name":
                nodeName = child.text
                break
            elif childTag == "taxonomy":
                nodeName = getTagFromTaxonomyNode(child)
                break

        return nodeName

    def _makeInternalNode(self, internalNode, depth=0):
        """Make an internal node from an element that is guaranteed to be a
        parent clade.

        Collects the values of interest (like events) into keyword arguments
        that are passed to PhyloTree.makeNode.
        """
        node = {}
        for child in internalNode:
            childTag = self.cleanTag(child.tag)
            if childTag == "clade":
                continue
            elif childTag in self.tagsOfInterest:
                if childTag == "events":
                    # events is nested one level deeper than the others: the
                    # value is the tag of its first child
                    key, text = "events", self.cleanTag(child[0].tag)
                else:
                    key = self.tagsOfInterest[childTag]
                    text = child.text
                node[key] = text

        # NOTE(review): depth only reaches _getNodeName (which ignores it);
        # unlike leaf nodes, no "depth" keyword is passed to makeNode here --
        # confirm this is intended.
        return self.phyloTree.makeNode(self._getNodeName(internalNode, depth), **node)

    def cleanTag(self, tagString):
        """Return tagString without the name space prefix, whose length was
        recorded by parseFile in nameSpaceIndex."""
        return tagString[self.nameSpaceIndex:]
Exemplo n.º 7
0
class Newick_Parser(Base_Parser):
    """Parser for trees stored in the newick format (.nhx).

    It is more complex than a plain newick parser needs to be because it is
    later extended by the Nexus parser, which reuses it for newick strings.
    """

    def __init__(self):
        super(Newick_Parser, self).__init__()

    def parseFile(self, filePath):
        """Read the newick file at filePath and parse the string inside.

        Returns a tuple ([jsonableDict], "Success").
        """
        with open(filePath, "r") as newickFile:
            newickString = newickFile.read()
        newickString = newickString.replace("\n", "").replace("\r", "")
        return [self.parseData(newickString)], "Success"

    def parseData(self, newickString):
        """Parse a newick string directly. Returns: jsonableDict"""
        return self._parseNewickToJson(newickString)

    def _parseNewickToJson(self, newickString, treeName=None, nameMap=None):
        """Parse a newick representation of a tree into a PhyloTree data
        structure and return the dict produced by generateJsonableDict().

        When nameMap is truthy, the terms of the cleaned string are replaced
        through it before parsing.
        """
        self.phyloTree = PhyloTree()
        newickString = self.cleanNewickString(newickString)
        if nameMap:
            newickString = self._mapName(newickString, nameMap)

        self.phyloTree.root = self.parseNode(newickString, 0)
        # NOTE(review): treeName is attached only when a nameMap was supplied,
        # not whenever treeName is given -- confirm this is intended.
        if nameMap:
            self.phyloTree.addAttributesToRoot({"treeName": treeName})

        return self.phyloTree.generateJsonableDict()

    def cleanNewickString(self, rawNewick):
        """Return rawNewick without whitespace, semicolons, single or double
        quotes and backslashes."""
        return re.sub(r'\s|;|\"|\'|\\', '', rawNewick)

    def _makeNodesFromString(self, string, depth):
        """Make one node per non-empty, comma-separated element of string.

        Each element is either "label" or "label:length". A label that parses
        as a number is treated as a bootstrap value (Nexus may put bootstrap
        values in the name position): values in [0, 1] are used as is, values
        in (1, 100] are divided by 100, and the node name becomes "".

        Returns the list of created nodes (empty elements are skipped).
        Raises Exception if string still contains an opening bracket.
        """
        if "(" in string:
            raise Exception("Tree is not well form, location: " + string)

        childrenNodes = []
        for childString in string.split(","):
            if not childString:
                continue
            nodeInfo = childString.split(":")
            name, length, bootstrap = nodeInfo[0], None, -1
            if len(nodeInfo) == 2:  # has length info
                length = nodeInfo[1]
                try:
                    value = float(name)
                except ValueError:
                    pass  # an ordinary, non-numeric label: keep it as the name
                else:
                    if 0 <= value <= 1:
                        bootstrap = value
                    elif 1 <= value <= 100:
                        bootstrap = value / 100
                    name = ""
            node = self.phyloTree.makeNode(name, length=length, depth=depth, bootstrap=bootstrap)
            childrenNodes.append(node)
        return childrenNodes

    def _mapName(self, newickString, nameMap):
        """Replace the terms of a newick string with their nameMap values.

        A term starts right after a "(" or "," (unless the next symbol is
        again "," or "(") and ends at the first ")", ":" or ",". Necessary to
        replace names of terms inside a nexus representation. Per the original
        note, it lives here because "Mailaud's" (presumably an upstream
        parser -- unverified) does not deal with id strings outside of quotes.

        Raises KeyError if a term is missing from nameMap.
        """
        termEnd = re.compile(r"[):,]")
        total = len(newickString)
        pieces = []  # joined once at the end instead of repeated concatenation
        start = 0

        for i, symbol in enumerate(newickString):
            if symbol != "(" and symbol != ",":
                continue
            termStart = i + 1
            if termStart < total and newickString[termStart] in ",(":
                continue  # no term directly follows this symbol
            # termStart is where the term to be replaced begins; the first
            # closing symbol at or after it marks where the term ends.
            match = termEnd.search(newickString, termStart)
            if match:
                j = match.start()
                pieces.append(newickString[start:termStart])
                pieces.append(nameMap[newickString[termStart:j]])
                start = j

        pieces.append(newickString[start:])
        return "".join(pieces)

    def parseNode(self, string, depth):
        """
        Recursive method for parsing a newick string. It works by stripping
        the string down into the substrings contained within brackets, which
        are used to call itself.

        Eg ... ( A, B, (D, E)C, F, G ) ...

        We make the preceding nodes A, B first, then the internal node C with
        its children D, E, and finally the succeeding nodes F, G.

        Returns None for an empty string, a single node when depth is 0 and
        the string contains brackets, and a list of nodes otherwise.
        """
        # Base case where there is only an empty string
        if string == "":
            return
        # Base case where there is no internal clade
        if "(" not in string:
            return self._makeNodesFromString(string, depth)

        nodes = []  # the nodes on this level, in order
        start = 0
        lenOfPreceedingInternalNodeString = 0
        bracketStack = []  # positions of the brackets that are still open

        for j, symbol in enumerate(string):
            if symbol == "(":
                bracketStack.append(j)
                continue
            if symbol != ")":
                continue

            i = bracketStack.pop()
            if bracketStack:
                continue  # nested pair; handled by the recursive call

            # string[i:j + 1] is a clade that is a child of the current node
            internalNode = None

            # First a flat call to make the nodes of the same depth that
            # precede the clade.
            startSubstring = string[start + lenOfPreceedingInternalNodeString:i]
            nodes += self._makeNodesFromString(startSubstring, depth)

            # Then look for the label of the internal node right after the
            # closing bracket.
            if j + 1 < len(string):
                # Eg. '(b:0.4,a:0.3)c:0.3', stringRightOfBracket = 'c:0.3'
                stringRightOfBracket = string[j + 1:]
                match = re.search(r"[\)\,\(]", stringRightOfBracket)
                if match:
                    stringRepOfInternalNode = stringRightOfBracket[:match.start()]
                    internalNodes = self._makeNodesFromString(stringRepOfInternalNode, depth)
                    if internalNodes:
                        internalNode = internalNodes[0]
                    lenOfPreceedingInternalNodeString = len(stringRepOfInternalNode)
                else:  # the label can be the last element of the string
                    internalNode = self._makeNodesFromString(stringRightOfBracket, depth)[0]
                    lenOfPreceedingInternalNodeString = len(string) - j
            if internalNode is None:  # generic node for an unnamed clade
                internalNode = self.phyloTree.makeNode("", depth=depth, isInternal=True)
                lenOfPreceedingInternalNodeString = 0

            # recursive call to make the content of the internal clade
            childSubString = string[i + 1:j]
            internalNode.addChildNode(self.parseNode(childSubString, depth + 1))

            # the internal node is appended last to preserve order
            nodes.append(internalNode)

            start = j + 1

        if depth == 0:  # the root node: return it as is
            return nodes[0]

        # Adding the last set of children
        endString = string[start:]
        # If the text right after the last clade is the label of the internal
        # node created above, skip it so it is not parsed a second time.
        if string[start - 1] == ")":
            match = re.search(r"[\)\,\(]", endString)
            if match:
                endOfNodeName = start + match.start() + 1
                endString = string[endOfNodeName:]
                nodes += self._makeNodesFromString(endString, depth)

        return nodes
Exemplo n.º 8
0
class Phyloxml_Parser(Base_Parser):
    """Parses a phyloxml file into a jsonable dict that will be passed to PhyloViz for display"""
    def __init__(self):
        super(Phyloxml_Parser, self).__init__()
        self.phyloTree = PhyloTree()
        # Maps the phyloxml tags we care about to the keyword names handed to
        # PhyloTree.makeNode.
        self.tagsOfInterest = {
            "clade": "",
            "name": "name",
            "branch_length": "length",
            "confidence": "bootstrap",
            "events": "events"
        }

    def parseFile(self, filePath):
        """Parse the phyloxml file at filePath and extract its phylogeny tree.

        The first child of the xml root is searched for a "name" element
        (stored as the tree title) and a "clade" element (the tree root; the
        last one wins if there are several).

        Returns a tuple ([jsonableDict], "Success").
        Raises AttributeError if no top-level clade element is found.
        """
        # The context manager closes the file even if parsing raises.
        with open(filePath, "r") as phyloXmlFile:
            xmlTree = ElementTree.parse(phyloXmlFile)
        xmlRoot = xmlTree.getroot()[0]
        # used later by cleanTag to remove the name space in every element.tag
        self.nameSpaceIndex = xmlRoot.tag.rfind("}") + 1

        phyloRoot = None
        for child in xmlRoot:
            childTag = self.cleanTag(child.tag)
            if childTag == "clade":
                phyloRoot = child
            elif childTag == "name":
                self.phyloTree.title = child.text

        self.phyloTree.root = self.parseNode(phyloRoot, 0)
        jsonDict = self.phyloTree.generateJsonableDict()
        return [jsonDict], "Success"

    def parseNode(self, node, depth):
        """Parse any element within a phyloxml tree, looking out for clades,
        which signal the creation of nodes - internal OR leaf.

        Returns None when the element is not a clade, otherwise the node made
        for it (with its child clades parsed recursively).
        """
        tag = self.cleanTag(node.tag)
        if tag != "clade":
            return None

        # A clade that directly contains another clade is an internal node.
        hasInnerClade = any(
            self.cleanTag(child.tag) == "clade" for child in node)

        if hasInnerClade:  # this node is an internal node
            currentNode = self._makeInternalNode(node, depth=depth)
            for child in node:
                childNode = self.parseNode(child, depth + 1)
                # non-clade children come back as None and are skipped
                if isinstance(childNode, Node):
                    currentNode.addChildNode(childNode)
        else:  # this node is a leaf node
            currentNode = self._makeLeafNode(node, depth=depth + 1)

        return currentNode

    def _makeLeafNode(self, leafNode, depth=0):
        """Make a leaf node by calling PhyloTree.makeNode.

        The text of every child whose tag is in tagsOfInterest is passed as a
        keyword argument, together with depth.
        """
        node = {}
        for child in leafNode:
            childTag = self.cleanTag(child.tag)
            if childTag in self.tagsOfInterest:
                # need to map phyloxml terms to ours
                key = self.tagsOfInterest[childTag]
                node[key] = child.text

        node["depth"] = depth
        return self.phyloTree.makeNode(self._getNodeName(leafNode), **node)

    def _getNodeName(self, node, depth=-1):
        """Get the name of a clade, handling the case where a taxonomy
        element is involved. Returns "" when neither a name nor a taxonomy
        child is present. The depth parameter is accepted but not used.
        """
        def getTagFromTaxonomyNode(node):
            """Return the name of a taxonomy element. It has to be treated
            differently because the name is embedded one level deeper."""
            phyloxmlTaxoNames = ("common_name", "scientific_name", "code")
            for child in node:
                childTag = self.cleanTag(child.tag)
                if childTag in phyloxmlTaxoNames:
                    return child.text
            return ""

        nodeName = ""
        # Only the first name/taxonomy child is considered.
        for child in node:
            childTag = self.cleanTag(child.tag)
            if childTag == "name":
                nodeName = child.text
                break
            elif childTag == "taxonomy":
                nodeName = getTagFromTaxonomyNode(child)
                break

        return nodeName

    def _makeInternalNode(self, internalNode, depth=0):
        """Make an internal node from an element that is guaranteed to be a
        parent clade.

        Collects the values of interest (like events) into keyword arguments
        that are passed to PhyloTree.makeNode.
        """
        node = {}
        for child in internalNode:
            childTag = self.cleanTag(child.tag)
            if childTag == "clade":
                continue
            elif childTag in self.tagsOfInterest:
                if childTag == "events":
                    # events is nested one level deeper than the others: the
                    # value is the tag of its first child
                    key, text = "events", self.cleanTag(child[0].tag)
                else:
                    key = self.tagsOfInterest[childTag]
                    text = child.text
                node[key] = text

        # NOTE(review): depth only reaches _getNodeName (which ignores it);
        # unlike leaf nodes, no "depth" keyword is passed to makeNode here --
        # confirm this is intended.
        return self.phyloTree.makeNode(self._getNodeName(internalNode, depth),
                                       **node)

    def cleanTag(self, tagString):
        """Return tagString without the name space prefix, whose length was
        recorded by parseFile in nameSpaceIndex."""
        return tagString[self.nameSpaceIndex:]