Exemplo n.º 1
0
    def testReroot(self):
        """Check the breadth-first order of a small tree after rerooting.

        The tree is built from a fixed edge list and rerooted (on deep
        copies, so the reference tree is untouched) at nodes 1, 2 and 7.
        """
        graph = NX.DiGraph()
        for parent, child in [(1,2), (1,3), (2,4), (2,5), (5,6), (5,7), (5,8)]:
            graph.add_edge(parent, child)
        nxTree = NXTree(graph)
        nxTree.setWeight(1, 2, 3.1)
        originalOrder = list(nxTree.breadthFirstTraversal())

        # rerooting at node 1 must leave the traversal unchanged
        rerooted = copy.deepcopy(nxTree)
        rerooted.reroot(1)
        order = list(rerooted.breadthFirstTraversal())
        assert order == originalOrder

        # rerooting at node 2: levels are {2}, {1,4,5}, {3,6,7,8}
        rerooted = copy.deepcopy(nxTree)
        rerooted.reroot(2)
        order = list(rerooted.breadthFirstTraversal())
        assert order[0] == 2
        assert sorted(order[1:4]) == [1,4,5]
        assert sorted(order[4:]) == [3,6,7,8]
        # the weight set on edge (1, 2) must be readable on the flipped edge
        assert rerooted.getWeight(2, 1) == 3.1

        # rerooting at node 7: levels are {7}, {5}, {2,6,8}, {1,4}, {3}
        rerooted = copy.deepcopy(nxTree)
        rerooted.reroot(7)
        order = list(rerooted.breadthFirstTraversal())
        assert order[0] == 7
        assert order[1] == 5
        assert sorted(order[2:5]) == [2,6,8]
        assert sorted(order[5:7]) == [1,4]
        assert order[7] == 3
Exemplo n.º 2
0
    def testReroot(self):
        """Reroot a fixed eight-node tree and verify its BFS level structure.

        Every reroot is applied to a deep copy of the reference tree so the
        three scenarios (new root 1, 2 and 7) are independent of each other.
        """
        digraph = NX.DiGraph()
        edgeList = [(1, 2), (1, 3), (2, 4), (2, 5), (5, 6), (5, 7), (5, 8)]
        for source, target in edgeList:
            digraph.add_edge(source, target)
        nxTree = NXTree(digraph)
        nxTree.setWeight(1, 2, 3.1)
        referenceOrder = list(nxTree.breadthFirstTraversal())

        def rerootCopy(newRoot):
            # work on a copy so the reference tree is never modified
            clone = copy.deepcopy(nxTree)
            clone.reroot(newRoot)
            return clone, list(clone.breadthFirstTraversal())

        # new root 1: the traversal must equal the reference traversal
        testTree, bfs = rerootCopy(1)
        assert bfs == referenceOrder

        # new root 2: node 1 becomes a child of 2 and keeps the edge weight
        testTree, bfs = rerootCopy(2)
        assert bfs[0] == 2
        assert sorted(bfs[1:4]) == [1, 4, 5]
        assert sorted(bfs[4:]) == [3, 6, 7, 8]
        assert testTree.getWeight(2, 1) == 3.1

        # new root 7: the path 7-5-2-1-3 becomes the spine of the tree
        testTree, bfs = rerootCopy(7)
        assert bfs[0] == 7
        assert bfs[1] == 5
        assert sorted(bfs[2:5]) == [2, 6, 8]
        assert sorted(bfs[5:7]) == [1, 4]
        assert bfs[7] == 3
Exemplo n.º 3
0
 def parseString(self, newickString, addImpliedRoots=True):
     """Parse a Newick string into a new NXTree and return that tree.

     The input is whitespace-filtered, its brackets are indexed, and the
     node parser is started on everything before the final character,
     which must be ';' (checked with an assert).  The resulting tree is
     stored on self.nxTree as well as returned.
     """
     self.nxTree = NXTree()
     self.inString = self.__filterWhitespace(newickString)
     self.__createBracketTable()
     self.nextId = 0
     terminator = self.inString[-1]
     assert terminator == ';'
     # the root node spans the whole string except the trailing ';'
     rootLength = len(self.inString) - 1
     self.__addNode(0, rootLength, None, addImpliedRoots)
     self.nxTree.isTree()
     return self.nxTree
Exemplo n.º 4
0
 def testTraversals(self):
     """Each traversal must yield as many distinct nodes as the graph has."""
     for tree in self.trees:
         nxTree = NXTree(tree)
         # pre-order, post-order and breadth-first, checked in that order
         traversals = (nxTree.preOrderTraversal,
                       nxTree.postOrderTraversal,
                       nxTree.breadthFirstTraversal)
         for traverse in traversals:
             visited = list(traverse())
             assert len(set(visited)) == len(nxTree.nxDg.nodes())
Exemplo n.º 5
0
 def testTraversals(self):
     """Verify that all three traversal orders cover every graph node.

     For each fixture tree the number of distinct nodes produced by the
     pre-order, post-order and breadth-first traversals is compared with
     the node count of the underlying graph.
     """
     for tree in self.trees:
         nxTree = NXTree(tree)

         preOrder = list(nxTree.preOrderTraversal())
         assert len(set(preOrder)) == len(nxTree.nxDg.nodes())

         postOrder = list(nxTree.postOrderTraversal())
         assert len(set(postOrder)) == len(nxTree.nxDg.nodes())

         breadthFirst = list(nxTree.breadthFirstTraversal())
         assert len(set(breadthFirst)) == len(nxTree.nxDg.nodes())
Exemplo n.º 6
0
 def starTree(self):
     """Replace self.tree with a star tree over the names in self.pathMap.

     Node 0 is the root; every name gets its own child node (labelled
     1, 2, ...) attached to the root with weight SeqFile.branchLen.
     """
     rootLabel = 0
     self.tree = NXTree()
     self.tree.nxDg.add_node(rootLabel)
     self.tree.rootId = rootLabel
     for leafLabel, name in enumerate(self.pathMap.keys(), 1):
         self.tree.nxDg.add_edge(rootLabel, leafLabel)
         self.tree.setName(leafLabel, name)
         self.tree.setWeight(rootLabel, leafLabel, SeqFile.branchLen)
Exemplo n.º 7
0
 def __init__(self, tree=None):
     """Initialise from an NXTree (its graph is reused) or from a raw value.

     After the base NXTree initialiser has run, an empty set of subtree
     roots is created and a name -> node id index is built from every
     named node reached by a breadth-first traversal.
     """
     # an NXTree argument is unwrapped to its graph; anything else is
     # handed to the base initialiser unchanged
     source = tree.nxDg if isinstance(tree, NXTree) else tree
     NXTree.__init__(self, source)
     # ids of all subtree roots for fast checking
     self.subtreeRoots = set()
     # map of names to node ids
     self.nameToId = dict()
     for nodeId in self.breadthFirstTraversal():
         if not self.hasName(nodeId):
             continue
         self.nameToId[self.getName(nodeId)] = nodeId
Exemplo n.º 8
0
 def __init__(self, tree=None, subtreeSize=2):
     """Initialise the tree, its name index and the subtree size.

     tree        -- an NXTree (its graph is reused) or any value accepted
                    by NXTree.__init__
     subtreeSize -- size of a subtree, in number of leaves
     """
     if isinstance(tree, NXTree):
         baseArgument = tree.nxDg
     else:
         baseArgument = tree
     NXTree.__init__(self, baseArgument)
     # ids of all subtree roots for fast checking
     self.subtreeRoots = set()
     # map of names to node ids, filled from a breadth-first traversal
     self.nameToId = dict()
     for nodeId in self.breadthFirstTraversal():
         if not self.hasName(nodeId):
             continue
         self.nameToId[self.getName(nodeId)] = nodeId
     # size of a subtree (in number of leaves)
     self.subtreeSize = subtreeSize
Exemplo n.º 9
0
 def parseString(self, newickString, addImpliedRoots=True):
     """Build and return an NXTree from Newick-formatted text.

     Steps: filter whitespace out of the input, index matching brackets,
     reset the node id counter, assert that the text ends with ';', then
     parse nodes starting at index 0 over everything before that ';'.
     The tree is also kept on self.nxTree.
     """
     self.nxTree = NXTree()
     self.inString = self.__filterWhitespace(newickString)
     self.__createBracketTable()
     self.nextId = 0
     lastChar = self.inString[-1]
     assert lastChar == ';'
     # exclude the terminating ';' from the span handed to the parser
     spanLength = len(self.inString) - 1
     self.__addNode(0, spanLength, None, addImpliedRoots)
     self.nxTree.isTree()
     return self.nxTree
Exemplo n.º 10
0
 def starTree(self):
     """Build a one-level tree: root node 0 plus one leaf per pathMap name.

     Leaves are labelled consecutively from 1 in the iteration order of
     self.pathMap and each root-to-leaf edge gets SeqFile.branchLen as
     its weight.  The result is stored in self.tree.
     """
     root = 0
     starTree = NXTree()
     self.tree = starTree
     starTree.nxDg.add_node(root)
     starTree.rootId = root
     for leaf, name in enumerate(self.pathMap.keys(), 1):
         starTree.nxDg.add_edge(root, leaf)
         starTree.setName(leaf, name)
         starTree.setWeight(root, leaf, SeqFile.branchLen)
Exemplo n.º 11
0
class NXNewick(object):
    """Newick-format reader and writer for NXTree objects.

    parseString()/parseFile() build an NXTree from Newick text and
    writeString()/writeFile() serialise an NXTree back to Newick text.
    Parsing works on a whitespace-filtered copy of the input (inString)
    plus a table mapping the index of every '(' to the index of its
    matching ')' (bracketMatch).
    """

    def __init__(self, nxTree=None):
        """Create a reader/writer, optionally bound to an existing tree.

        nxTree -- tree used by writeString()/writeFile() when they are
                  called without a tree of their own.
        """
        # Store the argument: it used to be dropped (always None), which
        # made NXNewick(tree).writeString() fail.
        self.nxTree = nxTree
        self.bracketMatch = None
        self.inString = None
        self.nextId = 0
        self.outString = None

    def parseFile(self, path):
        """Parse the Newick text in the file at path and return the tree."""
        # the context manager closes the file even if reading fails
        with open(path) as inFile:
            newickString = inFile.read()
        self.parseString(newickString)
        return self.nxTree

    def parseString(self, newickString, addImpliedRoots=True):
        """Parse newickString into a new NXTree and return it.

        addImpliedRoots -- when the outermost node carries a branch
                           length, add a new parent above it and make
                           that parent the root.
        Raises AssertionError when the filtered text does not end with
        ';' (including empty text) or has unclosed '(' brackets.
        """
        self.nxTree = NXTree()
        self.inString = self.__filterWhitespace(newickString)
        self.__createBracketTable()
        self.nextId = 0
        # endswith() also rejects empty input with the same assertion
        # instead of an IndexError
        assert self.inString.endswith(';'), "Newick string must end with ';'"
        self.__addNode(0, len(self.inString) - 1, None, addImpliedRoots)
        self.nxTree.isTree()
        return self.nxTree

    def writeString(self, nxTree=None):
        """Return the Newick text for nxTree (or the currently held tree)."""
        if nxTree:
            self.nxTree = nxTree
        self.outString = ""
        self.__writeNode(self.nxTree.getRootId(), None)
        self.outString += ";"
        return self.outString

    def writeFile(self, path, nxTree=None):
        """Write the Newick text for the tree, plus a newline, to path."""
        # serialise first so a failure does not leave a truncated file
        newickString = self.writeString(nxTree)
        with open(path, "w") as outFile:
            outFile.write(newickString)
            outFile.write("\n")
        return None

    #### PRIVATE WRITING FUNCTIONS ####
    def __writeNode(self, node, parent=None):
        """Append the Newick text of the subtree rooted at node to outString."""
        children = self.nxTree.getChildren(node)
        if len(children) > 0:
            self.outString += "("
            for index, child in enumerate(children):
                if index > 0:
                    self.outString += ","
                self.__writeNode(child, node)
            self.outString += ")"

        name = self.nxTree.getName(node)
        if len(name) > 0:
            # names containing any character from ws are double-quoted
            # (ws is defined at module level; presumably whitespace)
            if any(c in name for c in ws):
                self.outString += "\"" + name + "\""
            else:
                self.outString += name
        if parent is not None:
            weight = self.nxTree.getWeight(parent, node, defaultValue=None)
            if weight is not None:
                self.outString += ":%s" % str(weight)

    #### PRIVATE READING FUNCTIONS ####

    def __filterWhitespace(self, newickString):
        """Return newickString without quote characters and unquoted ws.

        Either quote character toggles the "inside quotes" state and is
        dropped; characters from ws are kept only while inside quotes.
        """
        kept = []
        inQuote = False
        for c in newickString:
            if c == "\'" or c == "\"":
                inQuote = not inQuote
            elif inQuote or c not in ws:
                kept.append(c)
        # join once instead of repeated string concatenation
        return "".join(kept)

    def __createBracketTable(self):
        """Fill bracketMatch with {index of '(': index of matching ')'}."""
        bracketStack = []
        self.bracketMatch = dict()
        for index, c in enumerate(self.inString):
            if c == '(':
                bracketStack.append(index)
            elif c == ')':
                leftIndex = bracketStack.pop()
                self.bracketMatch[leftIndex] = index
        # anything left on the stack is a '(' that was never closed
        assert len(bracketStack) == 0

    def __childRanges(self, start, length):
        """Split inString[start:start+length] at its top-level commas.

        Returns a list of (start, length) pairs, one per child.
        """
        ranges = []
        currentStart = start
        i = currentStart
        while i < start + length + 1:
            if self.inString[i] == ',' or i == start + length:
                ranges.append((currentStart, i - currentStart))
                currentStart = i + 1
            # jump over a nested group so its commas are not split on
            if i in self.bracketMatch:
                i = self.bracketMatch[i]
            i += 1
        return ranges

    def __parseName(self, nameString):
        """Split 'name:weight' into (name, weight); missing parts are ''."""
        if nameString == ';':
            return ('', '')
        tokens = nameString.split(':')
        assert len(tokens) == 1 or len(tokens) == 2
        name = tokens[0]
        weight = ''
        if len(tokens) == 2:
            weight = tokens[1]
        return (name, weight)

    def __addNode(self, start, length, parent=None, addImpliedRoots=True):
        """Parse the node spanning inString[start:start+length].

        The node is added to the tree (with an edge from parent when one
        is given) and its children are parsed recursively.
        """
        # parse the children (..,...,..)
        children = []
        if self.inString[start] == '(':
            assert start in self.bracketMatch
            chLength = self.bracketMatch[start] - start - 1
            children = self.__childRanges(start + 1, chLength)
            start = self.bracketMatch[start] + 1
            length -= (chLength + 2)

        # parse the name abc:123
        name, weight = self.__parseName(self.inString[start:start + length])
        nodeId = self.nextId
        self.nextId += 1
        self.nxTree.nxDg.add_node(nodeId)
        if len(name) > 0:
            self.nxTree.nxDg.node[nodeId]['name'] = name

        # update the graph
        if parent is not None:
            self.nxTree.nxDg.add_edge(parent, nodeId)
            if len(weight) > 0:
                self.nxTree.nxDg[parent][nodeId]['weight'] = float(weight)

        # update the root (implied roots are added as a new node)
        if self.nxTree.getRootId() is None:
            assert parent is None
            root = nodeId
            if len(weight) > 0:
                if addImpliedRoots:
                    root = self.nextId
                    self.nextId += 1
                    self.nxTree.nxDg.add_edge(root, nodeId)
                    # store a float, like every other edge weight; this
                    # used to store the unparsed string
                    self.nxTree.setWeight(root, nodeId, float(weight))
            self.nxTree.rootId = root

        # recurse on children (the root is already set, so the default
        # addImpliedRoots value is never consulted below this level)
        for child in children:
            self.__addNode(child[0], child[1], nodeId)
Exemplo n.º 12
0
class NXNewick(object):
    """Converts between Newick text and NXTree objects.

    Reading: parseFile() / parseString().  Writing: writeFile() /
    writeString().  While reading, inString holds the input with
    whitespace filtered out and bracketMatch maps the index of each '('
    to the index of its matching ')'.
    """

    def __init__(self, nxTree=None):
        """Set up an idle parser/writer.

        nxTree -- optional tree to serialise when writeString() or
                  writeFile() is later called without a tree argument.
        """
        # Keep the caller's tree; the argument was previously ignored and
        # the attribute was always None.
        self.nxTree = nxTree
        self.bracketMatch = None
        self.inString = None
        self.nextId = 0
        self.outString = None

    def parseFile(self, path):
        """Read the file at path, parse it as Newick and return the tree."""
        # 'with' releases the handle even when read() raises
        with open(path) as inFile:
            contents = inFile.read()
        self.parseString(contents)
        return self.nxTree

    def parseString(self, newickString, addImpliedRoots=True):
        """Parse Newick text into a fresh NXTree, which is returned.

        addImpliedRoots -- if the top-level node has a branch length,
                           insert a new root node above it.
        Raises AssertionError if the whitespace-filtered text does not
        end with ';' (empty text included) or leaves a '(' unclosed.
        """
        self.nxTree = NXTree()
        self.inString = self.__filterWhitespace(newickString)
        self.__createBracketTable()
        self.nextId = 0
        # endswith() turns the empty-input case into the same assertion
        # rather than an IndexError
        assert self.inString.endswith(';'), "Newick string must end with ';'"
        self.__addNode(0, len(self.inString) - 1, None, addImpliedRoots)
        self.nxTree.isTree()
        return self.nxTree

    def writeString(self, nxTree=None):
        """Serialise nxTree (or the tree already held) to Newick text."""
        if nxTree:
            self.nxTree = nxTree
        self.outString = ""
        self.__writeNode(self.nxTree.getRootId(), None)
        self.outString += ";"
        return self.outString

    def writeFile(self, path, nxTree=None):
        """Write the tree's Newick text followed by a newline to path."""
        # build the text before opening the file, so a serialisation
        # error cannot leave an empty or partial file behind
        text = self.writeString(nxTree)
        with open(path, "w") as outFile:
            outFile.write(text)
            outFile.write("\n")
        return None

    #### PRIVATE WRITING FUNCTIONS ####
    def __writeNode(self, node, parent=None):
        """Append '(children)name:weight' for node to outString."""
        children = self.nxTree.getChildren(node)
        if len(children) > 0:
            self.outString += "("
            for position, child in enumerate(children):
                if position > 0:
                    self.outString += ","
                self.__writeNode(child, node)
            self.outString += ")"

        name = self.nxTree.getName(node)
        if len(name) > 0:
            # a name containing a character from ws is written in double
            # quotes (ws comes from module scope; presumably whitespace)
            needsQuotes = any(c in name for c in ws)
            if needsQuotes:
                self.outString += "\""
            self.outString += name
            if needsQuotes:
                self.outString += "\""
        if parent is not None:
            weight = self.nxTree.getWeight(parent, node, defaultValue=None)
            if weight is not None:
                self.outString += ":%s" % str(weight)

    #### PRIVATE READING FUNCTIONS ####

    def __filterWhitespace(self, newickString):
        """Drop quote characters and every ws character outside quotes."""
        pieces = []
        inQuote = False
        for c in newickString:
            if c == "\'" or c == "\"":
                # quotes only toggle the state; they are never copied
                inQuote = not inQuote
            elif inQuote or c not in ws:
                pieces.append(c)
        # single join instead of quadratic string concatenation
        return "".join(pieces)

    def __createBracketTable(self):
        """Record, for each '(' in inString, the index of its ')'."""
        openIndexes = []
        self.bracketMatch = dict()
        for index, c in enumerate(self.inString):
            if c == '(':
                openIndexes.append(index)
            elif c == ')':
                self.bracketMatch[openIndexes.pop()] = index
        # a non-empty stack means some '(' was never closed
        assert len(openIndexes) == 0

    def __childRanges(self, start, length):
        """Return (start, length) spans of the comma-separated children
        found in inString[start:start+length], ignoring nested commas."""
        ranges = []
        currentStart = start
        end = start + length
        i = currentStart
        while i < end + 1:
            if self.inString[i] == ',' or i == end:
                ranges.append((currentStart, i - currentStart))
                currentStart = i + 1
            # skip straight to the end of a nested bracket group
            if i in self.bracketMatch:
                i = self.bracketMatch[i]
            i += 1
        return ranges

    def __parseName(self, nameString):
        """Return (name, weight) parsed from 'name', 'name:weight' or ';'."""
        if nameString == ';':
            return ('', '')
        tokens = nameString.split(':')
        assert len(tokens) == 1 or len(tokens) == 2
        name = tokens[0]
        weight = tokens[1] if len(tokens) == 2 else ''
        return (name, weight)

    def __addNode(self, start, length, parent=None, addImpliedRoots=True):
        """Add the node described by inString[start:start+length].

        Creates the node, links it under parent (if any), sets the tree
        root on the first call, and recurses into the node's children.
        """
        # parse the children (..,...,..)
        children = []
        if self.inString[start] == '(':
            assert start in self.bracketMatch
            chLength = self.bracketMatch[start] - start - 1
            children = self.__childRanges(start + 1, chLength)
            start = self.bracketMatch[start] + 1
            length -= (chLength + 2)

        # parse the name abc:123
        name, weight = self.__parseName(self.inString[start:start + length])
        nodeId = self.nextId
        self.nextId += 1
        self.nxTree.nxDg.add_node(nodeId)
        if len(name) > 0:
            self.nxTree.nxDg.node[nodeId]['name'] = name

        # update the graph
        if parent is not None:
            self.nxTree.nxDg.add_edge(parent, nodeId)
            if len(weight) > 0:
                self.nxTree.nxDg[parent][nodeId]['weight'] = float(weight)

        # update the root (implied roots are added as a new node)
        if self.nxTree.getRootId() is None:
            assert parent is None
            root = nodeId
            if len(weight) > 0 and addImpliedRoots:
                root = self.nextId
                self.nextId += 1
                self.nxTree.nxDg.add_edge(root, nodeId)
                # convert to float so this edge matches all other edges;
                # the raw string used to be stored here
                self.nxTree.setWeight(root, nodeId, float(weight))
            self.nxTree.rootId = root

        # recurse on children
        for childStart, childLength in children:
            self.__addNode(childStart, childLength, nodeId)
Exemplo n.º 13
0
class SeqFile:
    """In-memory form of a sequence file: a tree plus name -> path entries.

    Attributes set by parseFile():
      tree      -- NXTree parsed from the file's Newick line, or a star
                   tree over all names when the file has no tree line
      pathMap   -- dict mapping each name to its path string
      outgroups -- names that were prefixed with '*' in the file
    """
    branchLen = 1

    def __init__(self, path=None):
        """Create the object and, when path is given, parse that file."""
        # define the attributes up front so an unparsed instance can be
        # inspected without AttributeError
        self.tree = None
        self.pathMap = dict()
        self.outgroups = []
        if path is not None:
            self.parseFile(path)

    def parseFile(self, path):
        """Load the tree and the name/path entries from the file at path.

        Blank lines and lines starting with '#' are ignored.  The first
        line that is a single token or starts with '(' is parsed as the
        Newick tree; "name path..." lines fill pathMap; anything else is
        reported on stderr and skipped.

        Raises RuntimeError if the file is missing, the tree cannot be
        parsed, a name occurs twice, or cleanTree()/validate() fail.
        """
        if not os.path.isfile(path):
            raise RuntimeError("File not found: %s" % path)
        self.tree = None
        self.pathMap = dict()
        self.outgroups = []
        # the context manager closes the file (it used to stay open)
        with open(path, "r") as seqFile:
            for l in seqFile:
                line = l.strip()
                if not line or line[0] == "#":
                    continue
                # line is non-empty after strip(), so tokens is non-empty
                tokens = line.split()
                if self.tree is None and (len(tokens) == 1 or line[0] == '('):
                    newickParser = NXNewick()
                    try:
                        self.tree = newickParser.parseString(line)
                    except Exception:
                        # Exception (not a bare except) lets
                        # KeyboardInterrupt/SystemExit through
                        raise RuntimeError("Failed to parse newick tree: %s" %
                                           line)
                elif tokens[0] == '*':
                    sys.stderr.write("Skipping line %s\n" % l)
                elif line[0] != '(' and len(tokens) >= 2:
                    name = tokens[0]
                    if name[0] == '*':
                        name = name[1:]
                        self.outgroups.append(name)
                    # str.join replaces the Python-2-only string.join and
                    # no longer rebinds the 'path' parameter
                    seqPath = " ".join(tokens[1:])
                    if name in self.pathMap:
                        raise RuntimeError("Duplicate name found: %s" % name)
                    self.pathMap[name] = seqPath
                else:
                    sys.stderr.write("Skipping line %s\n" % l)

        if self.tree is None:
            self.starTree()
        self.cleanTree()
        self.validate()

    def starTree(self):
        """Set self.tree to a root (node 0) with one leaf per pathMap name."""
        self.tree = NXTree()
        label = 0
        self.tree.nxDg.add_node(label)
        self.tree.rootId = label
        for name in self.pathMap.keys():
            label += 1
            self.tree.nxDg.add_edge(0, label)
            self.tree.setName(label, name)
            self.tree.setWeight(0, label, SeqFile.branchLen)

    def validate(self):
        """Raise RuntimeError unless the tree is usable.

        The tree needs more than two nodes, and every leaf needs a
        pathMap entry whose path exists on disk.
        """
        if sum(1 for _ in self.tree.postOrderTraversal()) <= 2:
            raise RuntimeError("At least two valid leaf genomes required in"
                               " input tree")
        for node in self.tree.postOrderTraversal():
            if not self.tree.isLeaf(node):
                continue
            name = self.tree.getName(node)
            if name not in self.pathMap:
                raise RuntimeError("No sequence specified for %s" % name)
            path = self.pathMap[name]
            # call exists(); the bare function object was tested before,
            # which is always truthy and so never raised
            if not os.path.exists(path):
                raise RuntimeError("Sequence path not found: %s" % path)

    # remove leaves that do not have sequence data associated with them
    def cleanTree(self):
        """Drop leaves without a path and default any missing branch length.

        Raises RuntimeError if the tree has fewer than two leaves or if
        none of its leaves has a pathMap entry.
        """
        numLeaves = 0
        removeList = []
        for node in self.tree.postOrderTraversal():
            if self.tree.isLeaf(node):
                name = self.tree.getName(node)
                if name not in self.pathMap:
                    removeList.append(node)
                numLeaves += 1
        if numLeaves < 2:
            raise RuntimeError("At least two valid leaf genomes required in"
                               " input tree")
        if len(removeList) == numLeaves:
            raise RuntimeError("No sequence path specified for any leaves in the tree")
        for leaf in removeList:
            sys.stderr.write("No sequence path found for %s: skipping\n" % (
                self.tree.getName(leaf)))
            self.tree.removeLeaf(leaf)

        # every remaining edge without a weight gets the default length
        for node in self.tree.postOrderTraversal():
            if self.tree.hasParent(node):
                parent = self.tree.getParent(node)
                if self.tree.getWeight(parent, node) is None:
                    sys.stderr.write(
                        "No branch length for %s: setting to %d\n" % (
                            self.tree.getName(node), SeqFile.branchLen))
                    self.tree.setWeight(parent, node, SeqFile.branchLen)

    # create the cactus_workflow_experiment xml element which serves as
    # the root node of the experiment template file needed by
    # cactus_createMultiCactusProject.  Note the element is incomplete
    # until the cactus_disk child element has been added
    def toXMLElement(self):
        """Return the cactus_workflow_experiment element for this file.

        The "sequences" attribute lists absSymPath() of every leaf's
        path in post-order, each followed by one space.
        """
        assert self.tree is not None
        elem = ET.Element("cactus_workflow_experiment")
        seqPaths = []
        for node in self.tree.postOrderTraversal():
            if self.tree.isLeaf(node):
                name = self.tree.getName(node)
                # NOTE(review): a path.replace(" ", "\ ") call stood here
                # but discarded its result, so spaces were never escaped;
                # the no-op is removed and the output is unchanged.
                seqPaths.append(absSymPath(self.pathMap[name]) + " ")
        elem.attrib["sequences"] = "".join(seqPaths)
        elem.attrib["species_tree"] = NXNewick().writeString(self.tree)
        elem.attrib["config"] = "defaultProgressive"
        return elem
Exemplo n.º 14
0
 def testRoot(self):
     """Every fixture tree must have a root id, and that root no parent."""
     # the generator builds each NXTree lazily, one per loop iteration
     for nxTree in (NXTree(tree) for tree in self.trees):
         root = nxTree.getRootId()
         assert root is not None
         assert nxTree.getParent(root) is None
Exemplo n.º 15
0
class SeqFile:
    """In-memory form of a sequence file: a tree plus name -> path entries.

    Attributes set by parseFile():
      tree      -- NXTree parsed from the file's Newick line, or a star
                   tree over all names when the file has no tree line
      pathMap   -- dict mapping each name to its path string
      outgroups -- names that were prefixed with '*' in the file
    """
    branchLen = 1

    def __init__(self, path=None):
        """Create the object and, when path is given, parse that file."""
        # define the attributes up front so an unparsed instance can be
        # inspected without AttributeError
        self.tree = None
        self.pathMap = dict()
        self.outgroups = []
        if path is not None:
            self.parseFile(path)

    def parseFile(self, path):
        """Load the tree and the name/path entries from the file at path.

        Blank lines and lines starting with '#' are ignored.  The first
        line that is a single token or starts with '(' is parsed as the
        Newick tree; "name path..." lines fill pathMap; anything else is
        reported on stderr and skipped.

        Raises RuntimeError if the file is missing, the tree cannot be
        parsed, a name occurs twice, or cleanTree()/validate() fail.
        """
        if not os.path.isfile(path):
            raise RuntimeError("File not found: %s" % path)
        self.tree = None
        self.pathMap = dict()
        self.outgroups = []
        # the context manager closes the file (it used to stay open)
        with open(path, "r") as seqFile:
            for l in seqFile:
                line = l.strip()
                if not line or line[0] == "#":
                    continue
                # line is non-empty after strip(), so tokens is non-empty
                tokens = line.split()
                if self.tree is None and (len(tokens) == 1 or line[0] == '('):
                    newickParser = NXNewick()
                    try:
                        self.tree = newickParser.parseString(line)
                    except Exception:
                        # Exception (not a bare except) lets
                        # KeyboardInterrupt/SystemExit through
                        raise RuntimeError("Failed to parse newick tree: %s" %
                                           line)
                elif tokens[0] == '*':
                    sys.stderr.write("Skipping line %s\n" % l)
                elif line[0] != '(' and len(tokens) >= 2:
                    name = tokens[0]
                    if name[0] == '*':
                        name = name[1:]
                        self.outgroups.append(name)
                    # str.join replaces the Python-2-only string.join and
                    # no longer rebinds the 'path' parameter
                    seqPath = " ".join(tokens[1:])
                    if name in self.pathMap:
                        raise RuntimeError("Duplicate name found: %s" % name)
                    self.pathMap[name] = seqPath
                else:
                    sys.stderr.write("Skipping line %s\n" % l)

        if self.tree is None:
            self.starTree()
        self.cleanTree()
        self.validate()

    def starTree(self):
        """Set self.tree to a root (node 0) with one leaf per pathMap name."""
        self.tree = NXTree()
        label = 0
        self.tree.nxDg.add_node(label)
        self.tree.rootId = label
        for name in self.pathMap.keys():
            label += 1
            self.tree.nxDg.add_edge(0, label)
            self.tree.setName(label, name)
            self.tree.setWeight(0, label, SeqFile.branchLen)

    def validate(self):
        """Raise RuntimeError unless the tree is usable.

        The tree needs more than two nodes and every leaf needs a
        pathMap entry.
        """
        if sum(1 for _ in self.tree.postOrderTraversal()) <= 2:
            raise RuntimeError("At least two valid leaf genomes required in"
                               " input tree")
        for node in self.tree.postOrderTraversal():
            if self.tree.isLeaf(node):
                name = self.tree.getName(node)
                if name not in self.pathMap:
                    raise RuntimeError("No sequence specified for %s" % name)
                # NOTE: the os.path.exists(path) check and the
                # sanityCheckSequence(path) call are currently disabled:
                #if not os.path.exists(path):
                #    raise RuntimeError("Sequence path not found: %s" % path)
                #self.sanityCheckSequence(path)

    def sanityCheckSequence(self, path):
        """Warns the user about common problems with the input sequences.

        Runs cactus_analyseAssembly on path (on the concatenation of its
        files when path is a directory) and writes warnings to stderr
        for a zero-length sequence, a masked fraction above 0.70, or an
        'N' fraction above 0.30.
        """
        # Relies on cactus_analyseAssembly output staying in the
        # format it's currently in.
        # NOTE(review): path is interpolated into the command line
        # unquoted; paths with spaces or shell metacharacters look unsafe
        # if popenCatch runs this through a shell -- confirm.
        cmdline = "cactus_analyseAssembly"
        if os.path.isdir(path):
            cmdline = "cat %s/* | %s -" % (path, cmdline)
        else:
            cmdline += " %s" % path
        output = popenCatch(cmdline)
        try:
            repeatMaskedFrac = float(re.search(r'Proportion-repeat-masked: ([0-9.]*)', output).group(1))
            nFrac = float(re.search(r'ProportionNs: ([0-9.]*)', output).group(1))
        except ValueError:
            # This can happen if the genome has 0 length, making the fractions NaN.
            # We warn the user but return afterwards, as the rest of the checks are
            # dependent on the fraction values.
            sys.stderr.write("WARNING: sequence path %s has 0 length. Consider "
                             "removing it from your input file.\n\n" % path)
            return
        # These thresholds are pretty arbitrary, but should be good for
        # badly- to well-assembled vertebrate genomes.
        if repeatMaskedFrac > 0.70:
            sys.stderr.write("WARNING: sequence path %s has an extremely high "
                             "proportion of masked bases: %f. progressiveCactus"
                             " expects a soft-masked genome, i.e. all lowercase"
                             " characters are considered masked. The process "
                             "will proceed normally, but make sure you haven't "
                             "accidentally provided an all-lowercase genome, "
                             "in which case nothing will be aligned to "
                             "it!\n\n" % (path, repeatMaskedFrac))
        if nFrac > 0.30:
            sys.stderr.write("WARNING: sequence path %s has an extremely high "
                             "proportion of 'N' bases: %f. The process will "
                             "proceed normally, but make sure your genome "
                             "isn't hard-masked! Alignments to hard-masked "
                             "genomes are much worse than to soft-masked "
                             "genomes. If the genome just has a lot of "
                             "poorly assembled regions, feel free to "
                             "ignore this message.\n\n" % (path, nFrac))

    # remove leaves that do not have sequence data associated with them
    def cleanTree(self):
        """Drop leaves without a path and default any missing branch length.

        Raises RuntimeError if the tree has fewer than two leaves or if
        none of its leaves has a pathMap entry.
        """
        numLeaves = 0
        removeList = []
        for node in self.tree.postOrderTraversal():
            if self.tree.isLeaf(node):
                name = self.tree.getName(node)
                if name not in self.pathMap:
                    removeList.append(node)
                numLeaves += 1
        if numLeaves < 2:
            raise RuntimeError("At least two valid leaf genomes required in"
                               " input tree")
        if len(removeList) == numLeaves:
            raise RuntimeError("No sequence path specified for any leaves in the tree")
        for leaf in removeList:
            sys.stderr.write("No sequence path found for %s: skipping\n" % (
                self.tree.getName(leaf)))
            self.tree.removeLeaf(leaf)

        # every remaining edge without a weight gets the default length
        for node in self.tree.postOrderTraversal():
            if self.tree.hasParent(node):
                parent = self.tree.getParent(node)
                if self.tree.getWeight(parent, node) is None:
                    sys.stderr.write(
                        "No branch length for %s: setting to %d\n" % (
                            self.tree.getName(node), SeqFile.branchLen))
                    self.tree.setWeight(parent, node, SeqFile.branchLen)

    # create the cactus_workflow_experiment xml element which serves as
    # the root node of the experiment template file needed by
    # cactus_createMultiCactusProject.  Note the element is incomplete
    # until the cactus_disk child element has been added
    def toXMLElement(self):
        """Return the cactus_workflow_experiment element for this file.

        The "sequences" attribute lists every leaf's path in post-order,
        each followed by one space.
        """
        assert self.tree is not None
        elem = ET.Element("cactus_workflow_experiment")
        seqPaths = []
        for node in self.tree.postOrderTraversal():
            if self.tree.isLeaf(node):
                name = self.tree.getName(node)
                seqPaths.append(self.pathMap[name] + " ")
        elem.attrib["sequences"] = "".join(seqPaths)
        elem.attrib["species_tree"] = NXNewick().writeString(self.tree)
        elem.attrib["config"] = "defaultProgressive"
        return elem
Exemplo n.º 16
0
class SeqFile:
    """In-memory representation of a sequence ("seq") input file.

    The file is read line by line:
      * blank lines and lines starting with '#' are ignored;
      * while no tree has been read yet, a line that is a single token or
        starts with '(' is parsed as a newick tree;
      * a line whose first token is exactly '*' is skipped with a message;
      * any other line with at least two tokens is a "name path" entry; a
        name prefixed with '*' is additionally recorded as an outgroup.

    Attributes:
      tree      -- the parsed tree (a star tree is built if the file has none)
      pathMap   -- dict mapping genome name to its sequence path
      outgroups -- list of names that were flagged with a leading '*'
    """

    # Weight given to any edge that has no branch length.
    branchLen = 1

    def __init__(self, path=None):
        """Create a SeqFile, parsing `path` immediately when it is given."""
        # Define the attributes even when nothing is parsed, so that a bare
        # SeqFile() has a well-defined (empty) state instead of raising
        # AttributeError on first use.
        self.tree = None
        self.pathMap = dict()
        self.outgroups = []
        if path is not None:
            self.parseFile(path)

    def parseFile(self, path):
        """Parse the seq file at `path`, then clean and validate the tree.

        Raises RuntimeError if the file does not exist, the newick tree
        cannot be parsed, or a genome name is listed twice (cleanTree() and
        validate() may raise RuntimeError as well).
        """
        if not os.path.isfile(path):
            raise RuntimeError("File not found: %s" % path)
        self.tree = None
        self.pathMap = dict()
        self.outgroups = []
        # The context manager closes the file even when parsing raises.
        with open(path, "r") as seqFile:
            for l in seqFile:
                line = l.strip()
                if not line or line[0] == "#":
                    continue
                # A non-empty stripped line always yields at least one token.
                tokens = line.split()
                if self.tree is None and (len(tokens) == 1 or line[0] == '('):
                    try:
                        self.tree = NXNewick().parseString(line)
                    except Exception:
                        # Not a bare except: KeyboardInterrupt / SystemExit
                        # must not be reported as a parse failure.
                        raise RuntimeError("Failed to parse newick tree: %s" %
                                           line)
                elif tokens[0] == '*':
                    sys.stderr.write("Skipping line %s\n" % l)
                elif line[0] != '(' and len(tokens) >= 2:
                    name = tokens[0]
                    if name[0] == '*':
                        name = name[1:]
                        self.outgroups.append(name)
                    # Everything after the name is the path; tokens are
                    # re-joined with single spaces.
                    seqPath = " ".join(tokens[1:])
                    if name in self.pathMap:
                        raise RuntimeError("Duplicate name found: %s" % name)
                    self.pathMap[name] = seqPath
                else:
                    sys.stderr.write("Skipping line %s\n" % l)

        if self.tree is None:
            self.starTree()
        self.cleanTree()
        self.validate()

    def starTree(self):
        """Build a star tree: root 0 with one child per name in pathMap."""
        self.tree = NXTree()
        rootLabel = 0
        self.tree.nxDg.add_node(rootLabel)
        self.tree.rootId = rootLabel
        # Children are labelled 1..n in pathMap iteration order.
        for offset, name in enumerate(self.pathMap):
            label = offset + 1
            self.tree.nxDg.add_edge(rootLabel, label)
            self.tree.setName(label, name)
            self.tree.setWeight(rootLabel, label, SeqFile.branchLen)

    def validate(self):
        """Check the tree is big enough and every leaf has a sequence path.

        Raises RuntimeError if the tree has two nodes or fewer, or if a leaf
        name is missing from pathMap.
        """
        if len(list(self.tree.postOrderTraversal())) <= 2:
            raise RuntimeError("At least two valid leaf genomes required in"
                               " input tree")
        for node in self.tree.postOrderTraversal():
            if self.tree.isLeaf(node):
                name = self.tree.getName(node)
                if name not in self.pathMap:
                    raise RuntimeError("No sequence specified for %s" % name)
                # NOTE: checking that the path exists and running
                # sanityCheckSequence() on it are deliberately not done here.

    def sanityCheckSequence(self, path):
        """Warns the user about common problems with the input sequences."""
        # Relies on cactus_analyseAssembly output staying in the
        # format it's currently in.
        # NOTE(review): `path` is interpolated into a command string
        # unescaped; presumably popenCatch runs it through a shell, so paths
        # containing shell metacharacters would be unsafe -- confirm.
        cmdline = "cactus_analyseAssembly"
        if os.path.isdir(path):
            cmdline = "cat %s/* | %s -" % (path, cmdline)
        else:
            cmdline += " %s" % path
        output = popenCatch(cmdline)
        try:
            repeatMaskedFrac = float(
                re.search(r'Proportion-repeat-masked: ([0-9.]*)',
                          output).group(1))
            nFrac = float(
                re.search(r'ProportionNs: ([0-9.]*)', output).group(1))
        except ValueError:
            # This can happen if the genome has 0 length, making the fractions NaN.
            # We warn the user but return afterwards, as the rest of the checks are
            # dependent on the fraction values.
            sys.stderr.write(
                "WARNING: sequence path %s has 0 length. Consider "
                "removing it from your input file.\n\n" % path)
            return
        # These thresholds are pretty arbitrary, but should be good for
        # badly- to well-assembled vertebrate genomes.
        if repeatMaskedFrac > 0.70:
            sys.stderr.write(
                "WARNING: sequence path %s has an extremely high "
                "proportion of masked bases: %f. progressiveCactus"
                " expects a soft-masked genome, i.e. all lowercase"
                " characters are considered masked. The process "
                "will proceed normally, but make sure you haven't "
                "accidentally provided an all-lowercase genome, "
                "in which case nothing will be aligned to "
                "it!\n\n" % (path, repeatMaskedFrac))
        if nFrac > 0.30:
            sys.stderr.write("WARNING: sequence path %s has an extremely high "
                             "proportion of 'N' bases: %f. The process will "
                             "proceed normally, but make sure your genome "
                             "isn't hard-masked! Alignments to hard-masked "
                             "genomes are much worse than to soft-masked "
                             "genomes. If the genome just has a lot of "
                             "poorly assembled regions, feel free to "
                             "ignore this message.\n\n" % (path, nFrac))

    # remove leaves that do not have sequence data associated with them
    def cleanTree(self):
        """Drop leaves without a sequence path and fill in missing weights.

        Raises RuntimeError if the tree has fewer than two leaves, or if
        none of its leaves has an entry in pathMap.
        """
        numLeaves = 0
        removeList = []
        # Collect first, remove afterwards, so the tree is not modified
        # while it is being traversed.
        for node in self.tree.postOrderTraversal():
            if self.tree.isLeaf(node):
                if self.tree.getName(node) not in self.pathMap:
                    removeList.append(node)
                numLeaves += 1
        if numLeaves < 2:
            raise RuntimeError("At least two valid leaf genomes required in"
                               " input tree")
        if len(removeList) == numLeaves:
            raise RuntimeError(
                "No sequence path specified for any leaves in the tree")
        for leaf in removeList:
            sys.stderr.write("No sequence path found for %s: skipping\n" %
                             (self.tree.getName(leaf)))
            self.tree.removeLeaf(leaf)

        # Any edge left without a weight gets the default branch length.
        for node in self.tree.postOrderTraversal():
            if self.tree.hasParent(node):
                parent = self.tree.getParent(node)
                if self.tree.getWeight(parent, node) is None:
                    sys.stderr.write(
                        "No branch length for %s: setting to %d\n" %
                        (self.tree.getName(node), SeqFile.branchLen))
                    self.tree.setWeight(parent, node, SeqFile.branchLen)

    # create the cactus_workflow_experiment xml element which serves as
    # the root node of the experiment template file needed by
    # cactus_createMultiCactusProject.  Note the element is incomplete
    # until the cactus_disk child element has been added
    def toXMLElement(self):
        """Return the cactus_workflow_experiment element for this file.

        "sequences" holds the path of every leaf in post-order, each
        followed by a single space; "species_tree" is the newick string of
        the tree; "config" is always "defaultProgressive".
        """
        assert self.tree is not None
        elem = ET.Element("cactus_workflow_experiment")
        leafPaths = [self.pathMap[self.tree.getName(node)]
                     for node in self.tree.postOrderTraversal()
                     if self.tree.isLeaf(node)]
        # Each path, including the last one, is followed by a space.
        elem.attrib["sequences"] = "".join(
            [leafPath + " " for leafPath in leafPaths])
        elem.attrib["species_tree"] = NXNewick().writeString(self.tree)
        elem.attrib["config"] = "defaultProgressive"
        return elem
Exemplo n.º 17
0
 def testRoot(self):
     """Check that each tree in self.trees yields a root with no parent."""
     for tree in self.trees:
         # Wrap the raw graph in an NXTree and look up its root id.
         nxTree = NXTree(tree)
         rootId = nxTree.getRootId()
         # A root must have been found, and it must not have a parent.
         assert rootId is not None
         assert nxTree.getParent(rootId) is None