Пример #1
0
def getCactusWorkflowExperimentForTest(sequences,
                                       newickTreeString,
                                       outputDir,
                                       configFile=None,
                                       constraints=None,
                                       progressive=False,
                                       reconstruct=True):
    """Create an ExperimentWrapper for a test alignment.

    Thin wrapper around ExperimentWrapper.createExperimentWrapper that also
    passes along the module-level database configuration, if one is set.

    The leaves of the tree (in post-order) are paired positionally with
    ``sequences``; each pair is printed and registered on the experiment.
    The root genome is set to "reference" and, when ``reconstruct`` is true,
    the root is marked as reconstructed.

    Returns the configured experiment wrapper.
    """
    halFile = os.path.join(outputDir, "test.hal")
    fastaFile = os.path.join(outputDir, "test.fa")

    if _GLOBAL_DATABASE_CONF_STRING is None:
        databaseConf = None
    else:
        databaseConf = ET.fromstring(_GLOBAL_DATABASE_CONF_STRING)

    # Collect the leaf names in post-order; this order decides which sequence
    # each genome receives below.
    speciesTree = NXNewick().parseString(newickTreeString,
                                         addImpliedRoots=False)
    leafNames = []
    for node in speciesTree.postOrderTraversal():
        if speciesTree.isLeaf(node):
            leafNames.append(speciesTree.getName(node))

    exp = ExperimentWrapper.createExperimentWrapper(
        newickTreeString,
        leafNames,
        outputDir,
        databaseConf=databaseConf,
        configFile=configFile,
        halFile=halFile,
        fastaFile=fastaFile,
        constraints=constraints,
        progressive=progressive)

    for pairing in zip(leafNames, sequences):
        print(pairing)
        exp.setSequenceID(pairing[0], pairing[1])

    exp.setRootGenome("reference")
    if reconstruct:
        exp.setRootReconstructed(True)
    return exp
Пример #2
0
 def testSanity(self):
     """Both fixture newick strings must survive a parse/write round trip."""
     newick = NXNewick()

     # Default subtree size.
     firstTree = MultiCactusTree(
         newick.parseString(self.tree1, addImpliedRoots=False))
     self.assertEqual(NXNewick().writeString(firstTree), self.tree1)

     # Explicit subtree size of 3.
     secondTree = MultiCactusTree(
         newick.parseString(self.tree2, addImpliedRoots=False),
         subtreeSize=3)
     self.assertEqual(NXNewick().writeString(secondTree), self.tree2)
Пример #3
0
def main():
    """Command-line entry point: lift per-branch mutations onto a reference.

    Reads the species tree from the HAL file via ``halStats --tree``, checks
    that the reference genome (and any explicitly requested targets) are
    leaves, creates the output directory if needed, and then, for every
    target, calls liftMutations for each genome on the path from the
    reference to that target, writing into ``<outputDir>/<target>.bed``.

    Raises:
        ValueError: if the reference genome or a requested target is not a
            leaf of the tree.
        OSError: if the output directory cannot be created.
    """
    parser = ArgumentParser(description=__doc__)
    parser.add_argument('hal', help='hal file')
    parser.add_argument('refGenome', help='reference genome')
    parser.add_argument('halTreeMutationsDir',
                        help='the directory output by halTreeMutations.py')
    parser.add_argument(
        '--targets',
        help='target genomes (comma-separated), default: all leaves')
    parser.add_argument('outputDir',
                        help='output directory for reference beds')
    opts = parser.parse_args()

    # Get the species tree from the hal file.
    # NOTE(review): opts.hal is interpolated into the command string
    # unquoted -- confirm how popenCatch executes it if paths may contain
    # spaces or shell metacharacters.
    newickTree = popenCatch('halStats --tree %s' % (opts.hal))
    tree = NXNewick().parseString(newickTree)

    # Set the target genomes to be all leaves (minus the reference) if not
    # otherwise directed.
    leafGenomes = [tree.getName(x) for x in tree.getLeaves()]
    if opts.refGenome not in leafGenomes:
        raise ValueError("Reference genome %s is not a leaf genome." %
                         opts.refGenome)
    if opts.targets is None:
        opts.targets = [x for x in leafGenomes if x != opts.refGenome]
    else:
        opts.targets = opts.targets.split(',')
        if not all(x in leafGenomes for x in opts.targets):
            raise ValueError("Some target genomes are not leaves.")

    # Create the output directory, tolerating one that already exists. Only
    # OSError is caught so that KeyboardInterrupt/SystemExit are never
    # swallowed by this handler.
    try:
        os.makedirs(opts.outputDir)
    except OSError:
        if not os.path.isdir(opts.outputDir):
            raise

    # The reference node is the same for every target.
    refID = getTreeID(tree, opts.refGenome)
    for target in opts.targets:
        targetID = getTreeID(tree, target)
        mrcaName = tree.getName(getMRCA(tree, refID, targetID))
        pathToTarget = getPath(opts.hal, opts.refGenome, target)
        # Split the path at the MRCA: the run before it goes up the tree,
        # the run after it goes down. The MRCA itself is dropped.
        pathUp, pathDown = [
            list(run) for isMrca, run in groupby(
                pathToTarget, lambda x: x == mrcaName) if not isMrca
        ]
        bedForTarget = os.path.join(opts.outputDir, target + '.bed')
        # First, walk up the tree to the MRCA.
        for curGenome in pathUp:
            liftMutations(opts.halTreeMutationsDir,
                          opts.hal,
                          curGenome,
                          opts.refGenome,
                          bedForTarget,
                          reversePolarity=True)
        # Next, walk down the tree to the target.
        for curGenome in pathDown:
            liftMutations(opts.halTreeMutationsDir, opts.hal, curGenome,
                          opts.refGenome, bedForTarget)
Пример #4
0
 def __generateTrees(self):
     """Build the two fixture trees and their MultiCactusTree wrappers.

     Both wrappers get names for unlabeled internal nodes first, then have
     their subtree roots computed.
     """
     self.tree1 = '((((HUMAN:0.006969,CHIMP:0.009727):0.025291,BABOON:0.044568):0.11,(MOUSE:0.072818,RAT:0.081244):0.260342):0.02326,((DOG:0.07,CAT:0.07):0.087381,(PIG:0.06,COW:0.06):0.104728):0.04);'
     self.tree2 = '((raccoon:19.19959,bear:6.80041):0.846,((sea_lion:11.997,seal:12.003):7.52973,((monkey:100.8593,cat:47.14069):20.59201,weasel:18.87953):2.0946):3.87382,dog:25.46154);'

     newick = NXNewick()
     self.mcTree1 = MultiCactusTree(
         newick.parseString(self.tree1, addImpliedRoots=False))
     self.mcTree2 = MultiCactusTree(
         newick.parseString(self.tree2, addImpliedRoots=False))

     fixtures = (self.mcTree1, self.mcTree2)
     for fixture in fixtures:
         fixture.nameUnlabeledInternalNodes()
     for fixture in fixtures:
         fixture.computeSubtreeRoots()
Пример #5
0
    def setUp(self):
        """Prepare random, Boreoeutherian and backbone trees plus dummy FASTAs.

        Random trees with fewer than 50 nodes are wrapped as MultiCactusTrees
        whose nodes are all named "Node<id>" and mapped to one shared
        placeholder FASTA. The two fixed trees are parsed from newick. Finally
        one all-'A' FASTA per Blanchette genome is written to the temp dir.
        """
        unittest.TestCase.setUp(self)
        self.trees = randomTreeSet()
        self.mcTrees = []
        self.tempDir = getTempDirectory(os.getcwd())
        self.tempFa = os.path.join(self.tempDir, "seq.fa")
        with open(self.tempFa, "w") as faHandle:
            faHandle.write(">temp\nNNNNNNNCNNNNAAAAAAAAAAAAAAANNNNNNN\n")

        self.dummySeqMaps = []
        for randomTree in self.trees:
            if randomTree.size() >= 50:
                continue
            wrapped = MultiCactusTree(randomTree)
            nodeToFasta = {}
            for nodeId in wrapped.breadthFirstTraversal():
                nodeName = "Node%s" % str(nodeId)
                wrapped.setName(nodeId, nodeName)
                nodeToFasta[nodeName] = self.tempFa
            wrapped.computeSubtreeRoots()
            wrapped.nameUnlabeledInternalNodes()
            self.mcTrees.append(wrapped)
            self.dummySeqMaps.append(nodeToFasta)

        def parseFixture(newickString):
            # Parse without implied roots, then compute subtree roots and
            # name the remaining unlabeled internal nodes (in that order).
            fixture = MultiCactusTree(NXNewick().parseString(
                newickString, addImpliedRoots=False))
            fixture.computeSubtreeRoots()
            fixture.nameUnlabeledInternalNodes()
            return fixture

        # Boreoeutherian tree
        self.borMcTree = parseFixture(
            '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
        )
        self.mcTrees.append(self.borMcTree)

        # Eutherian backbone tree
        self.backboneTree = parseFixture(
            '(((((((((((Homo_sapiens:0.00655,Pan_troglodytes:0.00684):0.00422,Gorilla_gorilla_gorilla:0.008964):0.009693,Pongo_abelii:0.01894):0.015511,Macaca_mulatta:0.043601):0.08444,Aotus_nancymaae:0.08):0.08,Microcebus_murinus:0.10612):0.043494,Galeopterus_variegatus:0.134937):0.04,((((Jaculus_jaculus:0.1,(Microtus_ochrogaster:0.14,(Mus_musculus:0.084509,Rattus_norvegicus:0.091589):0.047773):0.06015):0.122992,(Heterocephalus_glaber:0.1,(Cavia_porcellus:0.065629,(Chinchilla_lanigera:0.06,Octodon_degus:0.1):0.06):0.05):0.06015):0.05,Marmota_marmota:0.1):0.05,Oryctolagus_cuniculus:0.21569):0.04):0.040593,(((Sus_scrofa:0.12,(Orcinus_orca:0.069688,(Bos_taurus:0.04,Capra_hircus:0.04):0.09):0.045488):0.02,((Equus_caballus:0.109397,(Felis_catus:0.098612,(Canis_lupus_familiaris:0.052458,Mustela_putorius_furo:0.08):0.02):0.049845):0.02,(Pteropus_alecto:0.1,Eptesicus_fuscus:0.08):0.033706):0.03):0.025,Erinaceus_europaeus:0.278178):0.021227):0.023664,(((Loxodonta_africana:0.022242,Procavia_capensis:0.145358):0.076687,Chrysochloris_asiatica:0.04):0.05,Dasypus_novemcinctus:0.169809):0.02)backbone_root:0.234728,(Monodelphis_domestica:0.125686,Sarcophilus_harrisii:0.12):0.2151);'
        )
        self.mcTrees.append(self.backboneTree)

        # Sequence length for each Blanchette genome.
        seqLens = {
            "HUMAN": 57553,
            "CHIMP": 57344,
            "BABOON": 58960,
            "MOUSE": 32750,
            "RAT": 38436,
            "DOG": 54187,
            "CAT": 50283,
            "PIG": 54843,
            "COW": 55508,
        }
        self.blanchetteSeqMap = {}
        for event, seqLen in seqLens.items():
            fastaPath = os.path.join(self.tempDir, event + ".fa")
            with open(fastaPath, "w") as out:
                out.write(">%s\n%s\n" % (event, 'A' * seqLen))
            self.blanchetteSeqMap[event] = fastaPath
Пример #6
0
 def __generateTrees(self):
     """Build the fixture trees, including size-limited variants.

     ``mcTree1a`` wraps the same newick as ``mcTree1`` but with
     subtreeSize=4; ``mcTree2`` uses subtreeSize=3. All three get names for
     unlabeled internal nodes, then have their subtree roots computed.
     """
     self.tree1 = "((((HUMAN:0.006969,CHIMP:0.009727):0.025291,BABOON:0.044568):0.11,(MOUSE:0.072818,RAT:0.081244):0.260342):0.02326,((DOG:0.07,CAT:0.07):0.087381,(PIG:0.06,COW:0.06):0.104728):0.04);"
     self.tree2 = "((raccoon:19.19959,bear:6.80041):0.846,((sea_lion:11.997,seal:12.003):7.52973,((monkey:100.8593,cat:47.14069):20.59201,weasel:18.87953):2.0946):3.87382,dog:25.46154);"

     newick = NXNewick()
     self.mcTree1 = MultiCactusTree(
         newick.parseString(self.tree1, addImpliedRoots=False))
     self.mcTree1a = MultiCactusTree(
         newick.parseString(self.tree1, addImpliedRoots=False),
         subtreeSize=4)
     self.mcTree2 = MultiCactusTree(
         newick.parseString(self.tree2, addImpliedRoots=False),
         subtreeSize=3)

     fixtures = (self.mcTree1, self.mcTree1a, self.mcTree2)
     for fixture in fixtures:
         fixture.nameUnlabeledInternalNodes()
     for fixture in fixtures:
         fixture.computeSubtreeRoots()
Пример #7
0
    def testNewickIO(self):
        """Round-trip sample newick strings through the parser and writer.

        Each tree is checked twice: once parsed with implied roots added
        (compared against the class's own cleaned form of the input), and
        once without (compared against the input with whitespace and any
        trailing root branch length removed).
        """
        # Felsenstein's own examples
        # (http://evolution.genetics.washington.edu/phylip/newicktree.html)
        trees = [
            '((raccoon:19.19959,bear:6.80041):0.846,((sea_lion:11.997, seal:12.003):7.52973,((monkey:100.8593,cat:47.14069):20.59201, weasel:18.87953):2.0946):3.87382,dog:25.46154);',
            '(Bovine:0.69395,(Gibbon:0.36079,(Orang:0.33636,(Gorilla:0.17147,(Chimp:0.19268, Human:0.11927):0.08386):0.06124):0.15057):0.54939,Mouse:1.2146):0.1;',
            '(Bovine:0.69395,(Hylobates:0.36079,(Pongo:0.33636,(G._Gorilla:0.17147, (P._paniscus:0.19268,H._sapiens:0.11927):0.08386):0.06124):0.15057):0.54939, Rodent:1.2146);',
            'A;',
            '((A,B):0.0,(C,D));',
            '(Alpha,Beta,Gamma,Delta,,Epsilon,,,);',
        ]
        newickParser = NXNewick()

        # Pass 1: implied roots are added while parsing.
        for newickString in trees:
            newickParser.parseString(newickString, addImpliedRoots=True)
            expected = self.__cleanTree(newickString)
            written = newickParser.writeString()
            logger.debug(" ***************** ")
            logger.debug(written)
            logger.debug(expected)
            assert written == expected

        # Pass 2: no implied roots.
        for newickString in trees:
            newickParser.parseString(newickString, addImpliedRoots=False)
            written = newickParser.writeString()
            # Drop a branch length sitting directly before the final ';',
            # then remove all whitespace.
            expected = re.sub(r'\s+', '',
                              re.sub(r':[.0-9]+?;', ';', newickString))
            logger.debug(" ***************** ")
            logger.debug(written)
            logger.debug(expected)
            assert written == expected
def cleanEventTree(experiment):
    """Normalize event names in the experiment's species tree.

    Works on a MultiCactusTree copy built from ``experiment.getTree()``:
    unlabeled internal nodes are named, every '.' in a name becomes '_',
    and duplicate names are made unique by appending an increasing integer
    suffix. A warning is written to stderr for each rename. The resulting
    newick string is stored in the experiment XML's "species_tree"
    attribute and the experiment's sequence map is rebuilt.

    Raises:
        RuntimeError: if a named non-root node has no branch length.
    """
    tree = MultiCactusTree(experiment.getTree())
    tree.nameUnlabeledInternalNodes()

    # Pass 1: replace dots and require a branch length on every named,
    # non-root node.
    for node in tree.breadthFirstTraversal():
        if not tree.hasName(node):
            continue
        currentName = tree.getName(node)
        if '.' in currentName:
            dotFree = currentName.replace('.', '_')
            sys.stderr.write('WARNING renaming event %s to %s\n' %(currentName, dotFree))
            tree.setName(node, dotFree)
        parent = tree.getParent(node)
        if parent is not None and tree.getWeight(parent, node) is None:
            raise RuntimeError('Missing branch length in species_tree tree')

    # Pass 2: repeat full pairwise sweeps until one completes with no
    # rename. The outer node's name is read once per outer iteration, so a
    # rename inside the inner loop does not change what is compared against.
    nextSuffix = 0
    renamedSomething = True
    while renamedSomething:
        renamedSomething = False
        for first in tree.breadthFirstTraversal():
            firstName = tree.getName(first)
            for second in tree.breadthFirstTraversal():
                secondName = tree.getName(second)
                if first == second or firstName != secondName:
                    continue
                uniqueName = "%s%i" % (secondName, nextSuffix)
                nextSuffix += 1
                tree.setName(second, uniqueName)
                sys.stderr.write('WARNING renaming event %s to %s\n' % (
                    secondName, uniqueName))
                renamedSomething = True

    experiment.xmlRoot.attrib["species_tree"] = NXNewick().writeString(tree)
    experiment.seqMap = experiment.buildSequenceMap()
Пример #9
0
 def setUp(self):
     """Create an ExperimentWrapper over a nine-genome tree rooted at 'anc1'.

     The wrapper is built from a dummy XML root, given the parsed tree, a
     reconstructed root genome, six outgroup genomes and one placeholder
     sequence ID per leaf genome.
     """
     unittest.TestCase.setUp(self)
     newickString = '((((HUMAN:0.006969,CHIMP:0.009727)anc2:0.025291,BABOON:0.044568)anc1:0.11,(MOUSE:0.072818,RAT:0.081244):0.260342):0.02326,((DOG:0.07,CAT:0.07):0.087381,(PIG:0.06,COW:0.06):0.104728):0.04);'
     self.tree = NXNewick().parseString(newickString)
     self.xmlRoot = self.__makeXmlDummy()
     self.exp = ExperimentWrapper(self.xmlRoot)
     self.exp.setTree(self.tree)

     # Every genome maps to "<lowercased name>.txt".
     leafGenomes = ('HUMAN', 'CHIMP', 'BABOON', 'MOUSE', 'RAT', 'DOG',
                    'CAT', 'PIG', 'COW')
     self.seqMap = {
         genome: genome.lower() + '.txt' for genome in leafGenomes
     }

     self.exp.setRootGenome('anc1')
     self.exp.setRootReconstructed(True)
     self.exp.setOutgroupGenomes(
         ['MOUSE', 'RAT', 'DOG', 'CAT', 'PIG', 'COW'])

     # The values are file names rather than real IDs, which is enough for
     # these tests.
     for genome in self.seqMap:
         self.exp.setSequenceID(genome, self.seqMap[genome])
Пример #10
0
    def readXML(self, path):
        """Load this project's state from the XML file at ``path``.

        Populates ``mcTree`` from the <tree> element, ``expMap`` (and
        ``expIDMap`` where an "experiment_id" is present) from the <cactus>
        elements, and ``inputSequenceMap`` from the root attributes. The
        input/output sequence ID maps and ``configID`` are only assigned
        when the corresponding root attributes exist. Finally the subtree
        root names are assigned on the tree from ``expMap``.
        """
        xmlRoot = ET.parse(path).getroot()
        rootAttrs = xmlRoot.attrib

        def pairAttributes(namesKey, valuesKey):
            # Both attributes are whitespace-separated lists paired by
            # position.
            return dict(
                zip(rootAttrs[namesKey].split(),
                    rootAttrs[valuesKey].split()))

        newickText = xmlRoot.find("tree").text
        self.mcTree = MultiCactusTree(NXNewick().parseString(
            newickText, addImpliedRoots=False))

        self.expMap = {}
        self.expIDMap = {}
        for cactusElem in xmlRoot.findall("cactus"):
            expName = cactusElem.attrib["name"]
            self.expMap[expName] = cactusElem.attrib["experiment_path"]
            if "experiment_id" in cactusElem.attrib:
                self.expIDMap[expName] = cactusElem.attrib["experiment_id"]

        self.inputSequenceMap = pairAttributes("inputSequenceNames",
                                               "inputSequences")
        if "inputSequenceIDs" in rootAttrs:
            self.inputSequenceIDMap = pairAttributes("inputSequenceIDNames",
                                                     "inputSequenceIDs")
        if "outputSequenceIDs" in rootAttrs:
            self.outputSequenceIDMap = pairAttributes("outputSequenceNames",
                                                      "outputSequenceIDs")

        logger.info("xmlRoot = %s" % ET.tostring(xmlRoot))
        if "configID" in rootAttrs:
            self.configID = rootAttrs["configID"]

        self.mcTree.assignSubtreeRootNames(self.expMap)
Пример #11
0
 def progressiveFunction(self,
                         experimentFile,
                         toilDir,
                         batchSystem,
                         buildAvgs,
                         buildHal,
                         buildFasta,
                         toilStats,
                         subtreeRoot=None,
                         logLevel=None):
     """Run progressive cactus on the experiment described by an XML file.

     A temporary seqFile is written: the newick tree on the first line,
     then one "<genome> <sequence ID>" line per genome that has a sequence.
     That file and the experiment's config path are passed to
     runCactusProgressive.

     ``buildHal``, ``buildFasta`` and ``subtreeRoot`` are accepted but not
     used by this body.
     """
     experiment = ExperimentWrapper(ET.parse(experimentFile).getroot())
     seqFile = getTempFile()
     with open(seqFile, 'w') as out:
         out.write('%s\n' % NXNewick().writeString(experiment.getTree()))
         out.writelines(
             '%s %s\n' % (genome, experiment.getSequenceID(genome))
             for genome in experiment.getGenomesWithSequence())
     runCactusProgressive(seqFile,
                          experiment.getConfigPath(),
                          toilDir,
                          batchSystem=batchSystem,
                          buildAvgs=buildAvgs,
                          toilStats=toilStats,
                          logLevel=logLevel)
Пример #12
0
    def writeXML(self, path):
        """Write this project to ``path`` as pretty-printed XML.

        The document has a <multi_cactus> root holding a <tree> element with
        the newick string of ``self.mcTree`` and one <cactus> element per
        entry of ``self.expMap``. Input sequences (and, when non-empty, the
        input/output sequence ID maps and the config ID) are stored as
        space-separated attributes on the root.

        The XML text is built completely before the file is opened, so a
        serialization failure leaves any existing file at ``path`` untouched,
        and the handle is always closed.

        Raises:
            KeyError: if ``self.expIDMap`` is non-empty but has no entry for
                one of the names in ``self.expMap``.
        """
        xmlRoot = ET.Element("multi_cactus")
        treeElem = ET.Element("tree")
        treeElem.text = NXNewick().writeString(self.mcTree)
        xmlRoot.append(treeElem)
        for name, expPath in self.expMap.items():
            cactusPathElem = ET.Element("cactus")
            cactusPathElem.attrib["name"] = name
            cactusPathElem.attrib["experiment_path"] = expPath
            if self.expIDMap:
                cactusPathElem.attrib["experiment_id"] = self.expIDMap[name]
            xmlRoot.append(cactusPathElem)
        # We keep track of all the input sequences at the top level
        xmlRoot.attrib["inputSequences"] = " ".join(
            self.inputSequenceMap.values())
        xmlRoot.attrib["inputSequenceNames"] = " ".join(
            self.inputSequenceMap.keys())
        if self.inputSequenceIDMap:
            xmlRoot.attrib["inputSequenceIDs"] = " ".join(
                self.inputSequenceIDMap.values())
            xmlRoot.attrib["inputSequenceIDNames"] = " ".join(
                self.inputSequenceIDMap.keys())
        if self.outputSequenceIDMap:
            xmlRoot.attrib["outputSequenceIDs"] = " ".join(
                self.outputSequenceIDMap.values())
            xmlRoot.attrib["outputSequenceNames"] = " ".join(
                self.outputSequenceIDMap.keys())
        if self.configID:
            xmlRoot.attrib["configID"] = self.configID

        # Serialize first, then write under a context manager so the file is
        # closed even if the write fails.
        xmlString = minidom.parseString(ET.tostring(xmlRoot)).toprettyxml()
        with open(path, "w") as xmlFile:
            xmlFile.write(xmlString)
def createFileStructure(mcProj, expTemplate, configTemplate, options):
    """Write the project XML and one experiment XML per subproblem.

    Creates ``options.path`` if it is missing and writes
    ``<options.name>_project.xml`` there. Then, for each entry of
    ``mcProj.expMap``, builds an experiment over the node, its children and
    its outgroups (taken from ``mcProj.outgroup.ogMap`` unless the outgroup
    strategy is 'none'), creates the per-node directory, and writes the
    experiment to its path. The root is marked reconstructed exactly when
    the template has no sequence ID for it.
    """
    if not os.path.exists(options.path):
        os.makedirs(options.path)
    projectXml = os.path.join(options.path, "%s_project.xml" % options.name)
    mcProj.writeXML(projectXml)

    for name, expPath in list(mcProj.expMap.items()):
        expDir = os.path.join(options.path, name)
        children = mcProj.entireTree.getChildNames(name)

        # Outgroup names are the first element of each ogMap tuple.
        outgroups = []
        if configTemplate.getOutgroupStrategy() != 'none' \
        and name in mcProj.outgroup.ogMap:
            outgroups.extend([og[0] for og in mcProj.outgroup.ogMap[name]])

        subtree = mcProj.entireTree.extractSpanningTree(
            children + [name] + outgroups)
        exp = ExperimentWrapper.createExperimentWrapper(
            NXNewick().writeString(subtree),
            children + [name] + outgroups,
            databaseConf=expTemplate.confElem)

        exp.setRootGenome(name)
        exp.setOutgroupGenomes(outgroups)

        if not os.path.exists(expDir):
            os.makedirs(expDir)
        # NOTE(review): `config` is not referenced again in this function --
        # confirm whether the copy is still needed.
        config = ConfigWrapper(copy.deepcopy(configTemplate.xmlRoot))
        if not expTemplate.getSequenceID(name):
            exp.setRootReconstructed(True)
        else:
            exp.setRootReconstructed(False)
            exp.setSequenceID(name, expTemplate.getSequenceID(name))
        exp.writeXML(expPath)
Пример #14
0
    def progressiveWithSubtreeRootFunction(self, experimentFile, toilDir,
                                           batchSystem, buildAvgs,
                                           buildReference, buildHal,
                                           buildFasta, toilStats):
        """Run the progressive alignment on a randomly chosen subtree.

        Picks one named internal node of the experiment's species tree at
        random and forwards it, together with the other arguments, to
        progressiveFunction. It exists so that
        runWorkflow_multipleExamples can stay general (a subtree root makes
        no sense for runCactusWorkflow).
        """
        experiment = ExperimentWrapper(ET.parse(experimentFile).getroot())
        speciesTree = experiment.getTree()

        # Candidate roots: every named node that is not a leaf.
        candidates = [
            speciesTree.getName(node)
            for node in speciesTree.postOrderTraversal()
            if speciesTree.hasName(node) and not speciesTree.isLeaf(node)
        ]

        # NB: the root of the whole species tree is a valid choice too.
        subtreeRoot = random.choice(candidates)
        logger.info("Chose subtree root %s to test from species tree "
                    "%s" % (subtreeRoot, NXNewick().writeString(speciesTree)))

        # NOTE(review): the arguments are passed positionally -- confirm the
        # order matches the signature of progressiveFunction in this class.
        self.progressiveFunction(experimentFile, toilDir, batchSystem,
                                 buildAvgs, buildReference, buildHal,
                                 buildFasta, toilStats, subtreeRoot)
Пример #15
0
    def testExtractSpanningTree(self):
        """Extracting a binary spanning tree gives the expected newick."""
        writer = NXNewick()
        originalNewick = writer.writeString(self.mcTree1)

        # Simple case: three closely related leaves.
        closeLeaves = self.mcTree1.extractSpanningTree(
            ["HUMAN", "CHIMP", "BABOON"])
        # The source tree must not have been modified by the extraction.
        self.assertEqual(NXNewick().writeString(self.mcTree1), originalNewick)
        self.assertEqual(
            NXNewick().writeString(closeLeaves),
            "((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc3;")

        # Harder case: distant leaves. As many intermediate ancestors as
        # possible should be removed, since they would only add extra losses.
        distantLeaves = self.mcTree1.extractSpanningTree(
            ["HUMAN", "CHIMP", "CAT"])
        self.assertEqual(NXNewick().writeString(self.mcTree1), originalNewick)
        self.assertEqual(
            NXNewick().writeString(distantLeaves),
            "((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.158551,CAT:0.197381)Anc0;")
Пример #16
0
 def testSetTree(self):
     """After setTree, only HUMAN, CHIMP and BABOON are reported as having a sequence."""
     # A modified tree: fewer genomes than the fixture, plus a new one
     # (ARMADILLO).
     smallerNewick = '((HUMAN:0.006969,CHIMP:0.009727):0.025291,BABOON:0.044568,ARMADILLO:1.0);'
     self.exp.setTree(NXNewick().parseString(smallerNewick))

     withSequence = set(self.exp.getGenomesWithSequence())
     self.assertEqual(withSequence, {'HUMAN', 'CHIMP', 'BABOON'})
Пример #17
0
 def testAddSelf(self):
     """addSelfEdges must produce the expected '<name>_self' newick."""
     expected = '((((((((HUMAN:0.006969)HUMAN_self:0.006969,(CHIMP:0.009727)CHIMP_self:0.009727)Anc7:0.025291)Anc7_self:0.025291,(BABOON:0.044568)BABOON_self:0.044568)Anc3:0.11)Anc3_self:0.11,(((MOUSE:0.072818)MOUSE_self:0.072818,(RAT:0.081244)RAT_self:0.081244)Anc4:0.260342)Anc4_self:0.260342)Anc1:0.02326)Anc1_self:0.02326,(((((DOG:0.07)DOG_self:0.07,(CAT:0.07)CAT_self:0.07)Anc5:0.087381)Anc5_self:0.087381,(((PIG:0.06)PIG_self:0.06,(COW:0.06)COW_self:0.06)Anc6:0.104728)Anc6_self:0.104728)Anc2:0.04)Anc2_self:0.04)Anc0;'

     # Work on a fresh MultiCactusTree built from the fixture.
     selfTree = MultiCactusTree(self.mcTree1)
     selfTree.nameUnlabeledInternalNodes()
     selfTree.computeSubtreeRoots()
     selfTree.addSelfEdges()

     self.assertEqual(NXNewick().writeString(selfTree), expected)
Пример #18
0
def exportHal(job, project, event=None, cacheBytes=None, cacheMDC=None, cacheRDC=None, cacheW0=None, chunk=None, deflate=None, inMemory=True,
              checkpointInfo=None):
    """Assemble a HAL file from the project's per-experiment results.

    Walks ``project.mcTree`` breadth-first (from ``event`` when given,
    which must name a non-leaf node) and, for every node that has an entry
    in ``project.expMap``, runs ``halAppendCactusSubtree`` with that
    experiment's HAL, FASTA and tree string. The optional cache/chunk/
    deflate settings are forwarded as command-line flags when not None.
    Afterwards the commit and the base64-encoded config are stored as HAL
    metadata, the file is optionally copied via ``write_s3`` when
    ``checkpointInfo`` is set, and it is written to the job's file store.

    Returns the value of ``job.fileStore.writeGlobalFile`` for the HAL file.
    """
    HALPath = "tmp_alignment.hal"
    tree = project.mcTree

    # Start from the named event when one is specified.
    rootNode = None
    if event is not None:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]

    # Optional flags, in the order they are appended to the command line.
    optionalFlags = (
        ("--cacheBytes", cacheBytes),
        ("--cacheMDC", cacheMDC),
        ("--cacheRDC", cacheRDC),
        ("--cacheW0", cacheW0),
        ("--chunk", chunk),
        ("--deflate", deflate),
    )

    # Breadth-first traversal.
    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName not in project.expMap:
            continue

        experimentFilePath = job.fileStore.readGlobalFile(project.expIDMap[genomeName])
        experiment = ExperimentWrapper(ET.parse(experimentFilePath).getroot())

        outgroups = experiment.getOutgroupGenomes()
        experiment.setConfigPath(job.fileStore.readGlobalFile(experiment.getConfigID()))
        expTreeString = NXNewick().writeString(experiment.getTree(onlyThisSubtree=True))
        assert len(expTreeString) > 1
        assert experiment.getHalID() is not None
        assert experiment.getHalFastaID() is not None
        subHALPath = job.fileStore.readGlobalFile(experiment.getHalID())
        halFastaPath = job.fileStore.readGlobalFile(experiment.getHalFastaID())

        args = [
            os.path.basename(subHALPath),
            os.path.basename(halFastaPath),
            expTreeString,
            os.path.basename(HALPath),
        ]
        if len(outgroups) > 0:
            args.extend(["--outgroups", ",".join(outgroups)])
        for flag, value in optionalFlags:
            if value is not None:
                args.extend([flag, value])
        if inMemory is True:
            args.append("--inMemory")

        cactus_call(parameters=["halAppendCactusSubtree"] + args)

    cactus_call(parameters=["halSetMetadata", HALPath, "CACTUS_COMMIT", cactus_commit])
    with job.fileStore.readGlobalFileStream(project.configID) as configFile:
        encodedConfig = b64encode(configFile.read()).decode()
        cactus_call(parameters=["halSetMetadata", HALPath, "CACTUS_CONFIG", encodedConfig])

    if checkpointInfo:
        write_s3(HALPath, checkpointInfo[1], region=checkpointInfo[0])

    return job.fileStore.writeGlobalFile(HALPath)
Пример #19
0
 def testNewickIO(self):
     """Round-trip sample newick strings through the parser and writer.

     Each tree is checked twice: once parsed with implied roots added
     (compared against the class's own cleaned form of the input), and
     once without (compared against the input with whitespace and any
     trailing root branch length removed).
     """
     # Felsenstein's own examples
     # (http://evolution.genetics.washington.edu/phylip/newicktree.html)
     trees = [
         '((raccoon:19.19959,bear:6.80041):0.846,((sea_lion:11.997, seal:12.003):7.52973,((monkey:100.8593,cat:47.14069):20.59201, weasel:18.87953):2.0946):3.87382,dog:25.46154);',
         '(Bovine:0.69395,(Gibbon:0.36079,(Orang:0.33636,(Gorilla:0.17147,(Chimp:0.19268, Human:0.11927):0.08386):0.06124):0.15057):0.54939,Mouse:1.2146):0.1;',
         '(Bovine:0.69395,(Hylobates:0.36079,(Pongo:0.33636,(G._Gorilla:0.17147, (P._paniscus:0.19268,H._sapiens:0.11927):0.08386):0.06124):0.15057):0.54939, Rodent:1.2146);',
         'A;',
         '((A,B):0.0,(C,D));',
         '(Alpha,Beta,Gamma,Delta,,Epsilon,,,);',
     ]
     newickParser = NXNewick()

     # Pass 1: implied roots are added while parsing.
     for newickString in trees:
         newickParser.parseString(newickString, addImpliedRoots=True)
         expected = self.__cleanTree(newickString)
         written = newickParser.writeString()
         logger.debug(" ***************** ")
         logger.debug(written)
         logger.debug(expected)
         assert written == expected

     # Pass 2: no implied roots.
     for newickString in trees:
         newickParser.parseString(newickString, addImpliedRoots=False)
         written = newickParser.writeString()
         # Drop a branch length sitting directly before the final ';',
         # then remove all whitespace.
         expected = re.sub(r'\s+', '',
                           re.sub(r':[.0-9]+?;', ';', newickString))
         logger.debug(" ***************** ")
         logger.debug(written)
         logger.debug(expected)
         assert written == expected
Пример #20
0
    def testSequenceMap(self):
        """The wrapper must echo the tree and map each genome to its file."""
        dummyRoot = self.__makeXmlDummy(self.tree, self.sequences)
        exp = ExperimentWrapper(dummyRoot)
        assert NXNewick().writeString(exp.getTree()) == self.tree

        seqMap = exp.buildSequenceMap()
        # Each file is looked up under its upper-cased base name.
        for seqPath in self.sequences.split():
            genome = os.path.splitext(seqPath)[0].upper()
            assert seqMap[genome] == seqPath
Пример #21
0
    def testAddOutgroup(self):
        """addOutgroup on the full fixture tree and on a single-leaf tree."""
        # Full tree: the outgroup shows up as an extra child of the root.
        expectedFull = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc3:0.11,(MOUSE:0.072818,RAT:0.081244)Anc4:0.260342)Anc1:0.02326,((DOG:0.07,CAT:0.07)Anc5:0.087381,(PIG:0.06,COW:0.06)Anc6:0.104728)Anc2:0.04,outgroup:1.7)Anc0;'
        fullTree = MultiCactusTree(self.mcTree1)
        fullTree.nameUnlabeledInternalNodes()
        fullTree.computeSubtreeRoots()
        fullTree.addOutgroup("outgroup", 1.7)
        self.assertEqual(NXNewick().writeString(fullTree), expectedFull)

        # Single-leaf tree: a distance of 2.2 is expected to appear as 1.1
        # on each of the two resulting branches.
        expectedLeaf = "(A:1.1,outgroup:1.1);"
        leafTree = MultiCactusTree(
            NXNewick().parseString("A;", addImpliedRoots=False))
        leafTree.nameUnlabeledInternalNodes()
        leafTree.computeSubtreeRoots()
        leafTree.addOutgroup("outgroup", 2.2)
        self.assertEqual(NXNewick().writeString(leafTree), expectedLeaf)
Пример #22
0
    def testAddOutgroup(self):
        """addOutgroup on the full fixture tree and on a single-leaf tree."""
        # Full tree: the outgroup shows up as an extra child of the root.
        expectedFull = "((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc3:0.11,(MOUSE:0.072818,RAT:0.081244)Anc4:0.260342)Anc1:0.02326,((DOG:0.07,CAT:0.07)Anc5:0.087381,(PIG:0.06,COW:0.06)Anc6:0.104728)Anc2:0.04,outgroup:1.7)Anc0;"
        fullTree = MultiCactusTree(self.mcTree1)
        fullTree.nameUnlabeledInternalNodes()
        fullTree.computeSubtreeRoots()
        fullTree.addOutgroup("outgroup", 1.7)
        self.assertEqual(NXNewick().writeString(fullTree), expectedFull)

        # Single-leaf tree: a distance of 2.2 is expected to appear as 1.1
        # on each of the two resulting branches.
        expectedLeaf = "(A:1.1,outgroup:1.1);"
        leafTree = MultiCactusTree(
            NXNewick().parseString("A;", addImpliedRoots=False))
        leafTree.nameUnlabeledInternalNodes()
        leafTree.computeSubtreeRoots()
        leafTree.addOutgroup("outgroup", 2.2)
        self.assertEqual(NXNewick().writeString(leafTree), expectedLeaf)
Пример #23
0
    def parseFile(self, path):
        """Parse a seqFile: an optional newick tree plus "name path" lines.

        Resets ``self.tree``, ``self.pathMap`` and ``self.outgroups`` and
        then reads ``path`` line by line:

        * blank lines and lines starting with '#' are ignored;
        * while no tree has been read, a line that is a single token or
          starts with '(' is parsed as the newick tree;
        * a line whose first token is a lone '*' is skipped with a message
          on stderr;
        * any other line with at least two tokens is a "name path" entry.
          A leading '*' on the name marks the genome as an outgroup, and
          the remaining tokens are joined with single spaces to form the
          path;
        * anything else is skipped with a message on stderr.

        If no tree line was found, ``self.starTree()`` is called. Finally
        ``self.cleanTree()`` and ``self.validate()`` are run.

        Raises:
            RuntimeError: if the file does not exist, the tree line does
                not end in ");", the tree cannot be parsed, or a genome
                name occurs twice.
        """
        if not os.path.isfile(path):
            raise RuntimeError("File not found: %s" % path)
        self.tree = None
        self.pathMap = dict()
        self.outgroups = []
        # The context manager guarantees the handle is closed, including
        # when one of the errors below is raised part-way through.
        with open(path, "r") as seqFile:
            for l in seqFile:
                line = l.strip()
                if not line or line[0] == "#":
                    continue
                # `line` is non-empty here, so `tokens` has at least one
                # element.
                tokens = line.split()
                if self.tree is None and (len(tokens) == 1 or line[0] == '('):
                    if not line.endswith(");"):
                        raise RuntimeError("The newick tree %s may not "
                                           "have a branch length after "
                                           "the root node." % line)
                    try:
                        self.tree = NXNewick().parseString(line)
                    except Exception:
                        # Catch Exception rather than everything, so that
                        # KeyboardInterrupt/SystemExit still propagate.
                        raise RuntimeError("Failed to parse newick tree: %s" %
                                           line)
                elif tokens[0] == '*':
                    sys.stderr.write("Skipping line %s\n" % l)
                elif line[0] != '(' and len(tokens) >= 2:
                    name = tokens[0]
                    if name[0] == '*':
                        name = name[1:]
                        self.outgroups.append(name)
                    # str.join works on both Python 2 and 3 (string.join was
                    # removed in Python 3). A separate local keeps the
                    # `path` argument intact.
                    seqPath = " ".join(tokens[1:])
                    if name in self.pathMap:
                        raise RuntimeError("Duplicate name found: %s" % name)
                    self.pathMap[name] = seqPath
                else:
                    sys.stderr.write("Skipping line %s\n" % l)

        if self.tree is None:
            self.starTree()
        self.cleanTree()
        self.validate()
Пример #24
0
 def parseFile(self, path):
     """Parse the sequence file at `path` into self.tree, self.pathMap
     and self.outgroups.

     Line handling, in order of precedence:
       * blank lines and lines starting with '#' are ignored;
       * while no tree has been read, a line that is a single token or
         starts with '(' is parsed as the newick tree;
       * a line whose first token is a lone '*' is skipped (reported on
         stderr);
       * a line not starting with '(' with at least two tokens is read as
         "name path"; a leading '*' on the name marks the genome as an
         outgroup and is stripped, and the remaining tokens are re-joined
         with single spaces to form the path;
       * anything else is skipped (reported on stderr).

     If no tree line was found, self.starTree() is called.  Finally
     self.cleanTree() and self.validate() are called.

     Raises RuntimeError if the file does not exist, if the tree line does
     not end with ");", if the tree cannot be parsed, or if a name occurs
     more than once.
     """
     if not os.path.isfile(path):
         raise RuntimeError("File not found: %s" % path)
     self.tree = None
     self.pathMap = dict()
     self.outgroups = []
     # The context manager closes the handle even when a line is rejected
     # with an exception.
     with open(path, "r") as seqFile:
         for l in seqFile:
             line = l.strip()
             if not line or line[0] == "#":
                 continue
             # line is non-empty after stripping, so tokens has >= 1 entry.
             tokens = line.split()
             if self.tree is None and (len(tokens) == 1 or line[0] == '('):
                 newickParser = NXNewick()
                 if not line.endswith(");"):
                     raise RuntimeError("The newick tree %s may not "
                                        "have a branch length after "
                                        "the root node." % line)
                 try:
                     self.tree = newickParser.parseString(line)
                 except Exception:
                     # Not a bare except: KeyboardInterrupt and SystemExit
                     # must still propagate.
                     raise RuntimeError("Failed to parse newick tree: %s" %
                                        line)
             elif tokens[0] == '*':
                 sys.stderr.write("Skipping line %s\n" % l)
             elif line[0] != '(' and len(tokens) >= 2:
                 name = tokens[0]
                 if name[0] == '*':
                     name = name[1:]
                     self.outgroups.append(name)
                 # str.join works on Python 2 and 3 (string.join is
                 # Python-2-only); a separate local avoids clobbering the
                 # `path` argument.
                 seqPath = " ".join(tokens[1:])
                 if name in self.pathMap:
                     raise RuntimeError("Duplicate name found: %s" % name)
                 self.pathMap[name] = seqPath
             else:
                 sys.stderr.write("Skipping line %s\n" % l)

     if self.tree is None:
         self.starTree()
     self.cleanTree()
     self.validate()
Пример #25
0
    def __str__(self):
        """Return the text form of this object.

        The first line is self.tree written as newick; it is followed by one
        "name<TAB>path" line per entry of self.pathMap, with '*' prepended to
        names listed in self.outgroups.
        """
        outgroupNames = set(self.outgroups)

        pieces = [NXNewick().writeString(self.tree), '\n']
        for genome, seqPath in self.pathMap.items():
            marker = '*' if genome in outgroupNames else ''
            pieces.append('{}{}\t{}\n'.format(marker, genome, seqPath))

        return ''.join(pieces)
Пример #26
0
 def getTree(self, onlyThisSubtree=False):
     """Return the tree parsed from the "species_tree" attribute of xmlRoot.

     When onlyThisSubtree is true, the result is instead the subtree
     extracted at self.getRootGenome().
     """
     newick = self.xmlRoot.attrib["species_tree"]
     speciesTree = NXNewick().parseString(newick, addImpliedRoots=False)
     if not onlyThisSubtree:
         return speciesTree

     # Restrict the result to the reference node and its children, rather
     # than a species tree that also includes the outgroups.
     mcTree = MultiCactusTree(speciesTree)
     mcTree.nameUnlabeledInternalNodes()
     mcTree.computeSubtreeRoots()
     return mcTree.extractSubTree(self.getRootGenome())
Пример #27
0
 def toXMLElement(self):
     """Build a "cactus_workflow_experiment" XML element from this object.

     The "sequences" attribute holds the pathMap entry of every leaf of
     self.tree in post-order, "species_tree" holds the newick form of
     self.tree, and "config" is set to "defaultProgressive".
     """
     assert self.tree is not None
     root = ET.Element("cactus_workflow_experiment")
     leafPaths = [
         self.pathMap[self.tree.getName(nodeId)]
         for nodeId in self.tree.postOrderTraversal()
         if self.tree.isLeaf(nodeId)
     ]
     # Every path, including the last one, is followed by a single space.
     root.attrib["sequences"] = "".join(leafPath + " " for leafPath in leafPaths)
     root.attrib["species_tree"] = NXNewick().writeString(self.tree)
     root.attrib["config"] = "defaultProgressive"
     return root
Пример #28
0
 def toXMLElement(self):
     """Build a "cactus_workflow_experiment" XML element from this object.

     Each node of self.tree (visited in post-order) whose name is a key of
     self.pathMap gets a <genome> child carrying "name" and "sequence"
     attributes.  "species_tree" holds the newick form of self.tree and
     "config" is set to "defaultProgressive".
     """
     assert self.tree is not None
     root = ET.Element("cactus_workflow_experiment")
     for nodeId in self.tree.postOrderTraversal():
         genome = self.tree.getName(nodeId)
         if genome not in self.pathMap:
             # Nodes without a sequence path get no <genome> element.
             continue
         seqPath = self.pathMap[genome]
         ET.SubElement(root, "genome", {'name': genome, 'sequence': seqPath})
     root.attrib["species_tree"] = NXNewick().writeString(self.tree)
     root.attrib["config"] = "defaultProgressive"
     return root
Пример #29
0
 def testSanity(self):
     """Check that self.tree1 and self.tree2 survive a parse/write round trip
     through MultiCactusTree unchanged.
     """
     parser = NXNewick()

     def roundTrip(newick):
         # Parse without implied roots, wrap, and serialise again.
         mcTree = MultiCactusTree(parser.parseString(newick, addImpliedRoots=False))
         return NXNewick().writeString(mcTree)

     self.assertEqual(roundTrip(self.tree1), self.tree1)
     self.assertEqual(roundTrip(self.tree2), self.tree2)
Пример #30
0
    def testSubtrees(self):
        """Check the subtree root names of mcTree1/mcTree2 and the newick
        string of one extracted subtree per tree.
        """
        # (tree, expected subtree roots, ancestor to extract, expected newick)
        cases = [
            (self.mcTree1,
             ["Anc0", "Anc1", "Anc2", "Anc3", "Anc4", "Anc5", "Anc6", "Anc7"],
             "Anc3",
             '(Anc7:0.025291,BABOON:0.044568)Anc3;'),
            (self.mcTree2,
             ["Anc0", "Anc1", "Anc2", "Anc3", "Anc4", "Anc5"],
             "Anc5",
             '(monkey:100.8593,cat:47.14069)Anc5;'),
        ]

        for mcTree, expectedRoots, ancestor, expectedNewick in cases:
            # Root names are compared order-insensitively.
            rootNames = mcTree.getSubtreeRootNames()
            self.assertEqual(sorted(rootNames), sorted(expectedRoots))
            extracted = mcTree.extractSubTree(ancestor)
            self.assertEqual(NXNewick().writeString(extracted), expectedNewick)
Пример #31
0
    def updateTree(self, tree, seqMap = None, outgroups = None):
        """Store `tree` in the XML and rebuild the sequence attributes.

        tree      -- tree written as newick into the "species_tree" attribute.
        seqMap    -- optional mapping from node name to sequence; when
                     omitted, the current self.seqMap is used.
        outgroups -- optional non-empty collection passed to
                     self.setOutgroupEvents().

        The "sequences" attribute becomes the space-separated sequences of
        every leaf and every outgroup event node, in post-order, and
        self.seqMap is replaced by a map restricted to those nodes.
        """
        if seqMap is not None:
            self.seqMap = seqMap
        else:
            # seqMap is indexed below; fall back to the stored map so the
            # default argument no longer fails with a TypeError.
            seqMap = self.seqMap
        newMap = dict()
        treeString = NXNewick().writeString(tree)
        self.xmlRoot.attrib["species_tree"] = treeString
        if outgroups is not None and len(outgroups) > 0:
            self.setOutgroupEvents(outgroups)

        # Nothing in the loop changes the outgroup events, so fetch them once.
        outgroupEvents = self.getOutgroupEvents()
        sequences = ""
        for node in tree.postOrderTraversal():
            nodeName = tree.getName(node)
            if tree.isLeaf(node) or nodeName in outgroupEvents:
                if len(sequences) > 0:
                    sequences += " "
                sequences += seqMap[nodeName]
                newMap[nodeName] = seqMap[nodeName]
        self.xmlRoot.attrib["sequences"] = sequences
        self.seqMap = newMap
Пример #32
0
 def run(self):
     """Schedule one GetInsertedColumnBed child target per internal node of
     the HAL file's tree, then RunAncestorsMLParallel as the follow-on.

     The tree is parsed from the output of `halStats --tree`.
     """
     # Find all ancestral genomes using the tree.
     halTree = NXNewick().parseString(popenCatch("halStats --tree %s" % self.halFile))
     insertedColumnBeds = {} # genome => bed files of inserted columns
     for node in halTree.postOrderTraversal():
         # Only nodes with children are processed; leaves are skipped.
         if len(halTree.getChildren(node)) > 0:
             assert halTree.hasName(node)
             ancestor = halTree.getName(node)
             bedPath = getTempFile(rootDir=self.getGlobalTempDir())
             insertedColumnBeds[ancestor] = bedPath
             self.addChildTarget(GetInsertedColumnBed(self.halFile, ancestor, bedPath))
     followOn = RunAncestorsMLParallel(self.halFile, self.phyloPModel,
                                       insertedColumnBeds, self.jobsPerGenome,
                                       self.threshold)
     self.setFollowOnTarget(followOn)
Пример #33
0
    def testSubtrees(self):
        """Check the subtree root names of mcTree1, mcTree1a and mcTree2 and
        the newick string of one extracted subtree per tree.
        """
        expectedRoots = [
            ["Anc0", "Anc1", "Anc2", "Anc3", "Anc4", "Anc5", "Anc6", "Anc7"],
            ["Anc0", "Anc3", "Anc4", "Anc5", "Anc6"],
            ["Anc0", "Anc1", "Anc2", "Anc3", "Anc4"],
        ]
        expectedNewicks = [
            '(Anc7:0.025291,BABOON:0.044568)Anc3;',
            '((Anc3:0.11,Anc4:0.260342)Anc1:0.02326,(Anc5:0.087381,Anc6:0.104728)Anc2:0.04)Anc0;',
            '(monkey:100.8593,cat:47.14069)Anc5;',
        ]
        mcTrees = [self.mcTree1, self.mcTree1a, self.mcTree2]
        ancestors = ["Anc3", "Anc0", "Anc5"]

        # The four lists are parallel: one entry per tree under test.
        for mcTree, wantRoots, ancestor, wantNewick in zip(
                mcTrees, expectedRoots, ancestors, expectedNewicks):
            rootNames = mcTree.getSubtreeRootNames()
            self.assertEqual(sorted(rootNames), sorted(wantRoots))
            extracted = mcTree.extractSubTree(ancestor)
            self.assertEqual(NXNewick().writeString(extracted), wantNewick)
Пример #34
0
    def setTree(self, tree):
        """
        Load a new tree.

        The newick form of `tree` is stored in the "species_tree" attribute,
        and the <genome> children of xmlRoot are synchronised with the named
        nodes of the tree: elements whose name is not in the tree are removed
        and an element is added for every name not yet present.
        """
        # Write the new string to the XML
        self.xmlRoot.attrib["species_tree"] = NXNewick().writeString(tree)

        # Names of all named nodes in the new tree.
        treeNames = set()
        for nodeId in tree.postOrderTraversal():
            if tree.hasName(nodeId):
                treeNames.add(tree.getName(nodeId))

        # Snapshot the existing genome elements and their names before any
        # element is removed.
        existingElems = self.xmlRoot.findall('genome')
        xmlNames = set()
        for elem in existingElems:
            xmlNames.add(elem.attrib['name'])

        # Drop elements that no longer correspond to a node in the tree.
        for elem in existingElems:
            if elem.attrib['name'] in treeNames:
                continue
            self.xmlRoot.remove(elem)

        # Add elements for tree names the XML did not have.
        for treeName in treeNames:
            if treeName in xmlNames:
                continue
            added = ET.SubElement(self.xmlRoot, 'genome')
            added.attrib['name'] = treeName
Пример #35
0
def main():
    """Append every experiment of a cactus project to a HAL file.

    Reads the project named by the 'cactus_project' argument, optionally
    removes the existing HAL file (unless 'append' is set), then walks the
    project tree breadth-first (from the 'event' node when one is given) and
    runs `halAppendCactusSubtree` for each node that has an experiment.
    Prints each command and the overall timings.
    """
    args = initParser()
    myProj = MultiCactusProject()
    myProj.readXML(args['cactus_project'])

    # print(...) with one parenthesised argument behaves the same on
    # Python 2 and Python 3, unlike the print statement.
    if not args['append']:
        # Overwrite existing hal
        rmCommand = 'rm -f {0}'.format(args['HAL_file_path'])
        print(rmCommand)
        system(rmCommand)

    # some quick stats
    totalTime = time.time()
    totalAppendTime = 0

    # traverse tree to make sure we are going breadth-first
    tree = myProj.mcTree

    # find subtree if event specified
    event = args['event']
    rootNode = None
    if event is not None:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]

    # Options forwarded as "--<name> <value>" when set, in this order.
    valueOptions = ("cacheBytes", "cacheMDC", "cacheRDC", "cacheW0",
                    "chunk", "deflate")

    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName not in myProj.expMap:
            continue
        experimentFilePath = myProj.expMap[genomeName]
        experiment = ExperimentWrapper(
            ET.parse(experimentFilePath).getroot())

        outgroups = experiment.getOutgroupEvents()
        expTreeString = NXNewick().writeString(experiment.getTree())
        assert len(expTreeString) > 1
        assert experiment.getHALPath() is not None
        assert experiment.getHALFastaPath() is not None

        cmdline = "time halAppendCactusSubtree '{0}' '{1}' '{2}' '{3}'".format(
            experiment.getHALPath(), experiment.getHALFastaPath(),
            expTreeString, args['HAL_file_path'])

        if len(outgroups) > 0:
            cmdline += " --outgroups {0}".format(",".join(outgroups))
        for option in valueOptions:
            if args[option] is not None:
                cmdline += " --{0} {1}".format(option, args[option])
        if args["inMemory"] is True:
            cmdline += " --inMemory"

        print(cmdline)
        appendTime = time.time()
        system(cmdline)
        appendTime = time.time() - appendTime
        totalAppendTime += appendTime

    totalTime = time.time() - totalTime
    print("total time: {0:.2f}  total halAppendCactusSubtree time: {1:.2f}".format(
        totalTime, totalAppendTime))
from sonLib.bioio import fastaRead
from sonLib.nxnewick import NXNewick

def lengthWithoutGaps(seq):
    """Return the number of elements of seq that are not the gap symbol '-'."""
    return sum(1 for residue in seq if residue != '-')

if __name__ == '__main__':
    # Parse args
    if len(sys.argv) < 3:
        print __doc__
        sys.exit(1)

    newickPath = sys.argv[1]
    fastaPath = sys.argv[2]
    treeString = open(newickPath).read().split("\n")[0].strip()
    tree = NXNewick().parseString(treeString)
    
    sequences = {}
    for name, seq in fastaRead(open(fastaPath)):
        sequences[name] = seq
    
    # Print MAF, with sequence lines in post-order.
    print '##maf version=1 scoring=NA'
    print 'a tree="%s"' % (treeString)
    for nodeId in tree.postOrderTraversal():
        if not tree.isLeaf(nodeId):
            continue
        nodeName = tree.getName(nodeId)
        if nodeName not in sequences:
            raise RuntimeError("The tree has a node %s which was not found in the fasta file" % (nodeName))
        seq = sequences[nodeName]