def getCactusWorkflowExperimentForTest(sequences, newickTreeString, outputDir,
                                       configFile=None, constraints=None,
                                       progressive=False, reconstruct=True):
    """Create an ExperimentWrapper for tests, honouring the global database conf.

    The leaf genomes of ``newickTreeString`` (in post-order) are paired
    positionally with ``sequences``; each pair is printed and registered on
    the experiment as a sequence ID. The HAL and FASTA outputs are placed in
    ``outputDir`` as ``test.hal`` and ``test.fa``. The root genome is always
    set to "reference" and is marked as reconstructed when ``reconstruct``
    is true.
    """
    if _GLOBAL_DATABASE_CONF_STRING is None:
        databaseConf = None
    else:
        databaseConf = ET.fromstring(_GLOBAL_DATABASE_CONF_STRING)

    speciesTree = NXNewick().parseString(newickTreeString, addImpliedRoots=False)
    genomes = []
    for nodeId in speciesTree.postOrderTraversal():
        if speciesTree.isLeaf(nodeId):
            genomes.append(speciesTree.getName(nodeId))

    exp = ExperimentWrapper.createExperimentWrapper(
        newickTreeString,
        genomes,
        outputDir,
        databaseConf=databaseConf,
        configFile=configFile,
        halFile=os.path.join(outputDir, "test.hal"),
        fastaFile=os.path.join(outputDir, "test.fa"),
        constraints=constraints,
        progressive=progressive)

    # zip() stops at the shorter of the two inputs, exactly as before.
    for pair in zip(genomes, sequences):
        print(pair)
        exp.setSequenceID(pair[0], pair[1])

    exp.setRootGenome("reference")
    if reconstruct:
        exp.setRootReconstructed(True)
    return exp
def testSanity(self):
    """Parsing a fixture newick and writing it back must reproduce the input."""
    reader = NXNewick()
    # (newick string, extra MultiCactusTree keyword arguments)
    cases = (
        (self.tree1, {}),
        (self.tree2, {"subtreeSize": 3}),
    )
    for newick, treeKwargs in cases:
        parsed = reader.parseString(newick, addImpliedRoots=False)
        mcTree = MultiCactusTree(parsed, **treeKwargs)
        self.assertEqual(NXNewick().writeString(mcTree), newick)
def main():
    """Lift per-genome mutation annotations onto the reference for each target.

    Reads the species tree from the hal file (via ``halStats --tree``),
    validates that the reference and all targets are leaf genomes, creates
    the output directory if needed, and then, for every target, walks the
    path from the reference to the target: genomes before the MRCA are
    lifted with ``reversePolarity=True``, genomes after it are lifted
    normally. Results for a target go to ``<outputDir>/<target>.bed``.

    Raises:
        ValueError: if the reference genome or any target is not a leaf.
        OSError: if the output directory cannot be created.
    """
    parser = ArgumentParser(description=__doc__)
    parser.add_argument('hal', help='hal file')
    parser.add_argument('refGenome', help='reference genome')
    parser.add_argument('halTreeMutationsDir',
                        help='the directory output by halTreeMutations.py')
    parser.add_argument(
        '--targets',
        help='target genomes (comma-separated), default: all leaves')
    parser.add_argument('outputDir',
                        help='output directory for reference beds')
    opts = parser.parse_args()

    # Get the species tree from the hal file.
    newickTree = popenCatch('halStats --tree %s' % (opts.hal))
    tree = NXNewick().parseString(newickTree)

    # Set the target genomes to be all leaves (minus the reference) if not
    # otherwise directed.
    leafGenomes = [tree.getName(x) for x in tree.getLeaves()]
    if opts.refGenome not in leafGenomes:
        raise ValueError("Reference genome %s is not a leaf genome." %
                         opts.refGenome)

    if opts.targets is None:
        opts.targets = [x for x in leafGenomes if x != opts.refGenome]
    else:
        opts.targets = opts.targets.split(',')
        if not all(x in leafGenomes for x in opts.targets):
            raise ValueError("Some target genomes are not leaves.")

    try:
        os.makedirs(opts.outputDir)
    except OSError:
        # An already-existing directory is fine; anything else is a real
        # failure. Only OSError is caught so that KeyboardInterrupt and
        # SystemExit are never swallowed here.
        if not os.path.isdir(opts.outputDir):
            raise

    # The reference node does not depend on the target, so look it up once.
    refID = getTreeID(tree, opts.refGenome)
    for target in opts.targets:
        targetID = getTreeID(tree, target)
        mrca = getMRCA(tree, refID, targetID)
        mrcaName = tree.getName(mrca)
        pathToTarget = getPath(opts.hal, opts.refGenome, target)
        # Split the path at the MRCA: the run before it is the walk up, the
        # run after it is the walk down. The MRCA itself is dropped.
        pathUp, pathDown = [
            list(v)
            for k, v in groupby(pathToTarget, lambda x: x == mrcaName)
            if not k
        ]
        bedForTarget = os.path.join(opts.outputDir, target + '.bed')
        # First, walk up the tree to the MRCA.
        for curGenome in pathUp:
            liftMutations(opts.halTreeMutationsDir, opts.hal, curGenome,
                          opts.refGenome, bedForTarget, reversePolarity=True)
        # Next, walk down the tree to the target.
        for curGenome in pathDown:
            liftMutations(opts.halTreeMutationsDir, opts.hal, curGenome,
                          opts.refGenome, bedForTarget)
def __generateTrees(self):
    """Set up the two fixture newick strings and their MultiCactusTrees.

    Both trees get their unlabeled internal nodes named first, and only
    then are the subtree roots computed (same order as before).
    """
    self.tree1 = '((((HUMAN:0.006969,CHIMP:0.009727):0.025291,BABOON:0.044568):0.11,(MOUSE:0.072818,RAT:0.081244):0.260342):0.02326,((DOG:0.07,CAT:0.07):0.087381,(PIG:0.06,COW:0.06):0.104728):0.04);'
    self.tree2 = '((raccoon:19.19959,bear:6.80041):0.846,((sea_lion:11.997,seal:12.003):7.52973,((monkey:100.8593,cat:47.14069):20.59201,weasel:18.87953):2.0946):3.87382,dog:25.46154);'

    reader = NXNewick()
    self.mcTree1 = MultiCactusTree(
        reader.parseString(self.tree1, addImpliedRoots=False))
    self.mcTree2 = MultiCactusTree(
        reader.parseString(self.tree2, addImpliedRoots=False))

    fixtures = (self.mcTree1, self.mcTree2)
    for mcTree in fixtures:
        mcTree.nameUnlabeledInternalNodes()
    for mcTree in fixtures:
        mcTree.computeSubtreeRoots()
def setUp(self):
    """Build the tree and sequence fixtures used by the tests.

    Creates a temp directory holding a tiny dummy FASTA, turns every random
    tree with fewer than 50 nodes into a MultiCactusTree whose nodes are all
    named "Node<id>" and mapped to that FASTA, adds the Boreoeutherian and
    Eutherian backbone trees, and writes one all-'A' FASTA per Blanchette
    genome with the listed sequence length.
    """
    unittest.TestCase.setUp(self)
    self.trees = randomTreeSet()
    self.mcTrees = []
    self.tempDir = getTempDirectory(os.getcwd())
    self.tempFa = os.path.join(self.tempDir, "seq.fa")
    with open(self.tempFa, "w") as dummyFasta:
        dummyFasta.write(">temp\nNNNNNNNCNNNNAAAAAAAAAAAAAAANNNNNNN\n")

    self.dummySeqMaps = []
    for randomTree in self.trees:
        if not tree_is_small(randomTree):
            continue
        mcTree = MultiCactusTree(randomTree)
        seqMap = {}
        for nodeId in mcTree.breadthFirstTraversal():
            nodeName = "Node%s" % str(nodeId)
            mcTree.setName(nodeId, nodeName)
            seqMap[nodeName] = self.tempFa
        mcTree.computeSubtreeRoots()
        mcTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(mcTree)
        self.dummySeqMaps.append(seqMap)

    # Boreoeutherian tree
    borTree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
    self.borMcTree = MultiCactusTree(
        NXNewick().parseString(borTree, addImpliedRoots=False))
    self.borMcTree.computeSubtreeRoots()
    self.borMcTree.nameUnlabeledInternalNodes()
    self.mcTrees.append(self.borMcTree)

    # Eutherian backbone tree
    backbone = '(((((((((((Homo_sapiens:0.00655,Pan_troglodytes:0.00684):0.00422,Gorilla_gorilla_gorilla:0.008964):0.009693,Pongo_abelii:0.01894):0.015511,Macaca_mulatta:0.043601):0.08444,Aotus_nancymaae:0.08):0.08,Microcebus_murinus:0.10612):0.043494,Galeopterus_variegatus:0.134937):0.04,((((Jaculus_jaculus:0.1,(Microtus_ochrogaster:0.14,(Mus_musculus:0.084509,Rattus_norvegicus:0.091589):0.047773):0.06015):0.122992,(Heterocephalus_glaber:0.1,(Cavia_porcellus:0.065629,(Chinchilla_lanigera:0.06,Octodon_degus:0.1):0.06):0.05):0.06015):0.05,Marmota_marmota:0.1):0.05,Oryctolagus_cuniculus:0.21569):0.04):0.040593,(((Sus_scrofa:0.12,(Orcinus_orca:0.069688,(Bos_taurus:0.04,Capra_hircus:0.04):0.09):0.045488):0.02,((Equus_caballus:0.109397,(Felis_catus:0.098612,(Canis_lupus_familiaris:0.052458,Mustela_putorius_furo:0.08):0.02):0.049845):0.02,(Pteropus_alecto:0.1,Eptesicus_fuscus:0.08):0.033706):0.03):0.025,Erinaceus_europaeus:0.278178):0.021227):0.023664,(((Loxodonta_africana:0.022242,Procavia_capensis:0.145358):0.076687,Chrysochloris_asiatica:0.04):0.05,Dasypus_novemcinctus:0.169809):0.02)backbone_root:0.234728,(Monodelphis_domestica:0.125686,Sarcophilus_harrisii:0.12):0.2151);'
    self.backboneTree = MultiCactusTree(
        NXNewick().parseString(backbone, addImpliedRoots=False))
    self.backboneTree.computeSubtreeRoots()
    self.backboneTree.nameUnlabeledInternalNodes()
    self.mcTrees.append(self.backboneTree)

    # Sequence length of each genome in the Blanchette fixture.
    seqLens = {
        "HUMAN": 57553,
        "CHIMP": 57344,
        "BABOON": 58960,
        "MOUSE": 32750,
        "RAT": 38436,
        "DOG": 54187,
        "CAT": 50283,
        "PIG": 54843,
        "COW": 55508,
    }
    self.blanchetteSeqMap = {}
    for event, seqLen in seqLens.items():
        fastaPath = os.path.join(self.tempDir, event + ".fa")
        with open(fastaPath, "w") as fasta:
            # One header line followed by a single line of seqLen 'A's.
            fasta.write(">%s\n%s\n" % (event, 'A' * seqLen))
        self.blanchetteSeqMap[event] = fastaPath


def tree_is_small(tree):
    """Return True for trees below the 50-node cutoff used by setUp."""
    return tree.size() < 50
def __generateTrees(self):
    """Set up the fixture newicks and three MultiCactusTrees built from them.

    ``mcTree1`` uses the default subtree size, ``mcTree1a`` uses
    ``subtreeSize=4`` on the same newick, and ``mcTree2`` uses
    ``subtreeSize=3``. All trees are named before any subtree roots are
    computed (same order as before).
    """
    self.tree1 = "((((HUMAN:0.006969,CHIMP:0.009727):0.025291,BABOON:0.044568):0.11,(MOUSE:0.072818,RAT:0.081244):0.260342):0.02326,((DOG:0.07,CAT:0.07):0.087381,(PIG:0.06,COW:0.06):0.104728):0.04);"
    self.tree2 = "((raccoon:19.19959,bear:6.80041):0.846,((sea_lion:11.997,seal:12.003):7.52973,((monkey:100.8593,cat:47.14069):20.59201,weasel:18.87953):2.0946):3.87382,dog:25.46154);"

    reader = NXNewick()
    self.mcTree1 = MultiCactusTree(
        reader.parseString(self.tree1, addImpliedRoots=False))
    self.mcTree1a = MultiCactusTree(
        reader.parseString(self.tree1, addImpliedRoots=False),
        subtreeSize=4)
    self.mcTree2 = MultiCactusTree(
        reader.parseString(self.tree2, addImpliedRoots=False),
        subtreeSize=3)

    fixtures = (self.mcTree1, self.mcTree1a, self.mcTree2)
    for mcTree in fixtures:
        mcTree.nameUnlabeledInternalNodes()
    for mcTree in fixtures:
        mcTree.computeSubtreeRoots()
def testNewickIO(self):
    """Round-trip a set of newick strings through NXNewick.

    Each tree is parsed and written back twice: once with implied roots
    added (expected output comes from ``__cleanTree``) and once without
    (expected output is the input minus any trailing root branch length
    and all whitespace).
    """
    # Felsenstein's own examples
    # (http://evolution.genetics.washington.edu/phylip/newicktree.html)
    trees = [
        '((raccoon:19.19959,bear:6.80041):0.846,((sea_lion:11.997, seal:12.003):7.52973,((monkey:100.8593,cat:47.14069):20.59201, weasel:18.87953):2.0946):3.87382,dog:25.46154);',
        '(Bovine:0.69395,(Gibbon:0.36079,(Orang:0.33636,(Gorilla:0.17147,(Chimp:0.19268, Human:0.11927):0.08386):0.06124):0.15057):0.54939,Mouse:1.2146):0.1;',
        '(Bovine:0.69395,(Hylobates:0.36079,(Pongo:0.33636,(G._Gorilla:0.17147, (P._paniscus:0.19268,H._sapiens:0.11927):0.08386):0.06124):0.15057):0.54939, Rodent:1.2146);',
        'A;',
        '((A,B):0.0,(C,D));',
        '(Alpha,Beta,Gamma,Delta,,Epsilon,,,);',
    ]
    newickParser = NXNewick()

    # Parse newicks, adding implied roots
    for newick in trees:
        newickParser.parseString(newick, addImpliedRoots=True)
        expected = self.__cleanTree(newick)
        written = newickParser.writeString()
        logger.debug(" ***************** ")
        logger.debug(written)
        logger.debug(expected)
        assert written == expected

    # Parse newicks, not adding implied roots
    for newick in trees:
        newickParser.parseString(newick, addImpliedRoots=False)
        written = newickParser.writeString()
        # Drop a branch length on the root, then strip all whitespace.
        expected = re.sub(r'\s+', '', re.sub(r':[.0-9]+?;', ';', newick))
        logger.debug(" ***************** ")
        logger.debug(written)
        logger.debug(expected)
        assert written == expected
def cleanEventTree(experiment):
    """Normalise the event names of the experiment's species tree in place.

    Names all unlabeled internal nodes, replaces '.' with '_' in event
    names, checks that every named non-root node has a branch length, and
    renames duplicated event names by appending an increasing numeric
    suffix. The cleaned tree is written back to the ``species_tree``
    attribute of ``experiment.xmlRoot`` and ``experiment.seqMap`` is
    rebuilt. A warning is written to stderr for every rename.

    Raises:
        RuntimeError: if a named node with a parent has no branch length.
    """
    tree = MultiCactusTree(experiment.getTree())
    tree.nameUnlabeledInternalNodes()
    for node in tree.breadthFirstTraversal():
        if tree.hasName(node):
            name = tree.getName(node)
            if '.' in name:
                newName = name.replace('.', '_')
                sys.stderr.write('WARNING renaming event %s to %s\n' %(name, newName))
                tree.setName(node, newName)
                name = newName
            parent = tree.getParent(node)
            if parent is not None:
                weight = tree.getWeight(parent, node)
                if weight is None:
                    raise RuntimeError('Missing branch length in species_tree tree')
    # Repeat full pairwise scans until one completes without any rename;
    # a rename may itself introduce a new clash, hence the outer loop.
    redoPrefix = True
    newSuffix = 0
    while redoPrefix is True:
        redoPrefix = False
        for node1 in tree.breadthFirstTraversal():
            # name1 is read once per outer node and is not refreshed when
            # node2 is renamed inside the inner loop.
            name1 = tree.getName(node1)
            for node2 in tree.breadthFirstTraversal():
                name2 = tree.getName(node2)
                if node1 != node2 and name1 == name2:
                    # The suffix counter is shared across all names, so
                    # every generated name uses a fresh number.
                    newName = "%s%i" % (name2, newSuffix)
                    newSuffix += 1
                    tree.setName(node2, newName)
                    sys.stderr.write('WARNING renaming event %s to %s\n' % (
                        name2, newName))
                    redoPrefix = True

    experiment.xmlRoot.attrib["species_tree"] = NXNewick().writeString(tree)
    experiment.seqMap = experiment.buildSequenceMap()
def setUp(self):
    """Create an ExperimentWrapper fixture rooted at 'anc1'.

    The experiment gets the nine-leaf test tree, 'anc1' as a reconstructed
    root, six outgroup genomes, and a placeholder sequence ID for every
    leaf genome.
    """
    unittest.TestCase.setUp(self)
    newick = '((((HUMAN:0.006969,CHIMP:0.009727)anc2:0.025291,BABOON:0.044568)anc1:0.11,(MOUSE:0.072818,RAT:0.081244):0.260342):0.02326,((DOG:0.07,CAT:0.07):0.087381,(PIG:0.06,COW:0.06):0.104728):0.04);'
    self.tree = NXNewick().parseString(newick)
    self.xmlRoot = self.__makeXmlDummy()
    self.exp = ExperimentWrapper(self.xmlRoot)
    self.exp.setTree(self.tree)

    self.seqMap = {
        'HUMAN': 'human.txt',
        'CHIMP': 'chimp.txt',
        'BABOON': 'baboon.txt',
        'MOUSE': 'mouse.txt',
        'RAT': 'rat.txt',
        'DOG': 'dog.txt',
        'CAT': 'cat.txt',
        'PIG': 'pig.txt',
        'COW': 'cow.txt',
    }

    experiment = self.exp
    experiment.setRootGenome('anc1')
    experiment.setRootReconstructed(True)
    experiment.setOutgroupGenomes(
        ['MOUSE', 'RAT', 'DOG', 'CAT', 'PIG', 'COW'])
    # The file names below are not real sequence IDs, but they are good
    # enough for what these tests exercise.
    for genome in self.seqMap:
        experiment.setSequenceID(genome, self.seqMap[genome])
def readXML(self, path):
    """Load the project description stored in the XML file at ``path``.

    Populates ``mcTree`` from the <tree> element, ``expMap`` (and
    ``expIDMap`` where an ``experiment_id`` is present) from the <cactus>
    elements, and the sequence maps from the root attributes. The
    input/output sequence ID maps and ``configID`` are only set when the
    corresponding attributes exist in the file.
    """
    xmlRoot = ET.parse(path).getroot()
    rootAttrs = xmlRoot.attrib

    newick = xmlRoot.find("tree").text
    self.mcTree = MultiCactusTree(
        NXNewick().parseString(newick, addImpliedRoots=False))

    self.expMap = {}
    self.expIDMap = {}
    for cactusElem in xmlRoot.findall("cactus"):
        expName = cactusElem.attrib["name"]
        self.expMap[expName] = cactusElem.attrib["experiment_path"]
        if "experiment_id" in cactusElem.attrib:
            self.expIDMap[expName] = cactusElem.attrib["experiment_id"]

    def pairUp(namesKey, valuesKey):
        # Both attributes hold whitespace-separated lists in matching order.
        return dict(zip(rootAttrs[namesKey].split(),
                        rootAttrs[valuesKey].split()))

    self.inputSequenceMap = pairUp("inputSequenceNames", "inputSequences")
    if "inputSequenceIDs" in rootAttrs:
        self.inputSequenceIDMap = pairUp("inputSequenceIDNames",
                                         "inputSequenceIDs")
    if "outputSequenceIDs" in rootAttrs:
        self.outputSequenceIDMap = pairUp("outputSequenceNames",
                                          "outputSequenceIDs")

    logger.info("xmlRoot = %s" % ET.tostring(xmlRoot))
    if "configID" in rootAttrs:
        self.configID = rootAttrs["configID"]

    self.mcTree.assignSubtreeRootNames(self.expMap)
def progressiveFunction(self, experimentFile, toilDir, batchSystem, buildAvgs,
                        buildHal, buildFasta, toilStats, subtreeRoot=None,
                        logLevel=None):
    """Run the progressive aligner on the experiment in ``experimentFile``.

    The experiment is converted to a temporary seqFile (newick tree on the
    first line, then one "<genome> <sequenceID>" line per genome that has a
    sequence) which is handed to ``runCactusProgressive`` together with the
    experiment's config path. ``buildHal``, ``buildFasta`` and
    ``subtreeRoot`` are accepted but not used here.
    """
    experiment = ExperimentWrapper(ET.parse(experimentFile).getroot())

    seqFile = getTempFile()
    with open(seqFile, 'w') as out:
        out.write('%s\n' % NXNewick().writeString(experiment.getTree()))
        for genome in experiment.getGenomesWithSequence():
            out.write('%s %s\n' % (genome, experiment.getSequenceID(genome)))

    runCactusProgressive(seqFile,
                         experiment.getConfigPath(),
                         toilDir,
                         batchSystem=batchSystem,
                         buildAvgs=buildAvgs,
                         toilStats=toilStats,
                         logLevel=logLevel)
def writeXML(self, path):
    """Serialise the project to a pretty-printed XML file at ``path``.

    The document has a <multi_cactus> root holding the newick tree in a
    <tree> element and one <cactus> element per entry of ``expMap`` (with
    an ``experiment_id`` attribute when ``expIDMap`` is non-empty). Input
    sequences, and the optional sequence ID maps and config ID, are stored
    as space-separated attributes on the root.

    The XML is fully built before the file is opened, so a failure while
    serialising does not truncate an existing file, and the file handle is
    always closed.
    """
    xmlRoot = ET.Element("multi_cactus")
    treeElem = ET.Element("tree")
    treeElem.text = NXNewick().writeString(self.mcTree)
    xmlRoot.append(treeElem)
    for name, expPath in self.expMap.items():
        cactusPathElem = ET.Element("cactus")
        cactusPathElem.attrib["name"] = name
        cactusPathElem.attrib["experiment_path"] = expPath
        if self.expIDMap:
            cactusPathElem.attrib["experiment_id"] = self.expIDMap[name]
        xmlRoot.append(cactusPathElem)
    # We keep track of all the input sequences at the top level
    xmlRoot.attrib["inputSequences"] = " ".join(
        self.inputSequenceMap.values())
    xmlRoot.attrib["inputSequenceNames"] = " ".join(
        self.inputSequenceMap.keys())
    if self.inputSequenceIDMap:
        xmlRoot.attrib["inputSequenceIDs"] = " ".join(
            self.inputSequenceIDMap.values())
        xmlRoot.attrib["inputSequenceIDNames"] = " ".join(
            self.inputSequenceIDMap.keys())
    if self.outputSequenceIDMap:
        xmlRoot.attrib["outputSequenceIDs"] = " ".join(
            self.outputSequenceIDMap.values())
        xmlRoot.attrib["outputSequenceNames"] = " ".join(
            self.outputSequenceIDMap.keys())
    if self.configID:
        xmlRoot.attrib["configID"] = self.configID

    # Serialise first, then open: nothing is written unless we have a
    # complete document in hand.
    xmlString = minidom.parseString(ET.tostring(xmlRoot)).toprettyxml()
    with open(path, "w") as xmlFile:
        xmlFile.write(xmlString)
def createFileStructure(mcProj, expTemplate, configTemplate, options):
    """Write the project XML and one experiment XML per subproblem.

    Creates ``options.path`` if needed, writes
    ``<options.name>_project.xml`` there, and then, for every entry of
    ``mcProj.expMap``, builds an experiment over the spanning tree of the
    node, its children and its outgroups, creates the node's directory and
    writes the experiment to the mapped path. The root is marked as
    reconstructed unless the template already has a sequence ID for it.
    """
    if not os.path.exists(options.path):
        os.makedirs(options.path)
    projectXml = os.path.join(options.path, "%s_project.xml" % options.name)
    mcProj.writeXML(projectXml)

    for name, expPath in list(mcProj.expMap.items()):
        expDir = os.path.join(options.path, name)
        children = mcProj.entireTree.getChildNames(name)

        # Get outgroups
        useOutgroups = (configTemplate.getOutgroupStrategy() != 'none'
                        and name in mcProj.outgroup.ogMap)
        if useOutgroups:
            # Outgroup name is the first element of the ogMap tuples
            outgroups = [entry[0] for entry in mcProj.outgroup.ogMap[name]]
        else:
            outgroups = []

        subtree = mcProj.entireTree.extractSpanningTree(
            children + [name] + outgroups)
        exp = ExperimentWrapper.createExperimentWrapper(
            NXNewick().writeString(subtree),
            children + [name] + outgroups,
            databaseConf=expTemplate.confElem)
        exp.setRootGenome(name)
        exp.setOutgroupGenomes(outgroups)

        if not os.path.exists(expDir):
            os.makedirs(expDir)
        # The wrapper is constructed as in the original even though the
        # result is not used afterwards.
        config = ConfigWrapper(copy.deepcopy(configTemplate.xmlRoot))

        if expTemplate.getSequenceID(name):
            # A sequence already exists for this node: nothing to rebuild.
            exp.setRootReconstructed(False)
            exp.setSequenceID(name, expTemplate.getSequenceID(name))
        else:
            exp.setRootReconstructed(True)
        exp.writeXML(expPath)
def progressiveWithSubtreeRootFunction(self, experimentFile, toilDir,
                                       batchSystem, buildAvgs, buildReference,
                                       buildHal, buildFasta, toilStats):
    """Run the progressive test on a randomly chosen subtree.

    Picks a random named internal node of the experiment's species tree as
    the subtree root (the whole tree counts as a valid subtree) and
    forwards everything to ``progressiveFunction``. This indirection keeps
    runWorkflow_multipleExamples general, since specifying a subtree root
    makes no sense for runCactusWorkflow.
    """
    experiment = ExperimentWrapper(ET.parse(experimentFile).getroot())
    tree = experiment.getTree()

    # Valid subtree roots are the named internal nodes.
    candidates = [
        tree.getName(node)
        for node in tree.postOrderTraversal()
        if tree.hasName(node) and not tree.isLeaf(node)
    ]
    subtreeRoot = random.choice(candidates)
    logger.info("Chose subtree root %s to test from species tree "
                "%s" % (subtreeRoot, NXNewick().writeString(tree)))

    self.progressiveFunction(experimentFile, toilDir, batchSystem, buildAvgs,
                             buildReference, buildHal, buildFasta, toilStats,
                             subtreeRoot)
def testExtractSpanningTree(self):
    """Check extractSpanningTree output and that the source tree is untouched."""
    writer = NXNewick()
    originalNewick = writer.writeString(self.mcTree1)

    # (leaf set, expected spanning tree). The first case is three closely
    # related leaves; the second spans distant leaves, where as many
    # intermediate ancestors as possible should be collapsed away.
    cases = (
        (["HUMAN", "CHIMP", "BABOON"],
         "((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc3;"),
        (["HUMAN", "CHIMP", "CAT"],
         "((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.158551,CAT:0.197381)Anc0;"),
    )
    for leaves, expectedNewick in cases:
        spanningTree = self.mcTree1.extractSpanningTree(leaves)
        # The extraction must not modify the tree it was taken from.
        self.assertEqual(NXNewick().writeString(self.mcTree1), originalNewick)
        self.assertEqual(NXNewick().writeString(spanningTree), expectedNewick)
def testSetTree(self):
    """After setTree, only genomes still in the tree keep their sequence."""
    # A modified version of the fixture tree, with fewer genomes and one
    # new genome (ARMADILLO) that has no sequence assigned.
    smallerNewick = '((HUMAN:0.006969,CHIMP:0.009727):0.025291,BABOON:0.044568,ARMADILLO:1.0);'
    self.exp.setTree(NXNewick().parseString(smallerNewick))

    withSequence = set(self.exp.getGenomesWithSequence())
    self.assertEqual(withSequence, {'HUMAN', 'CHIMP', 'BABOON'})
def testAddSelf(self):
    """addSelfEdges must insert a '<name>_self' node above every non-root node."""
    expected = '((((((((HUMAN:0.006969)HUMAN_self:0.006969,(CHIMP:0.009727)CHIMP_self:0.009727)Anc7:0.025291)Anc7_self:0.025291,(BABOON:0.044568)BABOON_self:0.044568)Anc3:0.11)Anc3_self:0.11,(((MOUSE:0.072818)MOUSE_self:0.072818,(RAT:0.081244)RAT_self:0.081244)Anc4:0.260342)Anc4_self:0.260342)Anc1:0.02326)Anc1_self:0.02326,(((((DOG:0.07)DOG_self:0.07,(CAT:0.07)CAT_self:0.07)Anc5:0.087381)Anc5_self:0.087381,(((PIG:0.06)PIG_self:0.06,(COW:0.06)COW_self:0.06)Anc6:0.104728)Anc6_self:0.104728)Anc2:0.04)Anc2_self:0.04)Anc0;'

    # Work on a copy so the shared fixture is left alone.
    mcTree = MultiCactusTree(self.mcTree1)
    mcTree.nameUnlabeledInternalNodes()
    mcTree.computeSubtreeRoots()
    mcTree.addSelfEdges()

    self.assertEqual(NXNewick().writeString(mcTree), expected)
def exportHal(job, project, event=None, cacheBytes=None, cacheMDC=None,
              cacheRDC=None, cacheW0=None, chunk=None, deflate=None,
              inMemory=True, checkpointInfo=None):
    """Assemble the per-experiment HAL files of ``project`` into one HAL file.

    Walks the project tree breadth-first (from ``event`` if given, which
    must be a non-leaf node of the tree) and appends every experiment's
    subtree with ``halAppendCactusSubtree``. Afterwards the cactus commit
    and the base64-encoded config are stored as HAL metadata. When
    ``checkpointInfo`` is set, the file is also written to S3 using
    ``checkpointInfo[1]`` as destination and ``checkpointInfo[0]`` as
    region.

    Returns the global file-store ID of the resulting HAL file.
    """
    HALPath = "tmp_alignment.hal"

    # traverse tree to make sure we are going breadth-first
    tree = project.mcTree

    # find subtree if event specified
    if event is None:
        rootNode = None
    else:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]

    # Optional value-carrying flags, in the order they are appended.
    valueFlags = (
        ("--cacheBytes", cacheBytes),
        ("--cacheMDC", cacheMDC),
        ("--cacheRDC", cacheRDC),
        ("--cacheW0", cacheW0),
        ("--chunk", chunk),
        ("--deflate", deflate),
    )

    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName not in project.expMap:
            continue

        experimentFilePath = job.fileStore.readGlobalFile(
            project.expIDMap[genomeName])
        experiment = ExperimentWrapper(ET.parse(experimentFilePath).getroot())

        outgroups = experiment.getOutgroupGenomes()
        experiment.setConfigPath(
            job.fileStore.readGlobalFile(experiment.getConfigID()))
        expTreeString = NXNewick().writeString(
            experiment.getTree(onlyThisSubtree=True))
        assert len(expTreeString) > 1
        assert experiment.getHalID() is not None
        assert experiment.getHalFastaID() is not None
        subHALPath = job.fileStore.readGlobalFile(experiment.getHalID())
        halFastaPath = job.fileStore.readGlobalFile(experiment.getHalFastaID())

        args = [os.path.basename(subHALPath),
                os.path.basename(halFastaPath),
                expTreeString,
                os.path.basename(HALPath)]
        if len(outgroups) > 0:
            args += ["--outgroups", ",".join(outgroups)]
        for flag, value in valueFlags:
            if value is not None:
                args += [flag, value]
        if inMemory is True:
            args += ["--inMemory"]

        cactus_call(parameters=["halAppendCactusSubtree"] + args)

    cactus_call(parameters=["halSetMetadata", HALPath, "CACTUS_COMMIT",
                            cactus_commit])
    with job.fileStore.readGlobalFileStream(project.configID) as configFile:
        encodedConfig = b64encode(configFile.read()).decode()
        cactus_call(parameters=["halSetMetadata", HALPath, "CACTUS_CONFIG",
                                encodedConfig])

    if checkpointInfo:
        write_s3(HALPath, checkpointInfo[1], region=checkpointInfo[0])

    return job.fileStore.writeGlobalFile(HALPath)
def testSequenceMap(self):
    """The experiment must round-trip its tree and map each event to its file."""
    experiment = ExperimentWrapper(
        self.__makeXmlDummy(self.tree, self.sequences))
    assert NXNewick().writeString(experiment.getTree()) == self.tree

    seqMap = experiment.buildSequenceMap()
    for seqPath in self.sequences.split():
        # The expected event name is the upper-cased path minus extension.
        eventName = os.path.splitext(seqPath)[0].upper()
        assert seqMap[eventName] == seqPath
def testAddOutgroup(self):
    """addOutgroup on a full tree and on a single-leaf tree."""
    # Case 1: the outgroup becomes a new child of the existing root.
    expectedFull = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc3:0.11,(MOUSE:0.072818,RAT:0.081244)Anc4:0.260342)Anc1:0.02326,((DOG:0.07,CAT:0.07)Anc5:0.087381,(PIG:0.06,COW:0.06)Anc6:0.104728)Anc2:0.04,outgroup:1.7)Anc0;'
    fullTree = MultiCactusTree(self.mcTree1)
    fullTree.nameUnlabeledInternalNodes()
    fullTree.computeSubtreeRoots()
    fullTree.addOutgroup("outgroup", 1.7)
    self.assertEqual(NXNewick().writeString(fullTree), expectedFull)

    # Case 2: a lone leaf; the expected output splits the 2.2 distance
    # evenly between the leaf and the outgroup.
    expectedLeaf = "(A:1.1,outgroup:1.1);"
    parsedLeaf = NXNewick().parseString("A;", addImpliedRoots = False)
    leafTree = MultiCactusTree(parsedLeaf)
    leafTree.nameUnlabeledInternalNodes()
    leafTree.computeSubtreeRoots()
    leafTree.addOutgroup("outgroup", 2.2)
    self.assertEqual(NXNewick().writeString(leafTree), expectedLeaf)
def testAddOutgroup(self):
    """Verify addOutgroup for the nine-leaf fixture and for a one-leaf tree."""

    def withOutgroup(mcTree, distance):
        # Prepare the tree the same way for both cases, then serialise it.
        mcTree.nameUnlabeledInternalNodes()
        mcTree.computeSubtreeRoots()
        mcTree.addOutgroup("outgroup", distance)
        return NXNewick().writeString(mcTree)

    trueOg = "((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc3:0.11,(MOUSE:0.072818,RAT:0.081244)Anc4:0.260342)Anc1:0.02326,((DOG:0.07,CAT:0.07)Anc5:0.087381,(PIG:0.06,COW:0.06)Anc6:0.104728)Anc2:0.04,outgroup:1.7)Anc0;"
    self.assertEqual(withOutgroup(MultiCactusTree(self.mcTree1), 1.7), trueOg)

    # For a single leaf the expected result places both the leaf and the
    # outgroup at half of the requested distance.
    trueLeafOg = "(A:1.1,outgroup:1.1);"
    reader = NXNewick()
    leafTree = MultiCactusTree(reader.parseString("A;", addImpliedRoots=False))
    self.assertEqual(withOutgroup(leafTree, 2.2), trueLeafOg)
def parseFile(self, path):
    """Parse a seqFile: an optional newick tree followed by name/path lines.

    Blank lines and lines starting with '#' are ignored. The first line
    that is a single token or starts with '(' is parsed as the newick tree.
    Lines whose first token is exactly '*' are skipped with a message on
    stderr. Any other line with at least two tokens is read as
    "<name> <path...>"; a leading '*' on the name marks the genome as an
    outgroup and is stripped. If no tree was found, ``starTree()`` is used
    to make one; ``cleanTree()`` and ``validate()`` are then called.

    Raises:
        RuntimeError: if the file does not exist, the tree line does not
            end with ");", the tree cannot be parsed, or a name is
            duplicated.
    """
    if not os.path.isfile(path):
        raise RuntimeError("File not found: %s" % path)
    self.tree = None
    self.pathMap = dict()
    self.outgroups = []
    # The context manager guarantees the handle is closed even when one of
    # the RuntimeErrors below is raised mid-file.
    with open(path, "r") as seqFile:
        for l in seqFile:
            line = l.strip()
            if not line or line[0] == "#":
                continue
            tokens = line.split()
            if self.tree is None and (len(tokens) == 1 or line[0] == '('):
                newickParser = NXNewick()
                if not line.endswith(");"):
                    raise RuntimeError("The newick tree %s may not "
                                       "have a branch length after "
                                       "the root node." % line)
                try:
                    self.tree = newickParser.parseString(line)
                except Exception:
                    # Report any parser failure uniformly, but do not trap
                    # KeyboardInterrupt / SystemExit.
                    raise RuntimeError("Failed to parse newick tree: %s" %
                                       line)
            elif len(tokens) > 0 and tokens[0] == '*':
                sys.stderr.write("Skipping line %s\n" % l)
            elif line[0] != '(' and len(tokens) >= 2:
                name = tokens[0]
                if name[0] == '*':
                    name = name[1:]
                    self.outgroups.append(name)
                # str.join replaces string.join (removed in Python 3);
                # a single space was string.join's default separator.
                seqPath = " ".join(tokens[1:])
                if name in self.pathMap:
                    raise RuntimeError("Duplicate name found: %s" % name)
                self.pathMap[name] = seqPath
            elif len(tokens) > 0:
                sys.stderr.write("Skipping line %s\n" % l)

    if self.tree is None:
        self.starTree()
    self.cleanTree()
    self.validate()
def __str__(self):
    """Render as seqFile text: the newick tree, then "<name>\\t<path>" lines.

    Names listed in ``self.outgroups`` are prefixed with '*'.
    """
    outgroupNames = set(self.outgroups)
    pieces = [NXNewick().writeString(self.tree), '\n']
    for name, path in self.pathMap.items():
        marker = '*' if name in outgroupNames else ''
        pieces.append(marker + '{}\t{}\n'.format(name, path))
    return ''.join(pieces)
def getTree(self, onlyThisSubtree=False):
    """Return the tree parsed from the ``species_tree`` XML attribute.

    With ``onlyThisSubtree`` set, the result is instead the subtree
    extracted at the root genome, i.e. the reference node and its children
    rather than the species tree that also includes the outgroups.
    """
    speciesTree = NXNewick().parseString(
        self.xmlRoot.attrib["species_tree"], addImpliedRoots=False)
    if not onlyThisSubtree:
        return speciesTree

    mcTree = MultiCactusTree(speciesTree)
    mcTree.nameUnlabeledInternalNodes()
    mcTree.computeSubtreeRoots()
    return mcTree.extractSubTree(self.getRootGenome())
def toXMLElement(self):
    """Build the <cactus_workflow_experiment> element for this seqFile.

    The ``sequences`` attribute lists the paths of the leaf genomes in
    post-order, each followed by a single space (so the value ends with a
    trailing space). ``species_tree`` holds the newick tree and ``config``
    is always "defaultProgressive".
    """
    assert self.tree is not None
    elem = ET.Element("cactus_workflow_experiment")

    leafPaths = [
        self.pathMap[self.tree.getName(node)]
        for node in self.tree.postOrderTraversal()
        if self.tree.isLeaf(node)
    ]
    elem.attrib["sequences"] = "".join(path + " " for path in leafPaths)
    elem.attrib["species_tree"] = NXNewick().writeString(self.tree)
    elem.attrib["config"] = "defaultProgressive"
    return elem
def toXMLElement(self):
    """Build the <cactus_workflow_experiment> element for this seqFile.

    Every tree node (visited in post-order) whose name has an entry in
    ``pathMap`` gets a <genome> child carrying ``name`` and ``sequence``
    attributes. ``species_tree`` holds the newick tree and ``config`` is
    always "defaultProgressive".
    """
    assert self.tree is not None
    root = ET.Element("cactus_workflow_experiment")

    for nodeId in self.tree.postOrderTraversal():
        genome = self.tree.getName(nodeId)
        if genome not in self.pathMap:
            continue
        genomeElem = ET.SubElement(root, "genome")
        genomeElem.attrib['name'] = genome
        genomeElem.attrib['sequence'] = self.pathMap[genome]

    root.attrib["species_tree"] = NXNewick().writeString(self.tree)
    root.attrib["config"] = "defaultProgressive"
    return root
def testSanity(self):
    """Parsing a fixture newick and writing it back must reproduce the input."""
    reader = NXNewick()
    for newick in (self.tree1, self.tree2):
        mcTree = MultiCactusTree(
            reader.parseString(newick, addImpliedRoots = False))
        self.assertEqual(NXNewick().writeString(mcTree), newick)
def testSubtrees(self):
    """Check subtree root names and one extracted subtree per fixture tree."""
    # (tree, expected subtree roots, node to extract, expected newick)
    cases = [
        (self.mcTree1,
         ["Anc0", "Anc1", "Anc2", "Anc3", "Anc4", "Anc5", "Anc6", "Anc7"],
         "Anc3",
         '(Anc7:0.025291,BABOON:0.044568)Anc3;'),
        (self.mcTree2,
         ["Anc0", "Anc1", "Anc2", "Anc3", "Anc4", "Anc5"],
         "Anc5",
         '(monkey:100.8593,cat:47.14069)Anc5;'),
    ]
    for mcTree, expectedRoots, ancestor, expectedNewick in cases:
        actualRoots = mcTree.getSubtreeRootNames()
        self.assertEqual(sorted(actualRoots), sorted(expectedRoots))

        extracted = mcTree.extractSubTree(ancestor)
        self.assertEqual(NXNewick().writeString(extracted), expectedNewick)
def updateTree(self, tree, seqMap = None, outgroups = None):
    """Replace the species tree and rebuild the sequence attributes.

    Writes ``tree`` to the ``species_tree`` XML attribute, optionally
    updates the outgroup events, and then rebuilds the ``sequences``
    attribute and ``self.seqMap`` so they cover exactly the leaves and the
    outgroup events of the new tree (in post-order).

    Args:
        tree: the new species tree.
        seqMap: mapping from event name to sequence path. When omitted,
            the existing ``self.seqMap`` is used as the source (previously
            omitting it crashed with a TypeError on the first lookup).
        outgroups: if given and non-empty, passed to ``setOutgroupEvents``.

    Raises:
        KeyError: if a leaf or outgroup event has no entry in the map.
    """
    if seqMap is not None:
        self.seqMap = seqMap
    else:
        # Fall back to the current map so the default argument is usable.
        seqMap = self.seqMap
    newMap = dict()
    treeString = NXNewick().writeString(tree)
    self.xmlRoot.attrib["species_tree"] = treeString
    if outgroups is not None and len(outgroups) > 0:
        self.setOutgroupEvents(outgroups)
    paths = []
    for node in tree.postOrderTraversal():
        if tree.isLeaf(node) or tree.getName(node) in self.getOutgroupEvents():
            nodeName = tree.getName(node)
            paths.append(seqMap[nodeName])
            newMap[nodeName] = seqMap[nodeName]
    self.xmlRoot.attrib["sequences"] = " ".join(paths)
    self.seqMap = newMap
def run(self):
    """Schedule one inserted-column job per ancestral genome, then the ML step.

    The tree is read from the hal file with ``halStats --tree``. Every
    non-leaf node gets a temp bed file and a ``GetInsertedColumnBed`` child
    target; ``RunAncestorsMLParallel`` is set as the follow-on and receives
    the genome -> bed file mapping.
    """
    # Find all ancestral genomes using the tree.
    tree = NXNewick().parseString(
        popenCatch("halStats --tree %s" % self.halFile))

    bedFiles = {}  # genome => bed files of inserted columns
    for nodeId in tree.postOrderTraversal():
        isLeaf = len(tree.getChildren(nodeId)) == 0
        if not isLeaf:
            assert tree.hasName(nodeId)
            genome = tree.getName(nodeId)
            bedFiles[genome] = getTempFile(rootDir=self.getGlobalTempDir())
            self.addChildTarget(
                GetInsertedColumnBed(self.halFile, genome, bedFiles[genome]))

    self.setFollowOnTarget(
        RunAncestorsMLParallel(self.halFile, self.phyloPModel, bedFiles,
                               self.jobsPerGenome, self.threshold))
def testSubtrees(self):
    """Check subtree root names and one extracted subtree per fixture tree."""
    # (tree, expected subtree roots, node to extract, expected newick)
    cases = (
        (self.mcTree1,
         ["Anc0", "Anc1", "Anc2", "Anc3", "Anc4", "Anc5", "Anc6", "Anc7"],
         "Anc3",
         '(Anc7:0.025291,BABOON:0.044568)Anc3;'),
        (self.mcTree1a,
         ["Anc0", "Anc3", "Anc4", "Anc5", "Anc6"],
         "Anc0",
         '((Anc3:0.11,Anc4:0.260342)Anc1:0.02326,(Anc5:0.087381,Anc6:0.104728)Anc2:0.04)Anc0;'),
        (self.mcTree2,
         ["Anc0", "Anc1", "Anc2", "Anc3", "Anc4"],
         "Anc5",
         '(monkey:100.8593,cat:47.14069)Anc5;'),
    )
    for mcTree, expectedRoots, ancestor, expectedNewick in cases:
        self.assertEqual(sorted(mcTree.getSubtreeRootNames()),
                         sorted(expectedRoots))
        extracted = NXNewick().writeString(mcTree.extractSubTree(ancestor))
        self.assertEqual(extracted, expectedNewick)
def setTree(self, tree):
    """Load a new tree and bring the <genome> elements in line with it.

    The newick form of ``tree`` is stored in the ``species_tree``
    attribute. <genome> elements whose name is no longer in the tree are
    removed, and a bare <genome name=...> element is added for every named
    tree node that has none yet.
    """
    self.xmlRoot.attrib["species_tree"] = NXNewick().writeString(tree)

    treeGenomes = set()
    for nodeId in tree.postOrderTraversal():
        if tree.hasName(nodeId):
            treeGenomes.add(tree.getName(nodeId))

    # Snapshot the existing elements (and their names) before editing.
    existingElems = self.xmlRoot.findall('genome')
    existingNames = set(elem.attrib['name'] for elem in existingElems)

    for elem in existingElems:
        if elem.attrib['name'] not in treeGenomes:
            self.xmlRoot.remove(elem)

    for genome in treeGenomes:
        if genome in existingNames:
            continue
        added = ET.SubElement(self.xmlRoot, 'genome')
        added.attrib['name'] = genome
def main():
    """Append every experiment of a cactus project into a single HAL file.

    Unless ``append`` is set, an existing HAL file is removed first. The
    project tree is then walked breadth-first (from ``event`` if given,
    which must be a non-leaf node of the tree) and, for each node with an
    experiment, a ``halAppendCactusSubtree`` command line is built, printed
    and run. Total and per-append wall-clock times are printed at the end.
    """
    args = initParser()
    myProj = MultiCactusProject()
    myProj.readXML(args['cactus_project'])

    halPath = args['HAL_file_path']
    if not args['append']:
        # Overwrite existing hal
        removeCmd = 'rm -f {0}'.format(halPath)
        print(removeCmd)
        system(removeCmd)

    # some quick stats
    startTime = time.time()
    totalAppendTime = 0

    # traverse tree to make sure we are going breadth-first
    tree = myProj.mcTree

    # find subtree if event specified
    event = args['event']
    if event is None:
        rootNode = None
    else:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]

    # Options that are forwarded as "--<name> <value>" when set.
    valueOptions = ("cacheBytes", "cacheMDC", "cacheRDC", "cacheW0",
                    "chunk", "deflate")

    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName not in myProj.expMap:
            continue

        experiment = ExperimentWrapper(
            ET.parse(myProj.expMap[genomeName]).getroot())
        outgroups = experiment.getOutgroupEvents()
        expTreeString = NXNewick().writeString(experiment.getTree())
        assert len(expTreeString) > 1
        assert experiment.getHALPath() is not None
        assert experiment.getHALFastaPath() is not None

        cmdline = "time halAppendCactusSubtree '{0}' '{1}' '{2}' '{3}'".format(
            experiment.getHALPath(), experiment.getHALFastaPath(),
            expTreeString, halPath)
        if len(outgroups) > 0:
            cmdline += " --outgroups {0}".format(",".join(outgroups))
        for option in valueOptions:
            if args[option] is not None:
                cmdline += " --{0} {1}".format(option, args[option])
        if args["inMemory"] is True:
            cmdline += " --inMemory"

        print(cmdline)
        appendStart = time.time()
        system(cmdline)
        totalAppendTime += time.time() - appendStart

    totalTime = time.time() - startTime
    print("total time: {0:.2f} total halAppendCactusSubtree time: {1:.2f}".format(
        totalTime, totalAppendTime))
from sonLib.bioio import fastaRead from sonLib.nxnewick import NXNewick def lengthWithoutGaps(seq): return len([i for i in seq if i != '-']) if __name__ == '__main__': # Parse args if len(sys.argv) < 3: print __doc__ sys.exit(1) newickPath = sys.argv[1] fastaPath = sys.argv[2] treeString = open(newickPath).read().split("\n")[0].strip() tree = NXNewick().parseString(treeString) sequences = {} for name, seq in fastaRead(open(fastaPath)): sequences[name] = seq # Print MAF, with sequence lines in post-order. print '##maf version=1 scoring=NA' print 'a tree="%s"' % (treeString) for nodeId in tree.postOrderTraversal(): if not tree.isLeaf(nodeId): continue nodeName = tree.getName(nodeId) if nodeName not in sequences: raise RuntimeError("The tree has a node %s which was not found in the fasta file" % (nodeName)) seq = sequences[nodeName]