def testCandidates(self):
    """Check greedy outgroup selection when restricted to a candidate set.

    The same named tree is run through ``GreedyOutgroup.greedy`` twice with
    the candidates HUMAN, CHIMP and RAT: once with ``candidateChildFrac=0.5``
    and once with ``candidateChildFrac=1.0``.  After each run the first
    outgroup recorded for every ancestor is compared with the expected
    name(s), and ``Anc4`` is expected to have no entry at all.
    """
    tree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
    mcTree = MultiCactusTree(NXNewick().parseString(tree, addImpliedRoots=False))
    mcTree.computeSubtreeRoots()

    # Run 1: candidateChildFrac=0.5.
    og = GreedyOutgroup()
    og.importTree(mcTree)
    candidates = {'HUMAN', 'CHIMP', 'RAT'}
    og.greedy(candidateSet=candidates, candidateChildFrac=0.5)
    assert og.ogMap['Anc1'][0][0] == 'Anc4'
    assert og.ogMap['Anc2'][0][0] == 'Anc4'
    assert og.ogMap['Anc3'][0][0] == 'Anc4'
    assert 'Anc4' not in og.ogMap
    assert og.ogMap['Anc5'][0][0] in ['HUMAN', 'CHIMP', 'Anc6', 'Anc7']
    assert og.ogMap['Anc6'][0][0] in ['Anc5', 'MOUSE', 'RAT']
    assert og.ogMap['Anc7'][0][0] in ['Anc5', 'MOUSE', 'RAT']

    # Run 2: candidateChildFrac=1.0, on a fresh GreedyOutgroup and a fresh
    # candidate set so nothing carries over from the first run.
    og = GreedyOutgroup()
    og.importTree(mcTree)
    candidates = {'HUMAN', 'CHIMP', 'RAT'}
    og.greedy(candidateSet=candidates, candidateChildFrac=1.0)
    assert og.ogMap['Anc1'][0][0] == 'Anc7'
    assert og.ogMap['Anc2'][0][0] == 'Anc7'
    assert og.ogMap['Anc3'][0][0] == 'Anc7'
    assert 'Anc4' not in og.ogMap
    assert og.ogMap['Anc5'][0][0] in ['HUMAN', 'CHIMP', 'Anc7']
    assert og.ogMap['Anc6'][0][0] == 'RAT'
    assert og.ogMap['Anc7'][0][0] == 'RAT'
def readXML(self, path):
    """Load the project description stored in the XML file at ``path``.

    Fills in ``mcTree``, ``expMap``, ``expIDMap`` and ``inputSequenceMap``.
    ``inputSequenceIDMap``, ``outputSequenceIDMap`` and ``configID`` are set
    only when the corresponding attributes are present on the root element.
    """
    root = ET.parse(path).getroot()
    attrs = root.attrib

    # The species tree is stored as newick text inside the <tree> element.
    newick = root.find("tree").text
    self.mcTree = MultiCactusTree(
        NXNewick().parseString(newick, addImpliedRoots=False))

    # One <cactus> element per experiment: name -> path (and optional id).
    self.expMap = dict()
    self.expIDMap = dict()
    for cactusElem in root.findall("cactus"):
        expName = cactusElem.attrib["name"]
        expPath = cactusElem.attrib["experiment_path"]
        self.expMap[expName] = expPath
        if "experiment_id" in cactusElem.attrib:
            self.expIDMap[expName] = cactusElem.attrib["experiment_id"]

    # Names and values are stored as parallel whitespace-separated lists.
    seqNames = attrs["inputSequenceNames"].split()
    seqPaths = attrs["inputSequences"].split()
    self.inputSequenceMap = dict(zip(seqNames, seqPaths))

    if "inputSequenceIDs" in attrs:
        idNames = attrs["inputSequenceIDNames"].split()
        idValues = attrs["inputSequenceIDs"].split()
        self.inputSequenceIDMap = dict(zip(idNames, idValues))

    if "outputSequenceIDs" in attrs:
        outNames = attrs["outputSequenceNames"].split()
        outValues = attrs["outputSequenceIDs"].split()
        self.outputSequenceIDMap = dict(zip(outNames, outValues))

    logger.info("xmlRoot = %s" % ET.tostring(root))

    if "configID" in attrs:
        self.configID = attrs["configID"]

    self.mcTree.assignSubtreeRootNames(self.expMap)
def testDynamicOutgroupsJustLeaves(self):
    """Exercise DynamicOutgroup on the named nine-leaf tree.

    Two runs are made with ``maxNumOutgroups=3``: one with
    ``sequenceLossWeight=0.`` (where specific first outgroups are expected)
    and one with the default weight (where only the size and ordering
    invariants are checked).
    """
    newick = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
    mcTree = MultiCactusTree(NXNewick().parseString(newick, addImpliedRoots=False))
    mcTree.computeSubtreeRoots()

    byDistance = itemgetter(1)

    # Run with the sequence-loss term switched off.
    og = DynamicOutgroup()
    og.importTree(mcTree, self.blanchetteSeqMap)
    og.compute(maxNumOutgroups=3, sequenceLossWeight=0.)
    # make sure all entries have <= 3 outgroups.
    assert all(len(entry) <= 3 for entry in og.ogMap.values())
    # and for all entries, the closest must be first.
    assert all(entry == sorted(entry, key=byDistance)
               for entry in og.ogMap.values())
    # ordering is important!
    assert og.ogMap['Anc1'][0][0] == 'HUMAN'
    assert og.ogMap['Anc7'][0][0] == 'BABOON'

    # Run again with the default sequence-loss weight.
    og = DynamicOutgroup()
    og.importTree(mcTree, self.blanchetteSeqMap)
    og.compute(maxNumOutgroups=3)
    # make sure all entries have <= 3 outgroups.
    assert all(len(entry) <= 3 for entry in og.ogMap.values())
    # we keep dynamic outgroups sorted by distance too
    assert all(entry == sorted(entry, key=byDistance)
               for entry in og.ogMap.values())
def testSanity(self):
    """Round-trip both fixture newick strings through MultiCactusTree.

    Each string is parsed, wrapped in a MultiCactusTree and written back
    out; the result must equal the original text.
    """
    parser = NXNewick()
    for newick in (self.tree1, self.tree2):
        mcTree = MultiCactusTree(
            parser.parseString(newick, addImpliedRoots=False))
        roundTripped = NXNewick().writeString(mcTree)
        self.assertEqual(roundTripped, newick)
def __generateTrees(self):
    """Build the two fixture trees used by the tests.

    Stores the newick text in ``self.tree1`` / ``self.tree2`` and the parsed
    trees, with internal nodes named and subtree roots computed, in
    ``self.mcTree1`` / ``self.mcTree2``.
    """
    self.tree1 = '((((HUMAN:0.006969,CHIMP:0.009727):0.025291,BABOON:0.044568):0.11,(MOUSE:0.072818,RAT:0.081244):0.260342):0.02326,((DOG:0.07,CAT:0.07):0.087381,(PIG:0.06,COW:0.06):0.104728):0.04);'
    self.tree2 = '((raccoon:19.19959,bear:6.80041):0.846,((sea_lion:11.997,seal:12.003):7.52973,((monkey:100.8593,cat:47.14069):20.59201,weasel:18.87953):2.0946):3.87382,dog:25.46154);'

    newickParser = NXNewick()
    self.mcTree1 = MultiCactusTree(
        newickParser.parseString(self.tree1, addImpliedRoots=False))
    self.mcTree2 = MultiCactusTree(
        newickParser.parseString(self.tree2, addImpliedRoots=False))

    fixtures = (self.mcTree1, self.mcTree2)
    # Name every tree first, then compute subtree roots for every tree.
    for fixture in fixtures:
        fixture.nameUnlabeledInternalNodes()
    for fixture in fixtures:
        fixture.computeSubtreeRoots()
def get_leaves_and_outgroups(options, project, root):
    """Fish the leaves and outgroups for ``root`` out of its experiment XML.

    The experiment file is looked up in ``project.expMap[root]`` (as is done
    in ProgressiveUp.run).  ``options`` is accepted but not used here.

    Returns:
        A ``(leaves, outgroups)`` pair: the child names of the root of the
        subtree extracted at ``root``, and the experiment's outgroup genomes.
    """
    experimentXml = ET.parse(project.expMap[root]).getroot()
    experiment = ExperimentWrapper(experimentXml)

    subtree = MultiCactusTree(experiment.getTree()).extractSubTree(root)
    subtreeRootName = subtree.getRootName()

    return subtree.getChildNames(subtreeRootName), experiment.getOutgroupGenomes()
def cactusPrepare(options, project):
    """Annotate a SeqFile with ancestral names and output sequence paths.

    Reads ``options.seqFile`` and ``options.configFile``, makes sure
    ``options.outSeqDir`` exists, names the unlabeled internal nodes of the
    input tree, and rewrites sequence paths so they point into
    ``options.outSeqDir``.  The resulting SeqFile is written to
    ``options.outSeqFile`` and the plan returned by ``get_plan`` is printed.

    Side effects on ``options``:
        * with ``preprocessOnly`` set, ``configFile`` is redirected to a
          ``config.xml`` written into the output directory;
        * ``cactusOptions`` gets a ``--configFile`` flag appended unless the
          config file is the default ``cactus_progressive_config.xml``.

    Raises:
        RuntimeError: if the output sequence directory does not exist after
            the attempt to create it.
    """
    # read the input
    seqFile = SeqFile(options.seqFile)
    configNode = ET.parse(options.configFile).getroot()
    config = ConfigWrapper(configNode)

    # prepare output sequence directory
    # todo: support remote (ie s3) output directory
    try:
        os.makedirs(options.outSeqDir)
    except OSError:
        # Best effort only (the directory may already exist); the isdir
        # check below is what actually decides whether we can continue.
        pass
    if not os.path.isdir(options.outSeqDir):
        raise RuntimeError('Unable to create output sequence directory \'{}\''.format(options.outSeqDir))
    if not os.access(options.outSeqDir, os.W_OK):
        logger.warning('Output sequence directory is not writeable: \'{}\''.format(options.outSeqDir))

    # hack the configfile to skip preprocessing and write it to the output dir
    if options.preprocessOnly:
        config.removePreprocessors()
        options.configFile = os.path.join(options.outSeqDir, 'config.xml')
        config.writeXML(options.configFile)

    # pass through the config file to the options
    # todo (don't like second hard-code check of .xml path)
    if options.configFile != os.path.join(cactusRootPath(), "cactus_progressive_config.xml"):
        options.cactusOptions += ' --configFile {}'.format(options.configFile)

    # get the ancestor names
    tree = MultiCactusTree(seqFile.tree)
    tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())

    # make the output
    outSeqFile = SeqFile()
    outSeqFile.tree = tree
    # NOTE: this shares the mapping object with seqFile, so the updates
    # below are visible through seqFile.pathMap as well.
    outSeqFile.pathMap = seqFile.pathMap
    outSeqFile.outgroups = seqFile.outgroups

    # update paths for preprocessed leaves or inferred ancestors
    for node in outSeqFile.tree.breadthFirstTraversal():
        name = outSeqFile.tree.getName(node)
        leaf = outSeqFile.tree.isLeaf(node)
        # Leaves are always redirected; internal nodes only when they have no
        # path yet and we are doing more than preprocessing.
        if leaf or (name not in seqFile.pathMap and not options.preprocessOnly):
            out_basename = seqFile.pathMap[name] if name in seqFile.pathMap else '{}.fa'.format(name)
            outSeqFile.pathMap[name] = os.path.join(options.outSeqDir, os.path.basename(out_basename))

    # write the output
    with open(options.outSeqFile, 'w') as out_sf:
        out_sf.write(str(outSeqFile))

    # write the instructions
    print(get_plan(options, project, outSeqFile))
def setUp(self):
    """Create the tree fixtures and dummy FASTA files shared by the tests.

    Builds ``self.mcTrees`` from the random tree set (trees with size below
    50 only), the named Boreoeutherian tree and the Eutherian backbone tree,
    and writes one all-``A`` FASTA file per Blanchette species into a fresh
    temporary directory (paths recorded in ``self.blanchetteSeqMap``).
    """
    unittest.TestCase.setUp(self)
    self.trees = randomTreeSet()
    self.mcTrees = []
    self.tempDir = getTempDirectory(os.getcwd())
    self.tempFa = os.path.join(self.tempDir, "seq.fa")
    with open(self.tempFa, "w") as faFile:
        faFile.write(">temp\nNNNNNNNCNNNNAAAAAAAAAAAAAAANNNNNNN\n")

    # Random trees: every node is renamed "Node<id>" and mapped to the
    # same dummy FASTA file.
    self.dummySeqMaps = []
    for randomTree in self.trees:
        if not tree_is_small_enough(randomTree):
            continue
        mcTree = MultiCactusTree(randomTree, randomTree.degree())
        seqMap = dict()
        for nodeId in mcTree.breadthFirstTraversal():
            label = "Node%s" % str(nodeId)
            mcTree.setName(nodeId, label)
            seqMap[label] = self.tempFa
        mcTree.computeSubtreeRoots()
        mcTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(mcTree)
        self.dummySeqMaps.append(seqMap)

    # Boreoeutherian tree
    borTree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
    self.borMcTree = MultiCactusTree(
        NXNewick().parseString(borTree, addImpliedRoots=False))
    self.borMcTree.computeSubtreeRoots()
    self.borMcTree.nameUnlabeledInternalNodes()
    self.mcTrees.append(self.borMcTree)

    # Eutherian backbone tree
    backbone = '(((((((((((Homo_sapiens:0.00655,Pan_troglodytes:0.00684):0.00422,Gorilla_gorilla_gorilla:0.008964):0.009693,Pongo_abelii:0.01894):0.015511,Macaca_mulatta:0.043601):0.08444,Aotus_nancymaae:0.08):0.08,Microcebus_murinus:0.10612):0.043494,Galeopterus_variegatus:0.134937):0.04,((((Jaculus_jaculus:0.1,(Microtus_ochrogaster:0.14,(Mus_musculus:0.084509,Rattus_norvegicus:0.091589):0.047773):0.06015):0.122992,(Heterocephalus_glaber:0.1,(Cavia_porcellus:0.065629,(Chinchilla_lanigera:0.06,Octodon_degus:0.1):0.06):0.05):0.06015):0.05,Marmota_marmota:0.1):0.05,Oryctolagus_cuniculus:0.21569):0.04):0.040593,(((Sus_scrofa:0.12,(Orcinus_orca:0.069688,(Bos_taurus:0.04,Capra_hircus:0.04):0.09):0.045488):0.02,((Equus_caballus:0.109397,(Felis_catus:0.098612,(Canis_lupus_familiaris:0.052458,Mustela_putorius_furo:0.08):0.02):0.049845):0.02,(Pteropus_alecto:0.1,Eptesicus_fuscus:0.08):0.033706):0.03):0.025,Erinaceus_europaeus:0.278178):0.021227):0.023664,(((Loxodonta_africana:0.022242,Procavia_capensis:0.145358):0.076687,Chrysochloris_asiatica:0.04):0.05,Dasypus_novemcinctus:0.169809):0.02)backbone_root:0.234728,(Monodelphis_domestica:0.125686,Sarcophilus_harrisii:0.12):0.2151);'
    self.backboneTree = MultiCactusTree(
        NXNewick().parseString(backbone, addImpliedRoots=False))
    self.backboneTree.computeSubtreeRoots()
    self.backboneTree.nameUnlabeledInternalNodes()
    self.mcTrees.append(self.backboneTree)

    # Sequence length to write for each Blanchette species.
    seqLens = dict()
    seqLens["HUMAN"] = 57553
    seqLens["CHIMP"] = 57344
    seqLens["BABOON"] = 58960
    seqLens["MOUSE"] = 32750
    seqLens["RAT"] = 38436
    seqLens["DOG"] = 54187
    seqLens["CAT"] = 50283
    seqLens["PIG"] = 54843
    seqLens["COW"] = 55508

    self.blanchetteSeqMap = dict()
    for event, seqLen in seqLens.items():
        fastaPath = os.path.join(self.tempDir, event + ".fa")
        with open(fastaPath, "w") as faFile:
            # Header line, then seqLen copies of 'A', then a newline.
            faFile.write(">%s\n" % event)
            faFile.write('A' * seqLen)
            faFile.write('\n')
        self.blanchetteSeqMap[event] = fastaPath


def tree_is_small_enough(tree):
    """Return True when ``tree`` has fewer than 50 nodes per ``tree.size()``."""
    return tree.size() < 50
def __generateTrees(self):
    """Build the three fixture trees used by the tests.

    ``mcTree1`` and ``mcTree1a`` are both parsed from ``tree1`` (the latter
    with ``subtreeSize=4``); ``mcTree2`` is parsed from ``tree2`` with
    ``subtreeSize=3``.  All three get their internal nodes named and their
    subtree roots computed.
    """
    self.tree1 = "((((HUMAN:0.006969,CHIMP:0.009727):0.025291,BABOON:0.044568):0.11,(MOUSE:0.072818,RAT:0.081244):0.260342):0.02326,((DOG:0.07,CAT:0.07):0.087381,(PIG:0.06,COW:0.06):0.104728):0.04);"
    self.tree2 = "((raccoon:19.19959,bear:6.80041):0.846,((sea_lion:11.997,seal:12.003):7.52973,((monkey:100.8593,cat:47.14069):20.59201,weasel:18.87953):2.0946):3.87382,dog:25.46154);"

    newickParser = NXNewick()
    self.mcTree1 = MultiCactusTree(
        newickParser.parseString(self.tree1, addImpliedRoots=False))
    self.mcTree1a = MultiCactusTree(
        newickParser.parseString(self.tree1, addImpliedRoots=False),
        subtreeSize=4)
    self.mcTree2 = MultiCactusTree(
        newickParser.parseString(self.tree2, addImpliedRoots=False),
        subtreeSize=3)

    fixtures = (self.mcTree1, self.mcTree1a, self.mcTree2)
    # Name every tree first, then compute subtree roots for every tree.
    for fixture in fixtures:
        fixture.nameUnlabeledInternalNodes()
    for fixture in fixtures:
        fixture.computeSubtreeRoots()
def createMCProject(tree, experiment, config, options):
    """Create a properly initialized MultiCactusProject.

    TODO: This should really all be in the constructor for MultiCactusProject.

    Args:
        tree: species tree to wrap in a MultiCactusTree.
        experiment: supplies the genomes with sequence and their sequence IDs.
        config: supplies the internal-node prefix and the self-alignment flag,
            and is passed on to ``fillInOutgroups``.
        options: read for ``path``, ``root`` and ``outgroupNames``.

    Returns:
        The populated MultiCactusProject.

    Raises:
        RuntimeError: if ``options.root`` is given but cannot be resolved to
            a node of the tree.
    """
    mcTree = MultiCactusTree(tree)
    mcTree.nameUnlabeledInternalNodes(config.getDefaultInternalNodePrefix())
    mcTree.computeSubtreeRoots()

    mcProj = MultiCactusProject()
    for genome in experiment.getGenomesWithSequence():
        mcProj.inputSequenceMap[genome] = experiment.getSequenceID(genome)
    mcProj.mcTree = mcTree
    if config.getDoSelfAlignment():
        mcTree.addSelfEdges()

    # One experiment file per subtree root, under <path>/<name>/.
    for name in mcProj.mcTree.getSubtreeRootNames():
        expPath = "%s/%s/%s_experiment.xml" % (options.path, name, name)
        mcProj.expMap[name] = os.path.abspath(expPath)

    alignmentRootId = mcProj.mcTree.getRootId()
    if options.root is not None:
        try:
            alignmentRootId = mcProj.mcTree.getNodeId(options.root)
        except Exception:
            # Deliberately not a bare except: KeyboardInterrupt / SystemExit
            # must not be reported as a missing root name.
            raise RuntimeError("Specified root name %s not found in tree" % options.root)

    fillInOutgroups(mcProj, options.outgroupNames, config, alignmentRootId)

    # if necessary, we reroot the tree at the specified alignment root id.  all leaf genomes
    # that are no longer in the tree, but still used as outgroups, are moved into special fields
    # so that we can remember to, say, get their paths for preprocessing.
    specifyAlignmentRoot(mcProj, experiment, alignmentRootId)
    return mcProj
def testJustLeaves(self):
    """Check greedy outgroup selection when only leaves are candidates.

    Every leaf of the named tree is offered as a candidate with
    ``candidateChildFrac=2.``; the first outgroup recorded for each ancestor
    must be one of the expected leaf names.
    """
    newick = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
    mcTree = MultiCactusTree(NXNewick().parseString(newick, addImpliedRoots=False))
    mcTree.computeSubtreeRoots()

    og = GreedyOutgroup()
    og.importTree(mcTree)
    leafNames = {mcTree.getName(leaf) for leaf in mcTree.getLeaves()}
    og.greedy(candidateSet=leafNames, candidateChildFrac=2.)

    # (ancestor, acceptable first outgroups), checked in this order.
    expectedFirst = [
        ('Anc1', ['HUMAN']),
        ('Anc2', ['CAT', 'DOG']),
        ('Anc3', ['PIG', 'COW']),
        ('Anc4', ['CAT', 'DOG']),
        ('Anc5', ['HUMAN']),
        ('Anc6', ['CAT', 'DOG']),
        ('Anc7', ['BABOON']),
    ]
    for ancestor, allowed in expectedFirst:
        assert og.ogMap[ancestor][0][0] in allowed
def testAddSelf(self):
    """Check the newick output after ``addSelfEdges`` on a copy of mcTree1."""
    expected = '((((((((HUMAN:0.006969)HUMAN_self:0.006969,(CHIMP:0.009727)CHIMP_self:0.009727)Anc7:0.025291)Anc7_self:0.025291,(BABOON:0.044568)BABOON_self:0.044568)Anc3:0.11)Anc3_self:0.11,(((MOUSE:0.072818)MOUSE_self:0.072818,(RAT:0.081244)RAT_self:0.081244)Anc4:0.260342)Anc4_self:0.260342)Anc1:0.02326)Anc1_self:0.02326,(((((DOG:0.07)DOG_self:0.07,(CAT:0.07)CAT_self:0.07)Anc5:0.087381)Anc5_self:0.087381,(((PIG:0.06)PIG_self:0.06,(COW:0.06)COW_self:0.06)Anc6:0.104728)Anc6_self:0.104728)Anc2:0.04)Anc2_self:0.04)Anc0;'

    # Work on a new MultiCactusTree built from the fixture.
    selfTree = MultiCactusTree(self.mcTree1)
    selfTree.nameUnlabeledInternalNodes()
    selfTree.computeSubtreeRoots()
    selfTree.addSelfEdges()

    self.assertEqual(NXNewick().writeString(selfTree), expected)
def testAddSelf(self):
    """Check the newick output after ``addSelfEdges`` on a copy of mcTree1."""
    expected = "((((((((HUMAN:0.006969)HUMAN_self:0.006969,(CHIMP:0.009727)CHIMP_self:0.009727)Anc7:0.025291)Anc7_self:0.025291,(BABOON:0.044568)BABOON_self:0.044568)Anc3:0.11)Anc3_self:0.11,(((MOUSE:0.072818)MOUSE_self:0.072818,(RAT:0.081244)RAT_self:0.081244)Anc4:0.260342)Anc4_self:0.260342)Anc1:0.02326)Anc1_self:0.02326,(((((DOG:0.07)DOG_self:0.07,(CAT:0.07)CAT_self:0.07)Anc5:0.087381)Anc5_self:0.087381,(((PIG:0.06)PIG_self:0.06,(COW:0.06)COW_self:0.06)Anc6:0.104728)Anc6_self:0.104728)Anc2:0.04)Anc2_self:0.04)Anc0;"

    # Work on a new MultiCactusTree built from the fixture.
    selfTree = MultiCactusTree(self.mcTree1)
    selfTree.nameUnlabeledInternalNodes()
    selfTree.computeSubtreeRoots()
    selfTree.addSelfEdges()

    self.assertEqual(NXNewick().writeString(selfTree), expected)
def testMultipleOutgroups(self):
    """Check greedy selection of up to three outgroups per ancestor.

    Runs ``GreedyOutgroup.greedy`` with ``candidateChildFrac=0.5`` and
    ``maxNumOutgroups=3`` on the named tree, then checks that no entry has
    more than three outgroups, that each entry is sorted by its second
    field, and that selected ancestors get the expected outgroup names in
    the expected order.
    """
    tree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
    mcTree = MultiCactusTree(NXNewick().parseString(tree, addImpliedRoots=False))
    mcTree.computeSubtreeRoots()
    og = GreedyOutgroup()
    og.importTree(mcTree)
    og.greedy(candidateChildFrac=0.5, maxNumOutgroups=3)

    def outgroupNames(ancestor):
        # Build a real list: on Python 3 a bare map() is an iterator and
        # never compares equal to a list, which made these asserts fail.
        return [entry[0] for entry in og.ogMap[ancestor]]

    # make sure all entries have <= 3 outgroups.
    assert all(len(x) <= 3 for x in og.ogMap.values())
    # and for all entries, the closest must be first.
    assert all(x == sorted(x, key=itemgetter(1)) for x in og.ogMap.values())
    # ordering is important!
    assert outgroupNames('Anc4') == ['Anc1']
    assert outgroupNames('Anc7') == ['BABOON', 'Anc1', 'Anc5']
    # We avoid cycles, and choose post-order first, so this only
    # uses leaves.
    assert outgroupNames('Anc1') == ['HUMAN', 'CHIMP', 'BABOON']
def testMultipleOutgroupsJustLeaves(self):
    """Check greedy selection of up to three leaf-only outgroups.

    Every leaf of the named tree is offered as a candidate with
    ``candidateChildFrac=2.`` and ``maxNumOutgroups=3``.  The test checks
    that no entry has more than three outgroups, that each entry is sorted
    by its second field, and that ``Anc1`` and ``Anc7`` get the expected
    outgroups.
    """
    tree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
    mcTree = MultiCactusTree(NXNewick().parseString(tree, addImpliedRoots=False))
    mcTree.computeSubtreeRoots()
    og = GreedyOutgroup()
    og.importTree(mcTree)
    candidates = {mcTree.getName(x) for x in mcTree.getLeaves()}
    og.greedy(candidateSet=candidates, candidateChildFrac=2.,
              maxNumOutgroups=3)

    # make sure all entries have <= 3 outgroups.
    assert all(len(x) <= 3 for x in og.ogMap.values())
    # and for all entries, the closest must be first.
    assert all(x == sorted(x, key=itemgetter(1)) for x in og.ogMap.values())
    # ordering is important!
    # Build a real list: on Python 3 a bare map() is an iterator and never
    # compares equal to a list, which made this assert fail.
    assert [entry[0] for entry in og.ogMap['Anc1']] == ['HUMAN', 'CHIMP', 'BABOON']
    assert og.ogMap['Anc7'][0][0] == 'BABOON'
    assert og.ogMap['Anc7'][1][0] in ['CAT', 'DOG']
    assert og.ogMap['Anc7'][2][0] in ['CAT', 'DOG']
def readXML(self, path):
    """Load the project description stored in the XML file at ``path``.

    Fills in ``mcTree``, ``expMap``, ``inputSequences`` and
    ``outputSequenceDir`` from the document, then assigns the subtree root
    names on the tree from ``expMap``.
    """
    root = ET.parse(path).getroot()

    # The species tree is stored as newick text inside the <tree> element.
    newick = root.find("tree").text
    self.mcTree = MultiCactusTree(
        NXNewick().parseString(newick, addImpliedRoots=False))

    # One <cactus> element per experiment: name -> experiment path.
    self.expMap = dict()
    for cactusElem in root.findall("cactus"):
        expName = cactusElem.attrib["name"]
        expPath = cactusElem.attrib["experiment_path"]
        self.expMap[expName] = expPath

    attrs = root.attrib
    self.inputSequences = attrs["inputSequences"].split()
    self.outputSequenceDir = attrs["outputSequenceDir"]

    self.mcTree.assignSubtreeRootNames(self.expMap)
def getTree(self, onlyThisSubtree=False):
    """Return the tree parsed from the ``species_tree`` attribute.

    Args:
        onlyThisSubtree: when true, return only the subtree extracted at
            ``self.getRootGenome()`` (the reference node and its children)
            rather than the species tree including the outgroups as well.
    """
    speciesTree = NXNewick().parseString(
        self.xmlRoot.attrib["species_tree"], addImpliedRoots=False)
    if not onlyThisSubtree:
        return speciesTree

    # Subtree extraction needs named internal nodes and subtree roots.
    mcTree = MultiCactusTree(speciesTree)
    mcTree.nameUnlabeledInternalNodes()
    mcTree.computeSubtreeRoots()
    return mcTree.extractSubTree(self.getRootGenome())
def getTree(self, onlyThisSubtree=False):
    """Return the tree parsed from the ``species_tree`` attribute.

    Args:
        onlyThisSubtree: when true, return only the subtree extracted at
            ``self.getReferenceNameFromConfig()`` (the reference node and its
            children) rather than the species tree including the outgroups
            as well.
    """
    speciesTree = NXNewick().parseString(
        self.xmlRoot.attrib["species_tree"], addImpliedRoots=False)
    if not onlyThisSubtree:
        return speciesTree

    # Subtree extraction needs named internal nodes and subtree roots.
    mcTree = MultiCactusTree(speciesTree)
    mcTree.nameUnlabeledInternalNodes()
    mcTree.computeSubtreeRoots()
    return mcTree.extractSubTree(self.getReferenceNameFromConfig())
def setUp(self):
    """Create the tree fixtures and dummy FASTA files shared by the tests.

    Builds ``self.mcTrees`` from the random tree set (trees with size below
    500 only) and writes one all-``A`` FASTA file per Blanchette species
    into a fresh temporary directory (paths recorded in
    ``self.blanchetteSeqMap``).
    """
    unittest.TestCase.setUp(self)
    self.trees = randomTreeSet()
    self.mcTrees = []
    self.tempDir = getTempDirectory(os.getcwd())
    self.tempFa = os.path.join(self.tempDir, "seq.fa")
    with open(self.tempFa, "w") as faFile:
        faFile.write(">temp\nNNNNNNNCNNNNAAAAAAAAAAAAAAANNNNNNN\n")

    # Random trees: every node is renamed "Node<id>" and mapped to the
    # same dummy FASTA file.
    self.dummySeqMaps = []
    for randomTree in self.trees:
        if randomTree.size() >= 500:
            continue
        mcTree = MultiCactusTree(randomTree, randomTree.degree())
        seqMap = dict()
        for nodeId in mcTree.breadthFirstTraversal():
            label = "Node%s" % str(nodeId)
            mcTree.setName(nodeId, label)
            seqMap[label] = self.tempFa
        mcTree.computeSubtreeRoots()
        self.mcTrees.append(mcTree)
        self.dummySeqMaps.append(seqMap)

    # Sequence length to write for each Blanchette species.
    seqLens = dict()
    seqLens["HUMAN"] = 57553
    seqLens["CHIMP"] = 57344
    seqLens["BABOON"] = 58960
    seqLens["MOUSE"] = 32750
    seqLens["RAT"] = 38436
    seqLens["DOG"] = 54187
    seqLens["CAT"] = 50283
    seqLens["PIG"] = 54843
    seqLens["COW"] = 55508

    self.blanchetteSeqMap = dict()
    for event, seqLen in seqLens.items():
        fastaPath = os.path.join(self.tempDir, event + ".fa")
        with open(fastaPath, "w") as faFile:
            # Header line, then seqLen copies of 'A', then a newline.
            faFile.write(">%s\n" % event)
            faFile.write('A' * seqLen)
            faFile.write('\n')
        self.blanchetteSeqMap[event] = fastaPath
class TestCase(unittest.TestCase):
    """Unit tests for MultiCactusTree built on two small newick fixtures."""

    def setUp(self):
        """Reset the tree attributes and rebuild the fixtures."""
        unittest.TestCase.setUp(self)
        self.mcTree1 = None
        self.mcTree1a = None
        self.mcTree2 = None
        self.__generateTrees()

    def testSanity(self):
        """Parsing a fixture and writing it back must reproduce the text."""
        parser = NXNewick()

        firstTree = MultiCactusTree(
            parser.parseString(self.tree1, addImpliedRoots=False))
        self.assertEqual(NXNewick().writeString(firstTree), self.tree1)

        secondTree = MultiCactusTree(
            parser.parseString(self.tree2, addImpliedRoots=False),
            subtreeSize=3)
        self.assertEqual(NXNewick().writeString(secondTree), self.tree2)

    def testSubtrees(self):
        """Check subtree root names and one extracted subtree per fixture."""
        roots1 = [
            "Anc0", "Anc1", "Anc2", "Anc3", "Anc4", "Anc5", "Anc6", "Anc7"
        ]
        roots1a = ["Anc0", "Anc3", "Anc4", "Anc5", "Anc6"]
        roots2 = ["Anc0", "Anc1", "Anc2", "Anc3", "Anc4"]

        subTree1_a3 = '(Anc7:0.025291,BABOON:0.044568)Anc3;'
        subTree1a_a0 = '((Anc3:0.11,Anc4:0.260342)Anc1:0.02326,(Anc5:0.087381,Anc6:0.104728)Anc2:0.04)Anc0;'
        subTree2_a3 = '(monkey:100.8593,cat:47.14069)Anc5;'

        # (tree, expected root names, ancestor to extract, expected newick)
        cases = [
            (self.mcTree1, roots1, "Anc3", subTree1_a3),
            (self.mcTree1a, roots1a, "Anc0", subTree1a_a0),
            (self.mcTree2, roots2, "Anc5", subTree2_a3),
        ]
        for mcTree, expectedRoots, ancestor, expectedNewick in cases:
            rootNames = mcTree.getSubtreeRootNames()
            self.assertEqual(sorted(rootNames), sorted(expectedRoots))
            extracted = mcTree.extractSubTree(ancestor)
            self.assertEqual(NXNewick().writeString(extracted),
                             expectedNewick)

    def testAddSelf(self):
        """Check the newick output after ``addSelfEdges``."""
        expected = '((((((((HUMAN:0.006969)HUMAN_self:0.006969,(CHIMP:0.009727)CHIMP_self:0.009727)Anc7:0.025291)Anc7_self:0.025291,(BABOON:0.044568)BABOON_self:0.044568)Anc3:0.11)Anc3_self:0.11,(((MOUSE:0.072818)MOUSE_self:0.072818,(RAT:0.081244)RAT_self:0.081244)Anc4:0.260342)Anc4_self:0.260342)Anc1:0.02326)Anc1_self:0.02326,(((((DOG:0.07)DOG_self:0.07,(CAT:0.07)CAT_self:0.07)Anc5:0.087381)Anc5_self:0.087381,(((PIG:0.06)PIG_self:0.06,(COW:0.06)COW_self:0.06)Anc6:0.104728)Anc6_self:0.104728)Anc2:0.04)Anc2_self:0.04)Anc0;'

        selfTree = MultiCactusTree(self.mcTree1)
        selfTree.nameUnlabeledInternalNodes()
        selfTree.computeSubtreeRoots()
        selfTree.addSelfEdges()

        self.assertEqual(NXNewick().writeString(selfTree), expected)

    def testAddOutgroup(self):
        """Check ``addOutgroup`` on the full fixture and on a one-leaf tree."""
        expectedFull = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc3:0.11,(MOUSE:0.072818,RAT:0.081244)Anc4:0.260342)Anc1:0.02326,((DOG:0.07,CAT:0.07)Anc5:0.087381,(PIG:0.06,COW:0.06)Anc6:0.104728)Anc2:0.04,outgroup:1.7)Anc0;'
        fullTree = MultiCactusTree(self.mcTree1)
        fullTree.nameUnlabeledInternalNodes()
        fullTree.computeSubtreeRoots()
        fullTree.addOutgroup("outgroup", 1.7)
        self.assertEqual(NXNewick().writeString(fullTree), expectedFull)

        # A tree that is a single leaf: the 2.2 distance is expected to show
        # up as 1.1 on each of the two resulting branches.
        expectedLeaf = "(A:1.1,outgroup:1.1);"
        leafNewick = "A;"
        leafTree = MultiCactusTree(
            NXNewick().parseString(leafNewick, addImpliedRoots=False))
        leafTree.nameUnlabeledInternalNodes()
        leafTree.computeSubtreeRoots()
        leafTree.addOutgroup("outgroup", 2.2)
        self.assertEqual(NXNewick().writeString(leafTree), expectedLeaf)

    def testExtractSpanningTree(self):
        """Tests whether extracting a binary spanning tree works correctly."""
        writer = NXNewick()
        originalNewick = writer.writeString(self.mcTree1)

        # Check a dead-simple spanning tree with 3 closely related leaves.
        closeSpan = self.mcTree1.extractSpanningTree(
            ["HUMAN", "CHIMP", "BABOON"])
        # Check that the existing tree hasn't been modified (OK, a bit
        # silly, but just in case).
        self.assertEqual(NXNewick().writeString(self.mcTree1), originalNewick)
        # Check the actual spanning tree.
        self.assertEqual(
            NXNewick().writeString(closeSpan),
            "((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc3;"
        )

        # Now test a more complicated tree, where we should remove as
        # many of the ancestors as possible (they will add extra
        # losses for no reason!).
        distantSpan = self.mcTree1.extractSpanningTree(
            ["HUMAN", "CHIMP", "CAT"])
        self.assertEqual(NXNewick().writeString(self.mcTree1), originalNewick)
        self.assertEqual(
            NXNewick().writeString(distantSpan),
            "((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.158551,CAT:0.197381)Anc0;")

    def __generateTrees(self):
        """Parse the fixture newick strings into the three test trees."""
        self.tree1 = '((((HUMAN:0.006969,CHIMP:0.009727):0.025291,BABOON:0.044568):0.11,(MOUSE:0.072818,RAT:0.081244):0.260342):0.02326,((DOG:0.07,CAT:0.07):0.087381,(PIG:0.06,COW:0.06):0.104728):0.04);'
        self.tree2 = '((raccoon:19.19959,bear:6.80041):0.846,((sea_lion:11.997,seal:12.003):7.52973,((monkey:100.8593,cat:47.14069):20.59201,weasel:18.87953):2.0946):3.87382,dog:25.46154);'

        newickParser = NXNewick()
        self.mcTree1 = MultiCactusTree(
            newickParser.parseString(self.tree1, addImpliedRoots=False))
        self.mcTree1a = MultiCactusTree(
            newickParser.parseString(self.tree1, addImpliedRoots=False),
            subtreeSize=4)
        self.mcTree2 = MultiCactusTree(
            newickParser.parseString(self.tree2, addImpliedRoots=False),
            subtreeSize=3)

        fixtures = (self.mcTree1, self.mcTree1a, self.mcTree2)
        # Name every tree first, then compute subtree roots for every tree.
        for fixture in fixtures:
            fixture.nameUnlabeledInternalNodes()
        for fixture in fixtures:
            fixture.computeSubtreeRoots()
def createMCProject(tree, experiment, config, options):
    """Create a MultiCactusProject and choose its outgroups.

    Args:
        tree: species tree to wrap in a MultiCactusTree.
        experiment: supplies the input sequences.
        config: supplies the subtree size, internal-node prefix,
            self-alignment flag and all outgroup settings.
        options: read for ``path``, ``root`` and ``outgroupNames``.

    Returns:
        The populated MultiCactusProject.  ``outgroup`` is left as ``None``
        when the configured outgroup strategy is ``'none'``.

    Raises:
        RuntimeError: if ``options.root`` is given but cannot be resolved to
            a node of the tree, or if the outgroup strategy is not one of
            ``greedy``, ``greedyLeaves``, ``greedyPreference``, ``dynamic``
            or ``none``.
    """
    mcTree = MultiCactusTree(tree, config.getSubtreeSize())
    mcTree.nameUnlabeledInternalNodes(config.getDefaultInternalNodePrefix())
    mcTree.computeSubtreeRoots()

    mcProj = MultiCactusProject()
    mcProj.mcTree = mcTree
    # Copy the list so the project does not share it with the experiment.
    mcProj.inputSequences = experiment.getSequences()[:]
    if config.getDoSelfAlignment():
        mcTree.addSelfEdges()

    # One experiment file per subtree root, under <path>/<name>/.
    for name in mcProj.mcTree.getSubtreeRootNames():
        expPath = "%s/%s/%s_experiment.xml" % (options.path, name, name)
        mcProj.expMap[name] = os.path.abspath(expPath)

    alignmentRootId = mcProj.mcTree.getRootId()
    if options.root is not None:
        try:
            alignmentRootId = mcProj.mcTree.getNodeId(options.root)
        except Exception:
            # Deliberately not a bare except: KeyboardInterrupt / SystemExit
            # must not be reported as a missing root name.
            raise RuntimeError("Specified root name %s not found in tree" % options.root)

    mcProj.outgroup = None
    strategy = config.getOutgroupStrategy()
    if strategy == 'greedy':
        # use the provided outgroup candidates, or use all outgroups
        # as candidates if none are given
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        mcProj.outgroup.greedy(
            threshold=config.getOutgroupThreshold(),
            candidateSet=options.outgroupNames,
            candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
            maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy == 'greedyLeaves':
        # use all leaves as outgroups, unless outgroup candidates are given
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        ogSet = options.outgroupNames
        if ogSet is None:
            ogSet = {mcProj.mcTree.getName(x)
                     for x in mcProj.mcTree.getLeaves()}
        mcProj.outgroup.greedy(threshold=config.getOutgroupThreshold(),
                               candidateSet=ogSet,
                               candidateChildFrac=2.0,
                               maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy == 'greedyPreference':
        # prefer the provided outgroup candidates, if any, but use
        # other nodes as "filler" if we can't find enough.
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        # First pass restricted to the candidates, second pass unrestricted.
        mcProj.outgroup.greedy(
            threshold=config.getOutgroupThreshold(),
            candidateSet=options.outgroupNames,
            candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
            maxNumOutgroups=config.getMaxNumOutgroups())
        mcProj.outgroup.greedy(
            threshold=config.getOutgroupThreshold(),
            candidateSet=None,
            candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
            maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy == 'dynamic':
        # dynamic programming algorithm that exactly optimizes probability
        # that base in target node aligns to at least one base in the
        # outgroup set.  Caveats are that it only returns leaves, and
        # the model used for optimization is super naive. Still, it does
        # some things better than greedy approaches such as properly account
        # for phylogenetic redundancy, as well as try to factor assembly
        # size/quality automatically.
        mcProj.outgroup = DynamicOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree,
                                   mcProj.getInputSequenceMap(),
                                   alignmentRootId,
                                   candidateSet=options.outgroupNames)
        mcProj.outgroup.compute(maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy != 'none':
        raise RuntimeError("Could not understand outgroup strategy %s" % strategy)

    # if necessary, we reroot the tree at the specified alignment root id.  all leaf genomes
    # that are no longer in the tree, but still used as outgroups, are moved into special fields
    # so that we can remember to, say, get their paths for preprocessing.
    specifyAlignmentRoot(mcProj, alignmentRootId)
    return mcProj
class TestCase(unittest.TestCase):
    """Tests for GreedyOutgroup and DynamicOutgroup selection.

    Fixtures are a set of small random trees, a nine-leaf tree with named
    ancestors (``borMcTree``) and a larger backbone tree (``backboneTree``).
    """

    def setUp(self):
        """Create the fixture trees and the temporary FASTA files."""
        unittest.TestCase.setUp(self)
        self.trees = randomTreeSet()
        self.mcTrees = []
        self.tempDir = getTempDirectory(os.getcwd())
        self.tempFa = os.path.join(self.tempDir, "seq.fa")
        with open(self.tempFa, "w") as f:
            f.write(">temp\nNNNNNNNCNNNNAAAAAAAAAAAAAAANNNNNNN\n")
        self.dummySeqMaps = []
        for tree in self.trees:
            if tree.size() < 50:
                mcTree = MultiCactusTree(tree, tree.degree())
                seqMap = dict()
                for i in mcTree.breadthFirstTraversal():
                    mcTree.setName(i, "Node%s" % str(i))
                    seqMap["Node%s" % str(i)] = self.tempFa
                mcTree.computeSubtreeRoots()
                mcTree.nameUnlabeledInternalNodes()
                self.mcTrees.append(mcTree)
                self.dummySeqMaps.append(seqMap)

        # Boreoeutherian tree
        borTree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
        self.borMcTree = MultiCactusTree(NXNewick().parseString(borTree, addImpliedRoots=False))
        self.borMcTree.computeSubtreeRoots()
        self.borMcTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.borMcTree)

        # Eutherian backbone tree
        backbone = '(((((((((((Homo_sapiens:0.00655,Pan_troglodytes:0.00684):0.00422,Gorilla_gorilla_gorilla:0.008964):0.009693,Pongo_abelii:0.01894):0.015511,Macaca_mulatta:0.043601):0.08444,Aotus_nancymaae:0.08):0.08,Microcebus_murinus:0.10612):0.043494,Galeopterus_variegatus:0.134937):0.04,((((Jaculus_jaculus:0.1,(Microtus_ochrogaster:0.14,(Mus_musculus:0.084509,Rattus_norvegicus:0.091589):0.047773):0.06015):0.122992,(Heterocephalus_glaber:0.1,(Cavia_porcellus:0.065629,(Chinchilla_lanigera:0.06,Octodon_degus:0.1):0.06):0.05):0.06015):0.05,Marmota_marmota:0.1):0.05,Oryctolagus_cuniculus:0.21569):0.04):0.040593,(((Sus_scrofa:0.12,(Orcinus_orca:0.069688,(Bos_taurus:0.04,Capra_hircus:0.04):0.09):0.045488):0.02,((Equus_caballus:0.109397,(Felis_catus:0.098612,(Canis_lupus_familiaris:0.052458,Mustela_putorius_furo:0.08):0.02):0.049845):0.02,(Pteropus_alecto:0.1,Eptesicus_fuscus:0.08):0.033706):0.03):0.025,Erinaceus_europaeus:0.278178):0.021227):0.023664,(((Loxodonta_africana:0.022242,Procavia_capensis:0.145358):0.076687,Chrysochloris_asiatica:0.04):0.05,Dasypus_novemcinctus:0.169809):0.02)backbone_root:0.234728,(Monodelphis_domestica:0.125686,Sarcophilus_harrisii:0.12):0.2151);'
        self.backboneTree = MultiCactusTree(NXNewick().parseString(backbone, addImpliedRoots=False))
        self.backboneTree.computeSubtreeRoots()
        self.backboneTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.backboneTree)

        seqLens = dict()
        seqLens["HUMAN"] = 57553
        seqLens["CHIMP"] = 57344
        seqLens["BABOON"] = 58960
        seqLens["MOUSE"] = 32750
        seqLens["RAT"] = 38436
        seqLens["DOG"] = 54187
        seqLens["CAT"] = 50283
        seqLens["PIG"] = 54843
        seqLens["COW"] = 55508
        # One all-'A' FASTA file per leaf of the Boreoeutherian tree, with
        # the length taken from the table above.
        self.blanchetteSeqMap = dict()
        for event, seqLen in seqLens.items():
            p = os.path.join(self.tempDir, event + ".fa")
            with open(p, "w") as f:
                f.write(">%s\n" % event)
                f.write(''.join(['A'] * seqLen))
                f.write('\n')
            self.blanchetteSeqMap[event] = p

    def tearDown(self):
        """Remove the temporary directory created in setUp."""
        unittest.TestCase.tearDown(self)
        system("rm -rf %s" % self.tempDir)

    def testJustLeaves(self):
        """Greedy selection restricted to leaf candidates."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set([self.borMcTree.getName(x)
                          for x in self.borMcTree.getLeaves()])
        og.greedy(candidateSet=candidates, candidateChildFrac=2.)
        assert og.ogMap['Anc1'][0][0] == 'HUMAN'
        assert og.ogMap['Anc2'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc3'][0][0] in ['PIG', 'COW']
        assert og.ogMap['Anc4'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc5'][0][0] == 'HUMAN'
        assert og.ogMap['Anc6'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc7'][0][0] == 'BABOON'

    def testHeightTable(self):
        """Make sure the height-table is calculated correctly."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        htable = og.heightTable()
        # assertEqual rather than the deprecated assertEquals alias.
        self.assertEqual(htable[self.borMcTree.getNodeId('HUMAN')], 0)
        self.assertEqual(htable[self.borMcTree.getNodeId('PIG')], 0)
        self.assertEqual(htable[self.borMcTree.getNodeId('RAT')], 0)
        self.assertEqual(htable[self.borMcTree.getNodeId('Anc7')], 1)
        self.assertEqual(htable[self.borMcTree.getNodeId('Anc1')], 2)
        self.assertEqual(htable[self.borMcTree.getNodeId('Anc0')], 4)

    def testZeroThreshold(self):
        """A threshold of 0 should produce outgroup sets that cause no
        additional depth in the resulting schedule."""
        tree = self.backboneTree
        og = GreedyOutgroup()
        og.importTree(tree)
        og.greedy(candidateSet=set(['Homo_sapiens', 'Mus_musculus']),
                  threshold=0, maxNumOutgroups=3, candidateChildFrac=0.75)
        og.greedy(threshold=0, maxNumOutgroups=3, candidateChildFrac=0.75)
        htable = og.heightTable()
        for node, outgroups in og.ogMap.items():
            for outgroup, _ in outgroups:
                # For the outgroup assignment to create no
                # additional dependencies, each outgroup must have
                # a height lower than the node it's outgroup to
                # (or be a leaf)
                self.assertTrue(htable[tree.getNodeId(outgroup)] < htable[tree.getNodeId(node)]
                                or htable[tree.getNodeId(outgroup)] == 0)

    def testCandidates(self):
        """Greedy selection with a restricted candidate set, at two
        different candidateChildFrac values."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(['HUMAN', 'CHIMP', 'RAT'])
        og.greedy(candidateSet=candidates, candidateChildFrac=0.5)
        assert og.ogMap['Anc1'][0][0] == 'Anc4'
        assert og.ogMap['Anc2'][0][0] == 'Anc4'
        assert og.ogMap['Anc3'][0][0] == 'Anc4'
        assert 'Anc4' not in og.ogMap
        assert og.ogMap['Anc5'][0][0] in ['HUMAN', 'CHIMP', 'Anc6', 'Anc7']
        assert og.ogMap['Anc6'][0][0] in ['Anc5', 'MOUSE', 'RAT']
        assert og.ogMap['Anc7'][0][0] in ['Anc5', 'MOUSE', 'RAT']

        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(['HUMAN', 'CHIMP', 'RAT'])
        og.greedy(candidateSet=candidates, candidateChildFrac=1.0)
        assert og.ogMap['Anc1'][0][0] == 'Anc7'
        assert og.ogMap['Anc2'][0][0] == 'Anc7'
        assert og.ogMap['Anc3'][0][0] == 'Anc7'
        assert 'Anc4' not in og.ogMap
        assert og.ogMap['Anc5'][0][0] in ['HUMAN', 'CHIMP', 'Anc7']
        assert og.ogMap['Anc6'][0][0] == 'RAT'
        assert og.ogMap['Anc7'][0][0] == 'RAT'

    def testGeneralBetterThanLeaves(self):
        """Unrestricted candidates must never give a larger best distance
        than leaf-only candidates."""
        for tree in self.mcTrees:
            og1 = GreedyOutgroup()
            og1.importTree(tree)
            candidates = set([tree.getName(x) for x in tree.getLeaves()])
            og1.greedy(candidateSet=candidates, candidateChildFrac=2.)
            og2 = GreedyOutgroup()
            og2.importTree(tree)
            og2.greedy(candidateSet=None)

            for i in og1.ogMap:
                assert i in og2.ogMap
                dist1 = og1.ogMap[i][0][1]
                dist2 = og2.ogMap[i][0][1]
                assert dist2 <= dist1

    def testGeneralConstrainedBetterThanLeaves(self):
        """Same comparison as above, with threshold=2 on the unrestricted
        run."""
        for tree in self.mcTrees:
            og1 = GreedyOutgroup()
            og1.importTree(tree)
            candidates = set([tree.getName(x) for x in tree.getLeaves()])
            og1.greedy(candidateSet=candidates, candidateChildFrac=2.)
            og2 = GreedyOutgroup()
            og2.importTree(tree)
            og2.greedy(candidateSet=None, threshold=2)

            for i in og1.ogMap:
                assert i in og2.ogMap
                dist1 = og1.ogMap[i][0][1]
                dist2 = og2.ogMap[i][0][1]
                assert dist2 <= dist1

    def testMultipleOutgroups(self):
        """Up to three outgroups per node, sorted by distance."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        og.greedy(candidateChildFrac=0.5, maxNumOutgroups=3)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
        # and for all entries, the closest must be first.
        assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                       og.ogMap.values()))
        # ordering is important!
        # list(...) is required: on Python 3 map() returns an iterator,
        # which never compares equal to a list.
        assert list(map(itemgetter(0), og.ogMap['Anc4'])) == ['Anc1']
        assert list(map(itemgetter(0), og.ogMap['Anc7'])) == ['BABOON', 'Anc1', 'Anc5']
        # We avoid cycles, and choose post-order first, so this only
        # uses leaves.
        assert list(map(itemgetter(0), og.ogMap['Anc1'])) == ['HUMAN', 'CHIMP', 'BABOON']

    def testMultipleOutgroupsJustLeaves(self):
        """Up to three outgroups per node, leaf candidates only."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set([self.borMcTree.getName(x)
                          for x in self.borMcTree.getLeaves()])
        og.greedy(candidateSet=candidates, candidateChildFrac=2.,
                  maxNumOutgroups=3)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
        # and for all entries, the closest must be first.
        assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                       og.ogMap.values()))
        # ordering is important!
        # list(...) is required: on Python 3 map() returns an iterator.
        assert list(map(itemgetter(0), og.ogMap['Anc1'])) == ['HUMAN', 'CHIMP', 'BABOON']
        assert og.ogMap['Anc7'][0][0] == 'BABOON'
        assert og.ogMap['Anc7'][1][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc7'][2][0] in ['CAT', 'DOG']

    def testMultipleOutgroupsOnRandomTrees(self):
        """Size and ordering invariants hold on every fixture tree."""
        for tree in self.mcTrees:
            og = GreedyOutgroup()
            og.importTree(tree)
            og.greedy(candidateChildFrac=0.5, maxNumOutgroups=3)
            # make sure all entries have <= 3 outgroups.
            assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
            # and for all entries, the closest must be first.
            assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                           og.ogMap.values()))

    def testDynamicOutgroupsOnRandomTrees(self):
        """Size and ordering invariants for DynamicOutgroup."""
        # zip() stops at the shorter list, so only the random trees (the
        # ones that have a dummy sequence map) are exercised here.
        for tree, seqMap in zip(self.mcTrees, self.dummySeqMaps):
            degree = max([len(tree.getChildren(x))
                          for x in tree.breadthFirstTraversal()])
            if degree < 8:
                og = DynamicOutgroup()
                og.edgeLen = 5
                og.importTree(tree, seqMap)
                og.compute(maxNumOutgroups=3)
                # make sure all entries have <= 3 outgroups.
                assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
                # and for all entries, the closest must be first.
                # (this will be true because all sequences are the same)
                assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                               og.ogMap.values()))

    def testDynamicOutgroupsJustLeaves(self):
        """DynamicOutgroup on the Boreoeutherian tree, with and without a
        sequence-loss weight of zero."""
        og = DynamicOutgroup()
        og.importTree(self.borMcTree, self.blanchetteSeqMap)
        og.compute(maxNumOutgroups=3, sequenceLossWeight=0.)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
        # and for all entries, the closest must be first.
        assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                       og.ogMap.values()))
        # ordering is important!
        assert og.ogMap['Anc1'][0][0] == 'HUMAN'
        assert og.ogMap['Anc7'][0][0] == 'BABOON'

        og = DynamicOutgroup()
        og.importTree(self.borMcTree, self.blanchetteSeqMap)
        og.compute(maxNumOutgroups=3)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
        # we keep dynamic outgroups sorted by distance too
        assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                       og.ogMap.values()))

    def testMultipleIdenticalRunsProduceSameResult(self):
        """The code now allows for multiple greedy() calls with different
        candidate sets, so that some outgroups can be 'preferred' over
        others without being the only candidates.
        Check that running greedy() multiple times with the same
        parameters gives the same result as running it once.
        """
        for tree in self.mcTrees:
            ogOnce = GreedyOutgroup()
            ogOnce.importTree(tree)
            ogOnce.greedy(maxNumOutgroups=3)
            ogMultipleTimes = GreedyOutgroup()
            ogMultipleTimes.importTree(tree)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            # make sure all entries have <= 3 outgroups.
            assert all(map(lambda x: len(x) <= 3,
                           ogMultipleTimes.ogMap.values()))
            # and for all entries, the closest must be first.
            assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                           ogMultipleTimes.ogMap.values()))
            # Check that the maps are equal. Can't compare them
            # directly since python will convert them to ordered
            # association lists.
            assert len(ogOnce.ogMap) == len(ogMultipleTimes.ogMap)
            for i in ogOnce.ogMap:
                assert i in ogMultipleTimes.ogMap
                assert ogOnce.ogMap[i] == ogMultipleTimes.ogMap[i]

    def testPreferredCandidateSets(self):
        """Test that running greedy() multiple times with different
        candidate sets will behave properly, i.e. keep all the
        existing outgroup assignments and fill in more on the second
        run."""
        for tree in self.mcTrees:
            ogOnce = GreedyOutgroup()
            ogOnce.importTree(tree)
            nodes = [j for j in tree.postOrderTraversal()]
            candidateSet = set([tree.getName(i) for i in
                                random.sample(nodes, min(20, len(nodes)))])
            ogOnce.greedy(candidateSet=candidateSet, maxNumOutgroups=3)
            ogTwice = GreedyOutgroup()
            ogTwice.importTree(tree)
            ogTwice.greedy(candidateSet=candidateSet, maxNumOutgroups=3)
            ogTwice.greedy(maxNumOutgroups=3)
            # make sure all entries have <= 3 outgroups.
            assert all(map(lambda x: len(x) <= 3, ogTwice.ogMap.values()))
            # and for all entries, the closest must be first.
            assert all(map(lambda x: x == sorted(x, key=itemgetter(1)),
                           ogTwice.ogMap.values()))
            for node in ogTwice.ogMap:
                if node in ogOnce.ogMap:
                    # the ogMap entry in ogOnce should be a subset of
                    # the ogMap entry for ogTwice
                    oneRunOutgroups = ogOnce.ogMap[node]
                    twoRunOutgroups = ogTwice.ogMap[node]
                    assert len(twoRunOutgroups) >= len(oneRunOutgroups)
                    for i in oneRunOutgroups:
                        assert i in twoRunOutgroups

    def testNoOutgroupIsADescendantOfAnother(self):
        """No two outgroups should be on the same path to the root."""
        for tree in self.mcTrees:
            tree.nameUnlabeledInternalNodes()
            og = GreedyOutgroup()
            og.importTree(tree)
            og.greedy(maxNumOutgroups=3)
            for source in og.ogMap:
                for (sink1, _) in og.ogMap[source]:
                    for (sink2, _) in og.ogMap[source]:
                        if sink1 != sink2:
                            sink1Id = tree.nameToId[sink1]
                            sink2Id = tree.nameToId[sink2]
                            assert sink1Id not in tree.postOrderTraversal(sink2Id)
                            assert sink2Id not in tree.postOrderTraversal(sink1Id)
class TestCase(unittest.TestCase):
    """Tests for MultiCactusTree: newick round trips, subtree extraction,
    self edges, outgroup insertion and spanning trees."""

    def setUp(self):
        """Reset the tree attributes, then build the three fixture trees."""
        unittest.TestCase.setUp(self)
        self.mcTree1 = None
        self.mcTree1a = None
        self.mcTree2 = None
        self.__generateTrees()

    def testSanity(self):
        """Parsing a newick string and writing it back is the identity."""
        reader = NXNewick()

        parsed1 = MultiCactusTree(
            reader.parseString(self.tree1, addImpliedRoots=False))
        self.assertEqual(NXNewick().writeString(parsed1), self.tree1)

        parsed2 = MultiCactusTree(
            reader.parseString(self.tree2, addImpliedRoots=False),
            subtreeSize=3)
        self.assertEqual(NXNewick().writeString(parsed2), self.tree2)

    def testSubtrees(self):
        """Subtree root names and one extracted subtree per fixture tree."""
        # (tree, expected subtree roots, node to extract, expected newick)
        cases = [
            (self.mcTree1,
             ["Anc0", "Anc1", "Anc2", "Anc3", "Anc4", "Anc5", "Anc6", "Anc7"],
             "Anc3",
             "(Anc7:0.025291,BABOON:0.044568)Anc3;"),
            (self.mcTree1a,
             ["Anc0", "Anc3", "Anc4", "Anc5", "Anc6"],
             "Anc0",
             "((Anc3:0.11,Anc4:0.260342)Anc1:0.02326,(Anc5:0.087381,Anc6:0.104728)Anc2:0.04)Anc0;"),
            (self.mcTree2,
             ["Anc0", "Anc1", "Anc2", "Anc3", "Anc4"],
             "Anc5",
             "(monkey:100.8593,cat:47.14069)Anc5;"),
        ]
        for mcTree, expectedRoots, ancestor, expectedNewick in cases:
            rootNames = mcTree.getSubtreeRootNames()
            self.assertEqual(sorted(rootNames), sorted(expectedRoots))
            extracted = mcTree.extractSubTree(ancestor)
            self.assertEqual(NXNewick().writeString(extracted),
                             expectedNewick)

    def testAddSelf(self):
        """addSelfEdges produces the expected '_self' newick."""
        expected = "((((((((HUMAN:0.006969)HUMAN_self:0.006969,(CHIMP:0.009727)CHIMP_self:0.009727)Anc7:0.025291)Anc7_self:0.025291,(BABOON:0.044568)BABOON_self:0.044568)Anc3:0.11)Anc3_self:0.11,(((MOUSE:0.072818)MOUSE_self:0.072818,(RAT:0.081244)RAT_self:0.081244)Anc4:0.260342)Anc4_self:0.260342)Anc1:0.02326)Anc1_self:0.02326,(((((DOG:0.07)DOG_self:0.07,(CAT:0.07)CAT_self:0.07)Anc5:0.087381)Anc5_self:0.087381,(((PIG:0.06)PIG_self:0.06,(COW:0.06)COW_self:0.06)Anc6:0.104728)Anc6_self:0.104728)Anc2:0.04)Anc2_self:0.04)Anc0;"
        copy = MultiCactusTree(self.mcTree1)
        copy.nameUnlabeledInternalNodes()
        copy.computeSubtreeRoots()
        copy.addSelfEdges()
        self.assertEqual(NXNewick().writeString(copy), expected)

    def testAddOutgroup(self):
        """addOutgroup on the full tree and on a single-leaf tree."""
        expectedFull = "((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc3:0.11,(MOUSE:0.072818,RAT:0.081244)Anc4:0.260342)Anc1:0.02326,((DOG:0.07,CAT:0.07)Anc5:0.087381,(PIG:0.06,COW:0.06)Anc6:0.104728)Anc2:0.04,outgroup:1.7)Anc0;"
        fullTree = MultiCactusTree(self.mcTree1)
        fullTree.nameUnlabeledInternalNodes()
        fullTree.computeSubtreeRoots()
        fullTree.addOutgroup("outgroup", 1.7)
        self.assertEqual(NXNewick().writeString(fullTree), expectedFull)

        # Single-leaf input: the expected string below has the requested
        # distance of 2.2 appearing as 1.1 on each of the two branches.
        expectedLeaf = "(A:1.1,outgroup:1.1);"
        singleLeaf = MultiCactusTree(
            NXNewick().parseString("A;", addImpliedRoots=False))
        singleLeaf.nameUnlabeledInternalNodes()
        singleLeaf.computeSubtreeRoots()
        singleLeaf.addOutgroup("outgroup", 2.2)
        self.assertEqual(NXNewick().writeString(singleLeaf), expectedLeaf)

    def testExtractSpanningTree(self):
        """Tests whether extracting a binary spanning tree works correctly."""
        before = NXNewick().writeString(self.mcTree1)

        # Dead-simple case: three closely related leaves.
        closeSpan = self.mcTree1.extractSpanningTree(
            ["HUMAN", "CHIMP", "BABOON"])
        # The source tree must be left untouched (a bit silly, but just
        # in case).
        self.assertEqual(NXNewick().writeString(self.mcTree1), before)
        # The spanning tree itself.
        self.assertEqual(
            NXNewick().writeString(closeSpan),
            "((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc3;")

        # A more complicated tree, where as many of the ancestors as
        # possible should be removed (they would add extra losses for
        # no reason!).
        distantSpan = self.mcTree1.extractSpanningTree(
            ["HUMAN", "CHIMP", "CAT"])
        self.assertEqual(NXNewick().writeString(self.mcTree1), before)
        self.assertEqual(
            NXNewick().writeString(distantSpan),
            "((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.158551,CAT:0.197381)Anc0;")

    def __generateTrees(self):
        """Parse the two newick fixtures into the three MultiCactusTrees
        used by the tests (tree1 is parsed twice, the second time with
        subtreeSize=4; tree2 uses subtreeSize=3)."""
        self.tree1 = "((((HUMAN:0.006969,CHIMP:0.009727):0.025291,BABOON:0.044568):0.11,(MOUSE:0.072818,RAT:0.081244):0.260342):0.02326,((DOG:0.07,CAT:0.07):0.087381,(PIG:0.06,COW:0.06):0.104728):0.04);"
        self.tree2 = "((raccoon:19.19959,bear:6.80041):0.846,((sea_lion:11.997,seal:12.003):7.52973,((monkey:100.8593,cat:47.14069):20.59201,weasel:18.87953):2.0946):3.87382,dog:25.46154);"
        reader = NXNewick()
        self.mcTree1 = MultiCactusTree(
            reader.parseString(self.tree1, addImpliedRoots=False))
        self.mcTree1a = MultiCactusTree(
            reader.parseString(self.tree1, addImpliedRoots=False),
            subtreeSize=4)
        self.mcTree2 = MultiCactusTree(
            reader.parseString(self.tree2, addImpliedRoots=False),
            subtreeSize=3)
        fixtures = (self.mcTree1, self.mcTree1a, self.mcTree2)
        # Name every tree first, then compute every tree's subtree roots.
        for fixture in fixtures:
            fixture.nameUnlabeledInternalNodes()
        for fixture in fixtures:
            fixture.computeSubtreeRoots()
def testAddOutgroup(self):
    """Check addOutgroup on the full fixture tree and on a one-leaf tree."""
    expectedFull = "((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc3:0.11,(MOUSE:0.072818,RAT:0.081244)Anc4:0.260342)Anc1:0.02326,((DOG:0.07,CAT:0.07)Anc5:0.087381,(PIG:0.06,COW:0.06)Anc6:0.104728)Anc2:0.04,outgroup:1.7)Anc0;"
    fullTree = MultiCactusTree(self.mcTree1)
    fullTree.nameUnlabeledInternalNodes()
    fullTree.computeSubtreeRoots()
    fullTree.addOutgroup("outgroup", 1.7)
    self.assertEqual(NXNewick().writeString(fullTree), expectedFull)

    # Single-leaf input: the expected string below has the requested
    # distance of 2.2 appearing as 1.1 on each of the two branches.
    expectedLeaf = "(A:1.1,outgroup:1.1);"
    singleLeaf = MultiCactusTree(
        NXNewick().parseString("A;", addImpliedRoots=False))
    singleLeaf.nameUnlabeledInternalNodes()
    singleLeaf.computeSubtreeRoots()
    singleLeaf.addOutgroup("outgroup", 2.2)
    self.assertEqual(NXNewick().writeString(singleLeaf), expectedLeaf)
def createMCProject(tree, experiment, config, options):
    """Build a MultiCactusProject for the given tree and experiment.

    The tree is wrapped in a MultiCactusTree, unlabeled internal nodes are
    named with the config's default prefix and subtree roots are computed.
    The experiment's sequences and output sequence directory are copied
    onto the project, an experiment path is registered for every subtree
    root, outgroups are selected according to
    ``config.getOutgroupStrategy()`` and the project is finally passed to
    ``specifyAlignmentRoot``.

    Args:
        tree: tree handed to the MultiCactusTree constructor.
        experiment: object providing ``getSequences()`` and
            ``getOutputSequenceDir()``.
        config: object providing the getters used below (subtree size,
            internal node prefix, self alignment flag, outgroup settings).
        options: object with ``path``, ``root`` and ``outgroupNames``
            attributes.

    Returns:
        The populated MultiCactusProject.

    Raises:
        RuntimeError: if ``options.root`` cannot be resolved to a node, or
            if the outgroup strategy is not one of 'greedy', 'greedyLeaves',
            'greedyPreference', 'dynamic' or 'none'.
    """
    mcTree = MultiCactusTree(tree, config.getSubtreeSize())
    mcTree.nameUnlabeledInternalNodes(config.getDefaultInternalNodePrefix())
    mcTree.computeSubtreeRoots()
    mcProj = MultiCactusProject()
    mcProj.mcTree = mcTree
    # Copy the list so later changes to the experiment do not leak in.
    mcProj.inputSequences = experiment.getSequences()[:]
    mcProj.outputSequenceDir = experiment.getOutputSequenceDir()
    if config.getDoSelfAlignment():
        mcTree.addSelfEdges()
    for name in mcProj.mcTree.getSubtreeRootNames():
        expPath = "%s/%s/%s_experiment.xml" % (options.path, name, name)
        mcProj.expMap[name] = os.path.abspath(expPath)
    alignmentRootId = mcProj.mcTree.getRootId()
    if options.root is not None:
        try:
            alignmentRootId = mcProj.mcTree.getNodeId(options.root)
        except Exception:
            # Any lookup failure is reported as an unknown root name.
            raise RuntimeError("Specified root name %s not found in tree" %
                               options.root)
    mcProj.outgroup = None
    strategy = config.getOutgroupStrategy()
    if strategy == 'greedy':
        # use the provided outgroup candidates, or use all outgroups
        # as candidates if none are given
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        mcProj.outgroup.greedy(
            threshold=config.getOutgroupThreshold(),
            candidateSet=options.outgroupNames,
            candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
            maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy == 'greedyLeaves':
        # use all leaves as outgroups, unless outgroup candidates are given
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        ogSet = options.outgroupNames
        if ogSet is None:
            ogSet = set(
                [mcProj.mcTree.getName(x) for x in mcProj.mcTree.getLeaves()])
        mcProj.outgroup.greedy(threshold=config.getOutgroupThreshold(),
                               candidateSet=ogSet,
                               candidateChildFrac=2.0,
                               maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy == 'greedyPreference':
        # prefer the provided outgroup candidates, if any, but use
        # other nodes as "filler" if we can't find enough.
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        mcProj.outgroup.greedy(
            threshold=config.getOutgroupThreshold(),
            candidateSet=options.outgroupNames,
            candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
            maxNumOutgroups=config.getMaxNumOutgroups())
        mcProj.outgroup.greedy(
            threshold=config.getOutgroupThreshold(),
            candidateSet=None,
            candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
            maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy == 'dynamic':
        # dynamic programming algorithm that exactly optimizes probability
        # that base in target node aligns to at least one base in the
        # outgroup set. Caveats are that it only returns leaves, and
        # the model used for optimization is super naive. Still, it does
        # some things better than greedy approaches such as properly account
        # for phylogenetic redundancy, as well as try to factor assembly
        # size/quality automatically.
        mcProj.outgroup = DynamicOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree,
                                   mcProj.getInputSequenceMap(),
                                   alignmentRootId,
                                   candidateSet=options.outgroupNames)
        mcProj.outgroup.compute(maxNumOutgroups=config.getMaxNumOutgroups())
    elif strategy != 'none':
        raise RuntimeError("Could not understand outgroup strategy %s" %
                           strategy)

    # if necessary, we reroot the tree at the specified alignment root id.
    # all leaf genomes that are no longer in the tree, but still used as
    # outgroups, are moved into special fields so that we can remember to,
    # say, get their paths for preprocessing.
    specifyAlignmentRoot(mcProj, alignmentRootId)
    return mcProj
def main():
    """Command-line entry point: align assemblies to a reference with Toil.

    Reads the options, optionally rewrites the seqfile with path overrides,
    imports the assemblies, starts (or restarts) the alignment workflow,
    optionally exports the per-assembly mapping files for debugging, and
    exports the primary and secondary alignment outputs.

    Raises:
        RuntimeError: if only one of --pathOverrides / --pathOverrideNames
            is given, or they have different lengths.
    """
    options = get_options()

    with Toil(options) as workflow:
        setupBinaries(options)
        importSingularityImage(options)

        ## Preprocessing:
        overrides = options.pathOverrides
        overrideNames = options.pathOverrideNames
        if overrides or overrideNames:
            consistent = (bool(overrides) and bool(overrideNames)
                          and len(overrideNames) == len(overrides))
            if not consistent:
                raise RuntimeError(
                    'same number of values must be passed to --pathOverrides and --pathOverrideNames'
                )

        # apply path overrides. this was necessary for wdl which doesn't
        # take kindly to text files of local paths (ie seqfile). one way
        # to fix would be to add support for s3 paths and force wdl to use
        # it. a better way would be a more fundamental interface shift
        # away from files of paths throughout all of cactus
        if options.pathOverrides:
            seqFile = SeqFile(options.seqFile)
            config = ConfigWrapper(ET.parse(options.configFile).getroot())
            tree = MultiCactusTree(seqFile.tree)
            tree.nameUnlabeledInternalNodes(
                prefix=config.getDefaultInternalNodePrefix())
            for genome, newPath in zip(options.pathOverrideNames,
                                       options.pathOverrides):
                seqFile.pathMap[genome] = newPath
            overriddenSeqFilePath = os.path.join(options.cactusDir,
                                                 'seqFile.override')
            with open(overriddenSeqFilePath, 'w') as handle:
                handle.write(str(seqFile))
            options.seqFile = overriddenSeqFilePath

        # Import asms; by default, prepends unique IDs in the technique
        # used in cactus-blast.
        asms = get_asms_from_seqfile(options.seqFile, workflow)

        ## Perform alignments:
        if workflow.options.restart:
            alignments = workflow.restart()
        else:
            alignJob = Job.wrapJobFn(run_cactus_reference_align, asms,
                                     options.refID, options.debug_export,
                                     options.dipcall_bed_filter,
                                     options.dipcall_vcf_filter)
            alignments = workflow.start(alignJob)

        if options.debug_export:
            # first, ensure the debug dir exists.
            if not os.path.isdir(options.debug_export_dir):
                os.mkdir(options.debug_export_dir)

            print(alignments)
            # Then return value is: (all_primary, all_secondary,
            # ref_mappings, primary_mappings, secondary_mappings)
            # NOTE(review): the files below are exported relative to the
            # current working directory, not into debug_export_dir --
            # confirm this is intended.
            debugExports = ((2, ".paf"), (3, ".cigar"),
                            (4, ".cigar.secondry"))
            for position, suffix in debugExports:
                for asm, mapping_file in alignments[position].items():
                    target = os.path.abspath("mappings_for_" + asm + suffix)
                    workflow.exportFile(mapping_file, 'file://' + target)

        ## Save alignments:
        if options.dipcall_vcf_filter:
            # this is substantially less restrictive than the
            # dipcall_bed_filter.
            primary = workflow.start(
                Job.wrapJobFn(apply_dipcall_vcf_filter, alignments[0]))
            secondarySuffix = ".unfiltered.secondary"
        else:
            primary = alignments[0]
            secondarySuffix = ".secondary"
        workflow.exportFile(primary, makeURL(options.outputFile))
        workflow.exportFile(alignments[1],
                            makeURL(options.outputFile + secondarySuffix))
class MultiCactusProject:
    """In-memory form of a multi-cactus project XML file.

    Holds the MultiCactusTree, the experiment file registered for each
    subtree root, and the input/output sequence bookkeeping that is
    stored as attributes on the XML root element.
    """

    def __init__(self):
        self.mcTree = None
        # subtree root name -> experiment XML path
        self.expMap = dict()
        # subtree root name -> experiment ID (filled by readXML /
        # syncToFileStore)
        self.expIDMap = None
        self.inputSequences = []
        self.inputSequenceIDs = None
        # leaf name -> output sequence ID
        self.outputSequenceIDMap = None
        self.configID = None

    def readXML(self, path):
        """Load the project from the XML file at ``path``."""
        xmlRoot = ET.parse(path).getroot()
        treeElem = xmlRoot.find("tree")
        self.mcTree = MultiCactusTree(NXNewick().parseString(
            treeElem.text, addImpliedRoots=False))
        self.expMap = dict()
        self.expIDMap = dict()
        cactusPathElemList = xmlRoot.findall("cactus")
        for cactusPathElem in cactusPathElemList:
            nameElem = cactusPathElem.attrib["name"]
            pathElem = cactusPathElem.attrib["experiment_path"]
            self.expMap[nameElem] = pathElem
            if "experiment_id" in cactusPathElem.attrib:
                self.expIDMap[nameElem] = cactusPathElem.attrib[
                    "experiment_id"]
        self.inputSequences = xmlRoot.attrib["inputSequences"].split()
        if "inputSequenceIDs" in xmlRoot.attrib:
            self.inputSequenceIDs = xmlRoot.attrib["inputSequenceIDs"].split()
        if "outputSequenceIDs" in xmlRoot.attrib:
            # Keys are names and values are IDs, matching what writeXML
            # emits (keys -> outputSequenceNames, values ->
            # outputSequenceIDs) and what setOutputSequenceIDs builds.
            self.outputSequenceIDMap = dict(
                zip(xmlRoot.attrib["outputSequenceNames"].split(),
                    xmlRoot.attrib["outputSequenceIDs"].split()))

        logger.info("xmlRoot = %s" % ET.tostring(xmlRoot))
        if "configID" in xmlRoot.attrib:
            self.configID = xmlRoot.attrib["configID"]

        self.mcTree.assignSubtreeRootNames(self.expMap)

    def writeXML(self, path):
        """Serialise the project as pretty-printed XML to ``path``."""
        xmlRoot = ET.Element("multi_cactus")
        treeElem = ET.Element("tree")
        treeElem.text = NXNewick().writeString(self.mcTree)
        xmlRoot.append(treeElem)
        for name, expPath in self.expMap.items():
            cactusPathElem = ET.Element("cactus")
            cactusPathElem.attrib["name"] = name
            cactusPathElem.attrib["experiment_path"] = expPath
            if self.expIDMap:
                cactusPathElem.attrib["experiment_id"] = self.expIDMap[name]
            xmlRoot.append(cactusPathElem)
        # We keep track of all the input sequences at the top level
        xmlRoot.attrib["inputSequences"] = " ".join(self.inputSequences)
        if self.inputSequenceIDs:
            xmlRoot.attrib["inputSequenceIDs"] = " ".join(
                self.inputSequenceIDs)
        if self.outputSequenceIDMap:
            xmlRoot.attrib["outputSequenceIDs"] = " ".join(
                self.outputSequenceIDMap.values())
            xmlRoot.attrib["outputSequenceNames"] = " ".join(
                self.outputSequenceIDMap.keys())
        if self.configID:
            xmlRoot.attrib["configID"] = self.configID
        # Serialise before opening the file so a failure here does not
        # truncate an existing file, and use "with" so the handle is
        # closed even if the write fails.
        xmlString = ET.tostring(xmlRoot)
        xmlString = minidom.parseString(xmlString).toprettyxml()
        with open(path, "w") as xmlFile:
            xmlFile.write(xmlString)

    def syncToFileStore(self, toil):
        """Import every experiment's config, constraints file (if any) and
        experiment XML through ``toil.importFile``.

        The IDs returned for the config and constraints are written back
        into each experiment XML; the ID returned for the experiment XML
        itself is stored in ``self.expIDMap``.
        """
        self.expIDMap = dict()
        for name, expPath in self.expMap.items():
            expWrapper = ExperimentWrapper(ET.parse(expPath).getroot())
            expWrapper.setConfigID(
                toil.importFile("file://" + expWrapper.getConfig()))
            if expWrapper.getConstraintsFilePath():
                expWrapper.setConstraintsID(
                    toil.importFile("file://" +
                                    expWrapper.getConstraintsFilePath()))
            expWrapper.writeXML(expPath)
            self.expIDMap[name] = toil.importFile("file://" + expPath)

    def getInputSequenceIDMap(self):
        """Return a map between event names and sequence IDs.

        IDs are assigned to leaves in post-order, so ``inputSequenceIDs``
        must list exactly one ID per leaf in that order.
        """
        inputSequenceMap = dict()
        i = 0
        for node in self.mcTree.postOrderTraversal():
            if self.mcTree.isLeaf(node) is True:
                inputSequenceMap[self.mcTree.getName(node)] = \
                    self.inputSequenceIDs[i]
                i += 1
        assert i == len(self.inputSequenceIDs)
        return inputSequenceMap

    def getInputSequenceIDs(self):
        """Get the set of input sequences for the multicactus tree
        """
        return self.inputSequenceIDs

    def getInputSequencePaths(self):
        """Return the list of input sequence paths."""
        return self.inputSequences

    def setOutputSequenceIDs(self, outputSequenceIDs):
        """Store ``outputSequenceIDs`` as a leaf-name -> ID map.

        IDs are matched to leaves in post-order; there must be exactly one
        ID per leaf.
        """
        self.outputSequenceIDMap = dict()
        i = 0
        for node in self.mcTree.postOrderTraversal():
            if self.mcTree.isLeaf(node) is True:
                self.outputSequenceIDMap[self.mcTree.getName(node)] = \
                    outputSequenceIDs[i]
                i += 1
        assert i == len(outputSequenceIDs)

    def getOutputSequenceIDMap(self):
        """Return the leaf-name -> output sequence ID map (or None)."""
        return self.outputSequenceIDMap

    def getConfigPath(self):
        """Return the config path recorded in the first experiment file.

        Raises:
            IndexError: if no experiments are registered.
        """
        # list(...) because dict.values() is not subscriptable on
        # Python 3.
        firstExpPath = list(self.expMap.values())[0]
        return ExperimentWrapper(
            ET.parse(firstExpPath).getroot()).getConfigPath()

    def setConfigID(self, configID):
        """Set the config ID."""
        self.configID = configID

    def getConfigID(self):
        """Return the config ID (or None)."""
        return self.configID

    def setInputSequenceIDs(self, inputSequenceIDs):
        """Set the list of input sequence IDs."""
        self.inputSequenceIDs = inputSequenceIDs
class MultiCactusProject:
    """In-memory form of a multi-cactus project XML file.

    Holds the MultiCactusTree, the experiment file registered for each
    subtree root, and name-keyed maps for input sequences, input sequence
    IDs and output sequence IDs that are stored as attributes on the XML
    root element.
    """

    def __init__(self):
        self.mcTree = None
        # subtree root name -> experiment XML path
        self.expMap = dict()
        # subtree root name -> experiment ID (filled by readXML /
        # syncToFileStore)
        self.expIDMap = None
        # name -> input sequence
        self.inputSequenceMap = {}
        # name -> input sequence ID
        self.inputSequenceIDMap = {}
        # name -> output sequence ID
        self.outputSequenceIDMap = {}
        self.configID = None

    def readXML(self, path):
        """Load the project from the XML file at ``path``."""
        xmlRoot = ET.parse(path).getroot()
        treeElem = xmlRoot.find("tree")
        self.mcTree = MultiCactusTree(NXNewick().parseString(
            treeElem.text, addImpliedRoots=False))
        self.expMap = dict()
        self.expIDMap = dict()
        cactusPathElemList = xmlRoot.findall("cactus")
        for cactusPathElem in cactusPathElemList:
            nameElem = cactusPathElem.attrib["name"]
            pathElem = cactusPathElem.attrib["experiment_path"]
            self.expMap[nameElem] = pathElem
            if "experiment_id" in cactusPathElem.attrib:
                self.expIDMap[nameElem] = cactusPathElem.attrib[
                    "experiment_id"]
        self.inputSequenceMap = dict(
            zip(xmlRoot.attrib["inputSequenceNames"].split(),
                xmlRoot.attrib["inputSequences"].split()))
        if "inputSequenceIDs" in xmlRoot.attrib:
            self.inputSequenceIDMap = dict(
                zip(xmlRoot.attrib["inputSequenceIDNames"].split(),
                    xmlRoot.attrib["inputSequenceIDs"].split()))
        if "outputSequenceIDs" in xmlRoot.attrib:
            self.outputSequenceIDMap = dict(
                zip(xmlRoot.attrib["outputSequenceNames"].split(),
                    xmlRoot.attrib["outputSequenceIDs"].split()))

        logger.info("xmlRoot = %s" % ET.tostring(xmlRoot))
        if "configID" in xmlRoot.attrib:
            self.configID = xmlRoot.attrib["configID"]

        self.mcTree.assignSubtreeRootNames(self.expMap)

    def writeXML(self, path):
        """Serialise the project as pretty-printed XML to ``path``."""
        xmlRoot = ET.Element("multi_cactus")
        treeElem = ET.Element("tree")
        treeElem.text = NXNewick().writeString(self.mcTree)
        xmlRoot.append(treeElem)
        for name, expPath in self.expMap.items():
            cactusPathElem = ET.Element("cactus")
            cactusPathElem.attrib["name"] = name
            cactusPathElem.attrib["experiment_path"] = expPath
            if self.expIDMap:
                cactusPathElem.attrib["experiment_id"] = self.expIDMap[name]
            xmlRoot.append(cactusPathElem)
        # We keep track of all the input sequences at the top level
        xmlRoot.attrib["inputSequences"] = " ".join(
            self.inputSequenceMap.values())
        xmlRoot.attrib["inputSequenceNames"] = " ".join(
            self.inputSequenceMap.keys())
        if self.inputSequenceIDMap:
            xmlRoot.attrib["inputSequenceIDs"] = " ".join(
                self.inputSequenceIDMap.values())
            xmlRoot.attrib["inputSequenceIDNames"] = " ".join(
                self.inputSequenceIDMap.keys())
        if self.outputSequenceIDMap:
            xmlRoot.attrib["outputSequenceIDs"] = " ".join(
                self.outputSequenceIDMap.values())
            xmlRoot.attrib["outputSequenceNames"] = " ".join(
                self.outputSequenceIDMap.keys())
        if self.configID:
            xmlRoot.attrib["configID"] = self.configID
        # Serialise before opening the file so a failure here does not
        # truncate an existing file, and use "with" so the handle is
        # closed even if the write fails.
        xmlString = ET.tostring(xmlRoot)
        xmlString = minidom.parseString(xmlString).toprettyxml()
        with open(path, "w") as xmlFile:
            xmlFile.write(xmlString)

    def syncToFileStore(self, toil):
        """Import every experiment's config and experiment XML through
        ``toil.importFile``.

        The ID returned for the config is written back into each
        experiment XML; the ID returned for the experiment XML itself is
        stored in ``self.expIDMap``.
        """
        self.expIDMap = dict()
        for name, expPath in self.expMap.items():
            expWrapper = ExperimentWrapper(ET.parse(expPath).getroot())
            expWrapper.setConfigID(
                toil.importFile("file://" + expWrapper.getConfigPath()))
            expWrapper.writeXML(expPath)
            self.expIDMap[name] = toil.importFile("file://" + expPath)

    def getConfigPath(self):
        """Return the config path recorded in the first experiment file.

        Raises:
            IndexError: if no experiments are registered.
        """
        # list(...) because dict.values() is not subscriptable on
        # Python 3.
        firstExpPath = list(self.expMap.values())[0]
        return ExperimentWrapper(
            ET.parse(firstExpPath).getroot()).getConfigPath()

    def setConfigID(self, configID):
        """Set the config ID."""
        self.configID = configID

    def getConfigID(self):
        """Return the config ID (or None)."""
        return self.configID
def setUp(self):
    """Create the temp directory, FASTA files and trees used by the tests.

    Fixtures built here: ``self.mcTrees`` (small random trees plus the two
    fixed trees), ``self.dummySeqMaps`` (one per small random tree, every
    node mapped to the same dummy FASTA), ``self.borMcTree``,
    ``self.backboneTree`` and ``self.blanchetteSeqMap``.
    """
    unittest.TestCase.setUp(self)
    self.trees = randomTreeSet()
    self.mcTrees = []
    self.tempDir = getTempDirectory(os.getcwd())
    self.tempFa = os.path.join(self.tempDir, "seq.fa")
    with open(self.tempFa, "w") as faHandle:
        faHandle.write(">temp\nNNNNNNNCNNNNAAAAAAAAAAAAAAANNNNNNN\n")

    self.dummySeqMaps = []
    for randomTree in self.trees:
        # only random trees with fewer than 50 nodes are used
        if not randomTree.size() < 50:
            continue
        wrapped = MultiCactusTree(randomTree)
        nodeToFasta = dict()
        for nodeId in wrapped.breadthFirstTraversal():
            label = "Node%s" % str(nodeId)
            wrapped.setName(nodeId, label)
            nodeToFasta[label] = self.tempFa
        wrapped.computeSubtreeRoots()
        wrapped.nameUnlabeledInternalNodes()
        self.mcTrees.append(wrapped)
        self.dummySeqMaps.append(nodeToFasta)

    # Boreoeutherian tree
    borNewick = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
    borParsed = NXNewick().parseString(borNewick, addImpliedRoots=False)
    self.borMcTree = MultiCactusTree(borParsed)
    self.borMcTree.computeSubtreeRoots()
    self.borMcTree.nameUnlabeledInternalNodes()
    self.mcTrees.append(self.borMcTree)

    # Eutherian backbone tree
    backboneNewick = '(((((((((((Homo_sapiens:0.00655,Pan_troglodytes:0.00684):0.00422,Gorilla_gorilla_gorilla:0.008964):0.009693,Pongo_abelii:0.01894):0.015511,Macaca_mulatta:0.043601):0.08444,Aotus_nancymaae:0.08):0.08,Microcebus_murinus:0.10612):0.043494,Galeopterus_variegatus:0.134937):0.04,((((Jaculus_jaculus:0.1,(Microtus_ochrogaster:0.14,(Mus_musculus:0.084509,Rattus_norvegicus:0.091589):0.047773):0.06015):0.122992,(Heterocephalus_glaber:0.1,(Cavia_porcellus:0.065629,(Chinchilla_lanigera:0.06,Octodon_degus:0.1):0.06):0.05):0.06015):0.05,Marmota_marmota:0.1):0.05,Oryctolagus_cuniculus:0.21569):0.04):0.040593,(((Sus_scrofa:0.12,(Orcinus_orca:0.069688,(Bos_taurus:0.04,Capra_hircus:0.04):0.09):0.045488):0.02,((Equus_caballus:0.109397,(Felis_catus:0.098612,(Canis_lupus_familiaris:0.052458,Mustela_putorius_furo:0.08):0.02):0.049845):0.02,(Pteropus_alecto:0.1,Eptesicus_fuscus:0.08):0.033706):0.03):0.025,Erinaceus_europaeus:0.278178):0.021227):0.023664,(((Loxodonta_africana:0.022242,Procavia_capensis:0.145358):0.076687,Chrysochloris_asiatica:0.04):0.05,Dasypus_novemcinctus:0.169809):0.02)backbone_root:0.234728,(Monodelphis_domestica:0.125686,Sarcophilus_harrisii:0.12):0.2151);'
    backboneParsed = NXNewick().parseString(backboneNewick,
                                            addImpliedRoots=False)
    self.backboneTree = MultiCactusTree(backboneParsed)
    self.backboneTree.computeSubtreeRoots()
    self.backboneTree.nameUnlabeledInternalNodes()
    self.mcTrees.append(self.backboneTree)

    # One all-'A' FASTA per species, of the listed length.
    seqLens = {
        "HUMAN": 57553,
        "CHIMP": 57344,
        "BABOON": 58960,
        "MOUSE": 32750,
        "RAT": 38436,
        "DOG": 54187,
        "CAT": 50283,
        "PIG": 54843,
        "COW": 55508,
    }
    self.blanchetteSeqMap = dict()
    for species, length in seqLens.items():
        fastaPath = os.path.join(self.tempDir, species + ".fa")
        with open(fastaPath, "w") as out:
            out.write(">%s\n" % species)
            out.write('A' * length)
            out.write('\n')
        self.blanchetteSeqMap[species] = fastaPath
class TestCase(unittest.TestCase):
    """Tests of GreedyOutgroup and DynamicOutgroup outgroup assignment.

    The fixtures are a set of small random trees, a fixed boreoeutherian
    tree (``borMcTree``) and a fixed eutherian backbone tree
    (``backboneTree``).
    """

    def setUp(self):
        """Create the temp directory, FASTA files and trees used by the tests."""
        unittest.TestCase.setUp(self)
        self.trees = randomTreeSet()
        self.mcTrees = []
        self.tempDir = getTempDirectory(os.getcwd())
        self.tempFa = os.path.join(self.tempDir, "seq.fa")
        with open(self.tempFa, "w") as f:
            f.write(">temp\nNNNNNNNCNNNNAAAAAAAAAAAAAAANNNNNNN\n")
        self.dummySeqMaps = []
        for tree in self.trees:
            if tree.size() < 50:
                mcTree = MultiCactusTree(tree)
                seqMap = dict()
                for i in mcTree.breadthFirstTraversal():
                    mcTree.setName(i, "Node%s" % str(i))
                    seqMap["Node%s" % str(i)] = self.tempFa
                mcTree.computeSubtreeRoots()
                mcTree.nameUnlabeledInternalNodes()
                self.mcTrees.append(mcTree)
                self.dummySeqMaps.append(seqMap)

        # Boreoeutherian tree
        borTree = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc6:0.11,(MOUSE:0.072818,RAT:0.081244)Anc5:0.260342)Anc4:0.023260,((DOG:0.07,CAT:0.07)Anc3:0.087381,(PIG:0.06,COW:0.06)Anc2:0.104728)Anc1:0.04)Anc0;'
        self.borMcTree = MultiCactusTree(NXNewick().parseString(
            borTree, addImpliedRoots=False))
        self.borMcTree.computeSubtreeRoots()
        self.borMcTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.borMcTree)

        # Eutherian backbone tree
        backbone = '(((((((((((Homo_sapiens:0.00655,Pan_troglodytes:0.00684):0.00422,Gorilla_gorilla_gorilla:0.008964):0.009693,Pongo_abelii:0.01894):0.015511,Macaca_mulatta:0.043601):0.08444,Aotus_nancymaae:0.08):0.08,Microcebus_murinus:0.10612):0.043494,Galeopterus_variegatus:0.134937):0.04,((((Jaculus_jaculus:0.1,(Microtus_ochrogaster:0.14,(Mus_musculus:0.084509,Rattus_norvegicus:0.091589):0.047773):0.06015):0.122992,(Heterocephalus_glaber:0.1,(Cavia_porcellus:0.065629,(Chinchilla_lanigera:0.06,Octodon_degus:0.1):0.06):0.05):0.06015):0.05,Marmota_marmota:0.1):0.05,Oryctolagus_cuniculus:0.21569):0.04):0.040593,(((Sus_scrofa:0.12,(Orcinus_orca:0.069688,(Bos_taurus:0.04,Capra_hircus:0.04):0.09):0.045488):0.02,((Equus_caballus:0.109397,(Felis_catus:0.098612,(Canis_lupus_familiaris:0.052458,Mustela_putorius_furo:0.08):0.02):0.049845):0.02,(Pteropus_alecto:0.1,Eptesicus_fuscus:0.08):0.033706):0.03):0.025,Erinaceus_europaeus:0.278178):0.021227):0.023664,(((Loxodonta_africana:0.022242,Procavia_capensis:0.145358):0.076687,Chrysochloris_asiatica:0.04):0.05,Dasypus_novemcinctus:0.169809):0.02)backbone_root:0.234728,(Monodelphis_domestica:0.125686,Sarcophilus_harrisii:0.12):0.2151);'
        self.backboneTree = MultiCactusTree(NXNewick().parseString(
            backbone, addImpliedRoots=False))
        self.backboneTree.computeSubtreeRoots()
        self.backboneTree.nameUnlabeledInternalNodes()
        self.mcTrees.append(self.backboneTree)

        # One all-'A' FASTA per species, of the listed length.
        seqLens = dict()
        seqLens["HUMAN"] = 57553
        seqLens["CHIMP"] = 57344
        seqLens["BABOON"] = 58960
        seqLens["MOUSE"] = 32750
        seqLens["RAT"] = 38436
        seqLens["DOG"] = 54187
        seqLens["CAT"] = 50283
        seqLens["PIG"] = 54843
        seqLens["COW"] = 55508
        self.blanchetteSeqMap = dict()
        for event, seqLen in seqLens.items():
            p = os.path.join(self.tempDir, event + ".fa")
            with open(p, "w") as f:
                f.write(">%s\n" % event)
                f.write(''.join(['A'] * seqLen))
                f.write('\n')
            self.blanchetteSeqMap[event] = p

    def tearDown(self):
        """Remove the temp directory created in setUp."""
        unittest.TestCase.tearDown(self)
        system("rm -rf %s" % self.tempDir)

    def testJustLeaves(self):
        """With only leaves as candidates, check the first outgroup of each ancestor."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(
            [self.borMcTree.getName(x) for x in self.borMcTree.getLeaves()])
        og.greedy(candidateSet=candidates, candidateChildFrac=2.)
        assert og.ogMap['Anc1'][0][0] == 'HUMAN'
        assert og.ogMap['Anc2'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc3'][0][0] in ['PIG', 'COW']
        assert og.ogMap['Anc4'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc5'][0][0] == 'HUMAN'
        assert og.ogMap['Anc6'][0][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc7'][0][0] == 'BABOON'

    def testHeightTable(self):
        """Make sure the height-table is calculated correctly."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        htable = og.heightTable()
        # assertEqual: the assertEquals alias is deprecated and was removed
        # in Python 3.12.
        self.assertEqual(htable[self.borMcTree.getNodeId('HUMAN')], 0)
        self.assertEqual(htable[self.borMcTree.getNodeId('PIG')], 0)
        self.assertEqual(htable[self.borMcTree.getNodeId('RAT')], 0)
        self.assertEqual(htable[self.borMcTree.getNodeId('Anc7')], 1)
        self.assertEqual(htable[self.borMcTree.getNodeId('Anc1')], 2)
        self.assertEqual(htable[self.borMcTree.getNodeId('Anc0')], 4)

    def testZeroThreshold(self):
        """A threshold of 0 should produce outgroup sets that cause no
        additional depth in the resulting schedule."""
        tree = self.backboneTree
        og = GreedyOutgroup()
        og.importTree(tree)
        og.greedy(candidateSet=set(['Homo_sapiens', 'Mus_musculus']),
                  threshold=0, maxNumOutgroups=3, candidateChildFrac=0.75)
        og.greedy(threshold=0, maxNumOutgroups=3, candidateChildFrac=0.75)
        htable = og.heightTable()
        for node, outgroups in og.ogMap.items():
            for outgroup, _ in outgroups:
                # For the outgroup assignment to create no
                # additional dependencies, each outgroup must have
                # a height lower than the node it's outgroup to
                # (or be a leaf)
                self.assertTrue(htable[tree.getNodeId(outgroup)] < htable[tree.getNodeId(node)] \
                                or htable[tree.getNodeId(outgroup)] == 0)

    def testCandidates(self):
        """Check first outgroups for a fixed candidate set at two child fractions."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(['HUMAN', 'CHIMP', 'RAT'])
        og.greedy(candidateSet=candidates, candidateChildFrac=0.5)
        assert og.ogMap['Anc1'][0][0] == 'Anc4'
        assert og.ogMap['Anc2'][0][0] == 'Anc4'
        assert og.ogMap['Anc3'][0][0] == 'Anc4'
        assert 'Anc4' not in og.ogMap
        assert og.ogMap['Anc5'][0][0] in ['HUMAN', 'CHIMP', 'Anc6', 'Anc7']
        assert og.ogMap['Anc6'][0][0] in ['Anc5', 'MOUSE', 'RAT']
        assert og.ogMap['Anc7'][0][0] in ['Anc5', 'MOUSE', 'RAT']

        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(['HUMAN', 'CHIMP', 'RAT'])
        og.greedy(candidateSet=candidates, candidateChildFrac=1.0)
        assert og.ogMap['Anc1'][0][0] == 'Anc7'
        assert og.ogMap['Anc2'][0][0] == 'Anc7'
        assert og.ogMap['Anc3'][0][0] == 'Anc7'
        assert 'Anc4' not in og.ogMap
        assert og.ogMap['Anc5'][0][0] in ['HUMAN', 'CHIMP', 'Anc7']
        assert og.ogMap['Anc6'][0][0] == 'RAT'
        assert og.ogMap['Anc7'][0][0] == 'RAT'

    def testGeneralBetterThanLeaves(self):
        """An unrestricted run must be at least as close as a leaves-only run."""
        for tree in self.mcTrees:
            og1 = GreedyOutgroup()
            og1.importTree(tree)
            candidates = set([tree.getName(x) for x in tree.getLeaves()])
            og1.greedy(candidateSet=candidates, candidateChildFrac=2.)
            og2 = GreedyOutgroup()
            og2.importTree(tree)
            og2.greedy(candidateSet=None)

            for i in og1.ogMap:
                assert i in og2.ogMap
                dist1 = og1.ogMap[i][0][1]
                dist2 = og2.ogMap[i][0][1]
                assert dist2 <= dist1

    def testGeneralConstrainedBetterThanLeaves(self):
        """Same comparison as above, with threshold=2 on the unrestricted run."""
        for tree in self.mcTrees:
            og1 = GreedyOutgroup()
            og1.importTree(tree)
            candidates = set([tree.getName(x) for x in tree.getLeaves()])
            og1.greedy(candidateSet=candidates, candidateChildFrac=2.)
            og2 = GreedyOutgroup()
            og2.importTree(tree)
            og2.greedy(candidateSet=None, threshold=2)

            for i in og1.ogMap:
                assert i in og2.ogMap
                dist1 = og1.ogMap[i][0][1]
                dist2 = og2.ogMap[i][0][1]
                assert dist2 <= dist1

    def testMultipleOutgroups(self):
        """Check count, ordering and identity of outgroups with maxNumOutgroups=3."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        og.greedy(candidateChildFrac=0.5, maxNumOutgroups=3)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
        # and for all entries, the closest must be first.
        assert all(
            map(lambda x: x == sorted(x, key=itemgetter(1)),
                og.ogMap.values()))
        # ordering is important!
        # map() returns an iterator on Python 3, which never compares equal
        # to a list, so materialize it before comparing.
        assert list(map(itemgetter(0), og.ogMap['Anc4'])) == ['Anc1']
        assert list(map(itemgetter(0), og.ogMap['Anc7'])) == ['BABOON', 'Anc1', 'Anc5']
        # We avoid cycles, and choose post-order first, so this only
        # uses leaves.
        assert list(map(itemgetter(0), og.ogMap['Anc1'])) == ['HUMAN', 'CHIMP', 'BABOON']

    def testMultipleOutgroupsJustLeaves(self):
        """As testMultipleOutgroups, but with only leaves as candidates."""
        og = GreedyOutgroup()
        og.importTree(self.borMcTree)
        candidates = set(
            [self.borMcTree.getName(x) for x in self.borMcTree.getLeaves()])
        og.greedy(candidateSet=candidates, candidateChildFrac=2.,
                  maxNumOutgroups=3)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
        # and for all entries, the closest must be first.
        assert all(
            map(lambda x: x == sorted(x, key=itemgetter(1)),
                og.ogMap.values()))
        # ordering is important!
        # list(...) for the same Python 3 reason as in testMultipleOutgroups.
        assert list(map(itemgetter(0), og.ogMap['Anc1'])) == ['HUMAN', 'CHIMP', 'BABOON']
        assert og.ogMap['Anc7'][0][0] == 'BABOON'
        assert og.ogMap['Anc7'][1][0] in ['CAT', 'DOG']
        assert og.ogMap['Anc7'][2][0] in ['CAT', 'DOG']

    def testMultipleOutgroupsOnRandomTrees(self):
        """Count and ordering invariants hold on every fixture tree."""
        for tree in self.mcTrees:
            og = GreedyOutgroup()
            og.importTree(tree)
            og.greedy(candidateChildFrac=0.5, maxNumOutgroups=3)
            # make sure all entries have <= 3 outgroups.
            assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
            # and for all entries, the closest must be first.
            assert all(
                map(lambda x: x == sorted(x, key=itemgetter(1)),
                    og.ogMap.values()))

    def testDynamicOutgroupsOnRandomTrees(self):
        """DynamicOutgroup invariants on the random trees of degree < 8."""
        for tree, seqMap in zip(self.mcTrees, self.dummySeqMaps):
            degree = max([
                len(tree.getChildren(x))
                for x in tree.breadthFirstTraversal()
            ])
            if degree < 8:
                og = DynamicOutgroup()
                og.edgeLen = 5
                og.importTree(tree, seqMap)
                og.compute(maxNumOutgroups=3)
                # make sure all entries have <= 3 outgroups.
                assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
                # and for all entries, the closest must be first.
                # (this will be true because all sequences are the same)
                assert all(
                    map(lambda x: x == sorted(x, key=itemgetter(1)),
                        og.ogMap.values()))

    def testDynamicOutgroupsJustLeaves(self):
        """DynamicOutgroup on the boreoeutherian tree, with and without sequence-loss weight."""
        og = DynamicOutgroup()
        og.importTree(self.borMcTree, self.blanchetteSeqMap)
        og.compute(maxNumOutgroups=3, sequenceLossWeight=0.)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
        # and for all entries, the closest must be first.
        assert all(
            map(lambda x: x == sorted(x, key=itemgetter(1)),
                og.ogMap.values()))
        # ordering is important!
        assert og.ogMap['Anc1'][0][0] == 'HUMAN'
        assert og.ogMap['Anc7'][0][0] == 'BABOON'

        og = DynamicOutgroup()
        og.importTree(self.borMcTree, self.blanchetteSeqMap)
        og.compute(maxNumOutgroups=3)
        # make sure all entries have <= 3 outgroups.
        assert all(map(lambda x: len(x) <= 3, og.ogMap.values()))
        # we keep dynamic outgroups sorted by distance too
        assert all(
            map(lambda x: x == sorted(x, key=itemgetter(1)),
                og.ogMap.values()))

    def testMultipleIdenticalRunsProduceSameResult(self):
        """The code now allows for multiple greedy() calls with different
        candidate sets, so that some outgroups can be 'preferred' over
        others without being the only candidates.
        Check that running greedy() multiple times with the same
        parameters gives the same result as running it once.
        """
        for tree in self.mcTrees:
            ogOnce = GreedyOutgroup()
            ogOnce.importTree(tree)
            ogOnce.greedy(maxNumOutgroups=3)
            ogMultipleTimes = GreedyOutgroup()
            ogMultipleTimes.importTree(tree)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            ogMultipleTimes.greedy(maxNumOutgroups=3)
            # make sure all entries have <= 3 outgroups.
            assert all(
                map(lambda x: len(x) <= 3, ogMultipleTimes.ogMap.values()))
            # and for all entries, the closest must be first.
            assert all(
                map(lambda x: x == sorted(x, key=itemgetter(1)),
                    ogMultipleTimes.ogMap.values()))
            # Check that the maps are equal. Can't compare them
            # directly since python will convert them to ordered
            # association lists.
            assert len(ogOnce.ogMap) == len(ogMultipleTimes.ogMap)
            for i in ogOnce.ogMap:
                assert i in ogMultipleTimes.ogMap
                assert ogOnce.ogMap[i] == ogMultipleTimes.ogMap[i]

    def testPreferredCandidateSets(self):
        """Test that running greedy() multiple times with different
        candidate sets will behave properly, i.e. keep all the
        existing outgroup assignments and fill in more on the second
        run."""
        for tree in self.mcTrees:
            ogOnce = GreedyOutgroup()
            ogOnce.importTree(tree)
            nodes = [j for j in tree.postOrderTraversal()]
            candidateSet = set([
                tree.getName(i)
                for i in random.sample(nodes, min(20, len(nodes)))
            ])
            ogOnce.greedy(candidateSet=candidateSet, maxNumOutgroups=3)
            ogTwice = GreedyOutgroup()
            ogTwice.importTree(tree)
            ogTwice.greedy(candidateSet=candidateSet, maxNumOutgroups=3)
            ogTwice.greedy(maxNumOutgroups=3)
            # make sure all entries have <= 3 outgroups.
            assert all(map(lambda x: len(x) <= 3, ogTwice.ogMap.values()))
            # and for all entries, the closest must be first.
            assert all(
                map(lambda x: x == sorted(x, key=itemgetter(1)),
                    ogTwice.ogMap.values()))
            for node in ogTwice.ogMap:
                if node in ogOnce.ogMap:
                    # the ogMap entry in ogOnce should be a subset of the ogMap entry for ogTwice
                    oneRunOutgroups = ogOnce.ogMap[node]
                    twoRunOutgroups = ogTwice.ogMap[node]
                    assert len(twoRunOutgroups) >= len(oneRunOutgroups)
                    for i in oneRunOutgroups:
                        assert i in twoRunOutgroups

    def testNoOutgroupIsADescendantOfAnother(self):
        """No two outgroups should be on the same path to the root."""
        for tree in self.mcTrees:
            tree.nameUnlabeledInternalNodes()
            og = GreedyOutgroup()
            og.importTree(tree)
            og.greedy(maxNumOutgroups=3)
            for source in og.ogMap:
                for (sink1, _) in og.ogMap[source]:
                    for (sink2, _) in og.ogMap[source]:
                        if sink1 != sink2:
                            sink1Id = tree.nameToId[sink1]
                            sink2Id = tree.nameToId[sink2]
                            assert sink1Id not in tree.postOrderTraversal(
                                sink2Id)
                            assert sink2Id not in tree.postOrderTraversal(
                                sink1Id)
def runCactusBlastOnly(options):
    """Run the trimming-blast phase for ``options.root`` and export its outputs.

    A fresh run builds the progressive-cactus project, imports the
    sequences of the root's children and its outgroups, and starts
    ``CactusTrimmingBlastPhase``.  In both the fresh and the restarted
    case the alignments, optional secondary alignments, outgroup fragments
    and ingroup coverage files are then exported next to
    ``options.outputFile``.
    """
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            # restart() returns the root job's return value, i.e. the same
            # object start() yields below; bind it to the name the export
            # code reads so a restarted run still exports its results.
            outWorkFlowArgs = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(options,
                                         options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            logger.info("Experiment {}".format(ET.tostring(expXml)))
            experiment = ExperimentWrapper(expXml)
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(
                options.root)
            leaves = tree.getChildNames(tree.getRootName())
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)
            logger.info("Genomes in blastonly, {}: {}".format(
                options.root, list(genome_set)))

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            # Iterate over a copy: out-of-scope entries are deleted below.
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in genome_set:
                    if os.path.isdir(seq):
                        # a directory of sequence files is concatenated
                        # into a single temp file before import
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    project.inputSequenceIDMap[genome] = toil.importFile(seq)
                else:
                    # out-of-scope sequences will only cause trouble later on
                    del project.inputSequenceMap[genome]

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            workFlowArgs = CactusWorkflowArguments(
                options,
                experimentFile=experimentFile,
                configNode=configNode,
                seqIDMap=project.inputSequenceIDMap)

            outWorkFlowArgs = toil.start(
                CactusTrimmingBlastPhase(standAlone=True,
                                         cactusWorkflowArguments=workFlowArgs,
                                         phaseName="trimBlast"))

        # export the alignments
        toil.exportFile(outWorkFlowArgs.alignmentsID,
                        makeURL(options.outputFile))
        # optional secondary alignments
        if outWorkFlowArgs.secondaryAlignmentsID:
            toil.exportFile(outWorkFlowArgs.secondaryAlignmentsID,
                            makeURL(options.outputFile) + '.secondary')
        # outgroup fragments and coverage are necessary for cactus-align, as the sequence names got changed in the above alignments
        for i, outgroupFragmentID in enumerate(
                outWorkFlowArgs.outgroupFragmentIDs):
            toil.exportFile(
                outgroupFragmentID,
                makeURL(options.outputFile) + '.og_fragment_{}'.format(i))
        # cactus-align can recompute coverage on the fly, but we save them because we have them
        for i, ingroupCoverageID in enumerate(
                outWorkFlowArgs.ingroupCoverageIDs):
            toil.exportFile(
                ingroupCoverageID,
                makeURL(options.outputFile) + '.ig_coverage_{}'.format(i))
def testAddOutgroup(self):
    """addOutgroup on a full tree and on a single-leaf tree gives the expected Newick."""
    # Case 1: add an outgroup to the fixture tree.
    fullTree = MultiCactusTree(self.mcTree1)
    fullTree.nameUnlabeledInternalNodes()
    fullTree.computeSubtreeRoots()
    fullTree.addOutgroup("outgroup", 1.7)
    expectedFull = '((((HUMAN:0.006969,CHIMP:0.009727)Anc7:0.025291,BABOON:0.044568)Anc3:0.11,(MOUSE:0.072818,RAT:0.081244)Anc4:0.260342)Anc1:0.02326,((DOG:0.07,CAT:0.07)Anc5:0.087381,(PIG:0.06,COW:0.06)Anc6:0.104728)Anc2:0.04,outgroup:1.7)Anc0;'
    self.assertEqual(NXNewick().writeString(fullTree), expectedFull)

    # Case 2: add an outgroup to a tree that is just the leaf "A".
    singleLeaf = MultiCactusTree(
        NXNewick().parseString("A;", addImpliedRoots=False))
    singleLeaf.nameUnlabeledInternalNodes()
    singleLeaf.computeSubtreeRoots()
    singleLeaf.addOutgroup("outgroup", 2.2)
    expectedLeaf = "(A:1.1,outgroup:1.1);"
    self.assertEqual(NXNewick().writeString(singleLeaf), expectedLeaf)
def runCactusAfterBlastOnly(options):
    """Run the alignment stage on previously computed blast output and export the HAL.

    A fresh run builds the progressive-cactus project, imports the
    outgroup fragments, input sequences and the files written next to
    ``options.blastOutput``, then starts ``run_cactus_align``.  In both the
    fresh and the restarted case the resulting HAL is exported to
    ``options.outputHal``.
    """
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            # restart() returns the root job's return value, i.e. the HAL
            # ID that start() yields below; bind it to the name the export
            # reads so a restarted run still exports its result.
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(options,
                                         options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            experiment = ExperimentWrapper(expXml)
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(
                options.root)
            leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
            outgroups = experiment.getOutgroupGenomes()

            # import the outgroups
            outgroupIDs = []
            cactus_blast_input = not options.nonBlastInput
            for i, outgroup in enumerate(outgroups):
                try:
                    outgroupID = toil.importFile(
                        makeURL(options.blastOutput) +
                        '.og_fragment_{}'.format(i))
                    outgroupIDs.append(outgroupID)
                    experiment.setSequenceID(outgroup, outgroupID)
                except Exception:
                    # Exception rather than a bare except, so that
                    # KeyboardInterrupt/SystemExit are never swallowed.
                    if cactus_blast_input:
                        raise
                    # we assume that input is not coming from cactus blast, so we'll treat output
                    # sequences normally and not go looking for fragments
                    outgroupIDs = []
                    break

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in leaves or (not cactus_blast_input
                                        and genome in outgroups):
                    if os.path.isdir(seq):
                        # a directory of sequence files is concatenated
                        # into a single temp file before import
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    experiment.setSequenceID(genome, toil.importFile(seq))

            if not cactus_blast_input:
                outgroupIDs = [
                    experiment.getSequenceID(outgroup)
                    for outgroup in outgroups
                ]

            # write back the experiment, as CactusWorkflowArguments wants a path
            experiment.writeXML(experimentFile)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            workFlowArgs = CactusWorkflowArguments(
                options,
                experimentFile=experimentFile,
                configNode=configNode,
                seqIDMap=project.inputSequenceIDMap)

            #import the files that cactus-blast made
            workFlowArgs.alignmentsID = toil.importFile(
                makeURL(options.blastOutput))
            try:
                workFlowArgs.secondaryAlignmentsID = toil.importFile(
                    makeURL(options.blastOutput) + '.secondary')
            except Exception:
                # the secondary alignments file is optional
                workFlowArgs.secondaryAlignmentsID = None
            workFlowArgs.outgroupFragmentIDs = outgroupIDs
            workFlowArgs.ingroupCoverageIDs = []
            if cactus_blast_input and len(outgroups) > 0:
                # one coverage file is imported per leaf, by index
                workFlowArgs.ingroupCoverageIDs = [
                    toil.importFile(
                        makeURL(options.blastOutput) +
                        '.ig_coverage_{}'.format(i))
                    for i in range(len(leaves))
                ]

            halID = toil.start(
                Job.wrapJobFn(run_cactus_align, configWrapper, workFlowArgs,
                              project, cactus_blast_input))

        # export the hal
        toil.exportFile(halID, makeURL(options.outputHal))
def runCactusGraphMap(options):
    """Run the minigraph mapping workflow and export its PAF (and FASTA) output.

    A fresh run applies any seqfile path overrides, imports the GFA and
    every sequence except the graph's own assembly, and starts
    ``minigraph_workflow``.  In both the fresh and the restarted case the
    PAF is exported to ``options.outputPAF``; when the workflow produced a
    graph FASTA it is exported to ``options.outputFasta`` and added to the
    seqfile in place.
    """
    with Toil(options) as toil:
        importSingularityImage(options)

        #load cactus config
        # Loaded before the restart branch because graph_event is needed
        # after the workflow finishes in both cases.
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        config.substituteAllPredefinedConstantsWithLiterals()

        # get the minigraph "virtual" assembly name
        graph_event = getOptionalAttrib(findRequiredNode(
            configNode, "refgraph"),
                                        "assemblyName",
                                        default="__MINIGRAPH_SEQUENCES__")

        #Run the workflow
        if options.restart:
            # restart() returns the root job's return value, i.e. the same
            # pair start() yields below.
            paf_id, gfa_fa_id = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides.  this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile).  one way to fix would be to add support
            # for s3 paths and force wdl to use it.  a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                overrideConfigNode = ET.parse(options.configFile).getroot()
                overrideConfig = ConfigWrapper(overrideConfigNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(
                    prefix=overrideConfig.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames,
                                          options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir,
                                            'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            # load the seqfile
            seqFile = SeqFile(options.seqFile)

            logger.info("Genomes for graphmap, {}".format(seqFile.pathMap))

            if not options.outputFasta and graph_event not in seqFile.pathMap:
                raise RuntimeError(
                    "{} assembly not found in seqfile so it must be specified with --outputFasta"
                    .format(graph_event))

            #import the graph
            gfa_id = toil.importFile(makeURL(options.minigraphGFA))

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            seqIDMap = {}
            for genome, seq in seqFile.pathMap.items():
                if genome != graph_event:
                    if os.path.isdir(seq):
                        # a directory of sequence files is concatenated
                        # into a single temp file before import
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    seqIDMap[genome] = toil.importFile(seq)

            # run the workflow
            paf_id, gfa_fa_id = toil.start(
                Job.wrapJobFn(minigraph_workflow, options, config, seqIDMap,
                              gfa_id, graph_event))

        #export the paf
        toil.exportFile(paf_id, makeURL(options.outputPAF))
        if gfa_fa_id:
            toil.exportFile(gfa_fa_id, makeURL(options.outputFasta))

            # update the input seqfile (in place!)
            add_genome_to_seqfile(options.seqFile,
                                  makeURL(options.outputFasta), graph_event)
def cactusPrepare(options, project):
    """Annotate a SeqFile with ancestral names and output-sequence paths.

    Reads ``options.seqFile`` and ``options.configFile``, optionally writes
    a modified config (``config-prepared.xml``) into ``options.outDir``,
    names the unlabeled internal nodes of the tree, writes the annotated
    seqfile to ``options.outSeqFile`` when that is set, and prints the plan
    returned by ``get_plan``.

    Raises RuntimeError when the output directory cannot be created
    (non-WDL mode only).
    """
    # read the input
    seqFile = SeqFile(options.seqFile)
    configNode = ET.parse(options.configFile).getroot()
    config = ConfigWrapper(configNode)

    if not options.wdl:
        # prepare output sequence directory
        # todo: support remote (ie s3) output directory
        try:
            os.makedirs(options.outDir)
        except OSError:
            # Typically "already exists"; any real failure is caught by the
            # isdir check just below.
            pass
        if not os.path.isdir(options.outDir):
            raise RuntimeError('Unable to create output sequence directory \'{}\''.format(options.outDir))
        if not os.access(options.outDir, os.W_OK):
            logger.warning('Output sequence directory is not writeable: \'{}\''.format(options.outDir))

    if options.preprocessOnly or options.gpu:
        if options.preprocessOnly:
            # hack the configfile to skip preprocessing and write it to the output dir
            config.removePreprocessors()
        if options.gpu:
            # hack the configfile to toggle on gpu lastz
            cafNode = findRequiredNode(config.xmlRoot, "caf")
            cafNode.attrib["gpuLastz"] = "true"
            # realigning doesn't mix well with lastz so we make sure it's off
            # https://github.com/ComparativeGenomicsToolkit/cactus/issues/271
            cafNode.attrib["realign"] = "0"
        options.configFile = os.path.join(options.outDir, 'config-prepared.xml')
        sys.stderr.write("configuration saved in {}\n".format(options.configFile))
        config.writeXML(options.configFile)

    # pass through the config file to the options
    # todo (don't like second hard-code check of .xml path)
    if options.configFile != os.path.join(cactusRootPath(), "cactus_progressive_config.xml") and not options.wdl:
        options.cactusOptions += ' --configFile {}'.format(options.configFile)

    # get the ancestor names
    tree = MultiCactusTree(seqFile.tree)
    tree.nameUnlabeledInternalNodes(prefix = config.getDefaultInternalNodePrefix())

    # make the output
    outSeqFile = SeqFile()
    outSeqFile.tree = tree
    outSeqFile.pathMap = copy.deepcopy(seqFile.pathMap)
    outSeqFile.outgroups = copy.deepcopy(seqFile.outgroups)

    # update paths for preprocessed leaves or inferred ancestors
    for node in outSeqFile.tree.breadthFirstTraversal():
        name = outSeqFile.tree.getName(node)
        leaf = outSeqFile.tree.isLeaf(node)
        # Leaves always get an output path; internal nodes only when they
        # have no input path and we are not in preprocess-only mode.
        if leaf or (name not in seqFile.pathMap and not options.preprocessOnly):
            out_basename = seqFile.pathMap[name] if name in seqFile.pathMap else '{}.fa'.format(name)
            outSeqFile.pathMap[name] = os.path.join(options.outDir, os.path.basename(out_basename))
            if options.wdl:
                # uniquify name in wdl to prevent collisions
                outSeqFile.pathMap[name] += '.pp'

    # write the output
    if options.outSeqFile:
        with open(options.outSeqFile, 'w') as out_sf:
            out_sf.write(str(outSeqFile))

    # write the instructions
    print(get_plan(options, project, seqFile, outSeqFile))
class MultiCactusProject:
    """In-memory representation of a multi-cactus project XML file.

    Holds the species tree, a map from event name to experiment XML path,
    the list of input sequence paths and the output sequence directory.
    """

    def __init__(self):
        self.mcTree = None
        # event name -> experiment XML path
        self.expMap = dict()
        # input sequence paths, in the order stored in the project XML
        self.inputSequences = []
        self.outputSequenceDir = None

    def readXML(self, path):
        """Populate this project from the project XML file at ``path``."""
        xmlRoot = ET.parse(path).getroot()
        treeElem = xmlRoot.find("tree")
        self.mcTree = MultiCactusTree(NXNewick().parseString(treeElem.text, addImpliedRoots=False))
        self.expMap = dict()
        for cactusPathElem in xmlRoot.findall("cactus"):
            nameElem = cactusPathElem.attrib["name"]
            pathElem = cactusPathElem.attrib["experiment_path"]
            self.expMap[nameElem] = pathElem
        self.inputSequences = xmlRoot.attrib["inputSequences"].split()
        self.outputSequenceDir = xmlRoot.attrib["outputSequenceDir"]
        self.mcTree.assignSubtreeRootNames(self.expMap)

    def writeXML(self, path):
        """Serialize this project as pretty-printed XML to ``path``."""
        xmlRoot = ET.Element("multi_cactus")
        treeElem = ET.Element("tree")
        treeElem.text = NXNewick().writeString(self.mcTree)
        xmlRoot.append(treeElem)
        for name, expPath in self.expMap.items():
            cactusPathElem = ET.Element("cactus")
            cactusPathElem.attrib["name"] = name
            cactusPathElem.attrib["experiment_path"] = expPath
            xmlRoot.append(cactusPathElem)
        # We keep track of all the input sequences at the top level
        xmlRoot.attrib["inputSequences"] = " ".join(self.inputSequences)
        xmlRoot.attrib["outputSequenceDir"] = self.outputSequenceDir
        # Build the full document before touching the file so a
        # serialization failure cannot leave a truncated file behind, and
        # let the context manager close the handle on every path.
        xmlString = minidom.parseString(ET.tostring(xmlRoot)).toprettyxml()
        with open(path, "w") as xmlFile:
            xmlFile.write(xmlString)

    # find the sequence associated with an event name
    # by digging out the appropriate experiment file
    # doesn't work for the root!!!!
    def sequencePath(self, eventName):
        """Return the sequence path for ``eventName`` from its parent experiment."""
        parentEvent = self.mcTree.getSubtreeRoot(eventName)
        expPath = self.expMap[parentEvent]
        expElem = ET.parse(expPath).getroot()
        exp = ExperimentWrapper(expElem)
        seq = exp.getSequence(eventName)
        assert os.path.isfile(seq)
        return seq

    def getInputSequenceMap(self):
        """Return a map between event names and sequence paths.

        Paths are different from above in that they are not taken from
        experiment xmls, but rather from directly from the project xml.
        The i-th leaf visited in post-order is paired with the i-th entry
        of ``inputSequences``.
        """
        inputSequenceMap = dict()
        i = 0
        for node in self.mcTree.postOrderTraversal():
            if self.mcTree.isLeaf(node) is True:
                inputSequenceMap[self.mcTree.getName(node)] = self.inputSequences[i]
                i += 1
        assert i == len(self.inputSequences)
        return inputSequenceMap

    def getInputSequencePaths(self):
        """Get the set of input sequences for the multicactus tree
        """
        return self.inputSequences

    def getOutputSequenceDir(self):
        """The directory where the output sequences go
        """
        return self.outputSequenceDir

    def getConfigPath(self):
        """Return the config path recorded in the first experiment of expMap.

        Raises IndexError when the project has no experiments.
        """
        # dict.values() is a non-indexable view on Python 3; materialize it
        # so the lookup works on both Python 2 and Python 3.
        firstExpPath = list(self.expMap.values())[0]
        return ExperimentWrapper(ET.parse(firstExpPath).getroot()).getConfigPath()
def cleanEventTree(experiment):
    """Normalize the event names of the experiment's species tree.

    Dots in event names are replaced by underscores, every non-root node
    must have a branch length (RuntimeError otherwise), and duplicate names
    are made unique by appending a running integer.  The cleaned tree is
    written back to the experiment's ``species_tree`` attribute and the
    sequence map is rebuilt.
    """
    eventTree = MultiCactusTree(experiment.getTree())
    eventTree.nameUnlabeledInternalNodes()

    # Pass 1: sanitize names and check branch lengths.
    for nodeId in eventTree.breadthFirstTraversal():
        if not eventTree.hasName(nodeId):
            continue
        oldName = eventTree.getName(nodeId)
        if '.' in oldName:
            cleaned = oldName.replace('.', '_')
            sys.stderr.write('WARNING renaming event %s to %s\n' %
                             (oldName, cleaned))
            eventTree.setName(nodeId, cleaned)
        parentId = eventTree.getParent(nodeId)
        if parentId is not None and eventTree.getWeight(parentId,
                                                        nodeId) is None:
            raise RuntimeError('Missing branch length in species_tree tree')

    # Pass 2: rename duplicates; repeat until a full sweep renames nothing.
    suffix = 0
    changed = True
    while changed:
        changed = False
        for first in eventTree.breadthFirstTraversal():
            firstName = eventTree.getName(first)
            for second in eventTree.breadthFirstTraversal():
                secondName = eventTree.getName(second)
                if first == second or firstName != secondName:
                    continue
                uniqueName = "%s%i" % (secondName, suffix)
                suffix += 1
                eventTree.setName(second, uniqueName)
                sys.stderr.write('WARNING renaming event %s to %s\n' %
                                 (secondName, uniqueName))
                changed = True

    experiment.xmlRoot.attrib["species_tree"] = NXNewick().writeString(
        eventTree)
    experiment.seqMap = experiment.buildSequenceMap()
def cleanEventTree(experiment):
    """Sanitise and de-duplicate the event names of an experiment's tree.

    Steps, in order:
      1. name the unlabeled internal nodes;
      2. for every named node, swap '.' for '_' in its name (with a warning
         on stderr) and, if it has a parent, require a branch length;
      3. rename nodes that share a name by appending an increasing integer
         (with a warning on stderr) until no two nodes share a name;
      4. write the tree to the experiment's "species_tree" attribute and
         rebuild the experiment's sequence map.

    Raises RuntimeError when step 2 finds a missing branch length.
    """
    warn = sys.stderr.write
    speciesTree = MultiCactusTree(experiment.getTree())
    speciesTree.nameUnlabeledInternalNodes()

    for treeNode in speciesTree.breadthFirstTraversal():
        if speciesTree.hasName(treeNode):
            eventName = speciesTree.getName(treeNode)
            if '.' in eventName:
                dotFreeName = eventName.replace('.', '_')
                warn('WARNING renaming event %s to %s\n' % (eventName, dotFreeName))
                speciesTree.setName(treeNode, dotFreeName)
            parentOfNode = speciesTree.getParent(treeNode)
            if parentOfNode is not None:
                branchLength = speciesTree.getWeight(parentOfNode, treeNode)
                if branchLength is None:
                    raise RuntimeError('Missing branch length in species_tree tree')

    nextSuffix = 0
    needAnotherSweep = True
    while needAnotherSweep:
        needAnotherSweep = False
        for nodeA in speciesTree.breadthFirstTraversal():
            # Read once per outer node, before the inner sweep renames anything.
            nameA = speciesTree.getName(nodeA)
            for nodeB in speciesTree.breadthFirstTraversal():
                nameB = speciesTree.getName(nodeB)
                if not (nodeA != nodeB and nameA == nameB):
                    continue
                uniqueName = "%s%i" % (nameB, nextSuffix)
                nextSuffix += 1
                speciesTree.setName(nodeB, uniqueName)
                warn('WARNING renaming event %s to %s\n' % (nameB, uniqueName))
                needAnotherSweep = True

    experiment.xmlRoot.attrib["species_tree"] = NXNewick().writeString(speciesTree)
    experiment.seqMap = experiment.buildSequenceMap()
def runCactusBlastOnly(options):
    """Run the trimming-blast phase for event ``options.root`` and export its outputs.

    A fresh run builds a progressive cactus project in a temporary directory,
    imports the sequences of the event's child genomes and outgroups into the
    Toil file store, and starts a stand-alone CactusTrimmingBlastPhase.  A
    restarted run (``options.restart``) resumes the interrupted workflow.

    In both cases the phase's results are exported relative to
    ``options.outputFile``:
      * the alignments to ``outputFile`` itself,
      * secondary alignments (if any) to ``outputFile + '.secondary'``,
      * outgroup fragments to ``outputFile + '.og_fragment_<i>'``,
      * ingroup coverage files to ``outputFile + '.ig_coverage_<i>'``.
    """
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            # restart() yields the root job's return value, i.e. the same
            # object toil.start() returns in the other branch.  It has to be
            # bound to the name the export code below reads, otherwise a
            # restarted run never exports its results.
            outWorkFlowArgs = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides.  this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile).  one way to fix would be to add support
            # for s3 paths and force wdl to use it.  a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(
                    prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames,
                                          options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir,
                                            'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #to be consistent with all-in-one cactus, we make sure the project
            #isn't limiting itself to the subtree (todo: parameterize so root can
            #be passed through from prepare to blast/align)
            proj_options = copy.deepcopy(options)
            proj_options.root = None
            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(proj_options,
                                         proj_options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            logger.info("Experiment {}".format(ET.tostring(expXml)))
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            # NOTE(review): the parsed result is unused; the call is kept so a
            # missing or malformed experiment config still fails at this point.
            configXml = ET.parse(configPath).getroot()

            tree = MultiCactusTree(experiment.getTree()).extractSubTree(
                options.root)
            leaves = tree.getChildNames(tree.getRootName())
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)
            logger.info("Genomes in blastonly, {}: {}".format(
                options.root, list(genome_set)))

            # NOTE(review): looks like a leftover debug print to stdout; left
            # untouched in case something consumes it.
            print(str(project.inputSequenceMap))

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            # iterate over a snapshot because out-of-scope entries are deleted below
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in genome_set:
                    if os.path.isdir(seq):
                        # a directory of files is concatenated into one temp file
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    project.inputSequenceIDMap[genome] = toil.importFile(seq)
                else:
                    # out-of-scope sequences will only cause trouble later on
                    del project.inputSequenceMap[genome]

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            workFlowArgs = CactusWorkflowArguments(
                options,
                experimentFile=experimentFile,
                configNode=configNode,
                seqIDMap=project.inputSequenceIDMap)

            outWorkFlowArgs = toil.start(
                CactusTrimmingBlastPhase(standAlone=True,
                                         cactusWorkflowArguments=workFlowArgs,
                                         phaseName="trimBlast"))

        # export the alignments
        toil.exportFile(outWorkFlowArgs.alignmentsID,
                        makeURL(options.outputFile))
        # optional secondary alignments
        if outWorkFlowArgs.secondaryAlignmentsID:
            toil.exportFile(outWorkFlowArgs.secondaryAlignmentsID,
                            makeURL(options.outputFile) + '.secondary')
        # outgroup fragments and coverage are necessary for cactus-align, as
        # the sequence names got changed in the above alignments
        for i, outgroupFragmentID in enumerate(
                outWorkFlowArgs.outgroupFragmentIDs):
            toil.exportFile(
                outgroupFragmentID,
                makeURL(options.outputFile) + '.og_fragment_{}'.format(i))
        # cactus-align can recompute coverage on the fly, but we save them because we have them
        for i, ingroupCoverageID in enumerate(
                outWorkFlowArgs.ingroupCoverageIDs):
            toil.exportFile(
                ingroupCoverageID,
                makeURL(options.outputFile) + '.ig_coverage_{}'.format(i))
def runCactusAfterBlastOnly(options):
    """Run the alignment step on precomputed alignments and export the HAL.

    A fresh run builds a progressive cactus project in a temporary directory,
    imports the input sequences and the files listed in ``options.cigarsFile``
    into the Toil file store, and starts ``run_cactus_align`` for the event
    ``options.root``.  A restarted run (``options.restart``) resumes the
    interrupted workflow.  In both cases the resulting file is exported to
    ``options.outputHal``.

    Outgroup fragments ('.og_fragment_<i>'), secondary alignments
    ('.secondary') and ingroup coverage files ('.ig_coverage_<i>') are looked
    up next to the primary alignment file; fragments and secondary alignments
    are optional.
    """
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides.  this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile).  one way to fix would be to add support
            # for s3 paths and force wdl to use it.  a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(
                    prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames,
                                          options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir,
                                            'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #to be consistent with all-in-one cactus, we make sure the project
            #isn't limiting itself to the subtree (todo: parameterize so root can
            #be passed through from prepare to blast/align)
            proj_options = copy.deepcopy(options)
            proj_options.root = None
            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(proj_options,
                                         proj_options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            # NOTE(review): the parsed result is unused; the call is kept so a
            # missing or malformed experiment config still fails at this point.
            configXml = ET.parse(configPath).getroot()

            tree = MultiCactusTree(experiment.getTree()).extractSubTree(
                options.root)
            leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)

            # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
            def get_input_path(suffix=''):
                """Return the cigarsFile entry ending in *suffix*, else base path + suffix.

                The base path starts as the first entry and is replaced by any
                entry whose basename is a prefix of the current base's basename.
                """
                base_path = options.cigarsFile[0]
                for input_path in options.cigarsFile:
                    if suffix and input_path.endswith(suffix):
                        return input_path
                    if os.path.basename(base_path).startswith(
                            os.path.basename(input_path)):
                        base_path = input_path
                return base_path + suffix

            # import the outgroups
            outgroupIDs = []
            outgroup_fragment_found = False
            for i, outgroup in enumerate(outgroups):
                try:
                    outgroupID = toil.importFile(
                        makeURL(get_input_path('.og_fragment_{}'.format(i))))
                    outgroupIDs.append(outgroupID)
                    experiment.setSequenceID(outgroup, outgroupID)
                    outgroup_fragment_found = True
                    # NOTE(review): a failure of this assert is swallowed by the
                    # handler below, leaving outgroup_fragment_found True with
                    # an empty outgroupIDs - confirm this is intended.
                    assert not options.pangenome
                except Exception:
                    # Exception rather than a bare except, so Ctrl-C / SystemExit
                    # still abort the run.
                    # we assume that input is not coming from cactus blast, so we'll treat output
                    # sequences normally and not go looking for fragments
                    outgroupIDs = []
                    break

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in leaves or (not outgroup_fragment_found
                                        and genome in outgroups):
                    if os.path.isdir(seq):
                        # a directory of files is concatenated into one temp file
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    experiment.setSequenceID(genome, toil.importFile(seq))

            if not outgroup_fragment_found:
                outgroupIDs = [
                    experiment.getSequenceID(outgroup)
                    for outgroup in outgroups
                ]

            # write back the experiment, as CactusWorkflowArguments wants a path
            experiment.writeXML(experimentFile)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            if options.pangenome:
                # turn off the megablock filter as it ruins non-all-to-all alignments
                configWrapper.disableCafMegablockFilter()
                # the recoverable chains parameter does not seem to play nicely with star-like alignments either
                #configWrapper.disableRecoverableChains()

            workFlowArgs = CactusWorkflowArguments(
                options,
                experimentFile=experimentFile,
                configNode=configNode,
                seqIDMap=project.inputSequenceIDMap)

            #import the files that cactus-blast made
            workFlowArgs.alignmentsID = toil.importFile(
                makeURL(get_input_path()))
            workFlowArgs.secondaryAlignmentsID = None
            if not options.pafInput:
                try:
                    workFlowArgs.secondaryAlignmentsID = toil.importFile(
                        makeURL(get_input_path('.secondary')))
                except Exception:
                    # secondary alignments are optional: leave the ID as None.
                    # Exception rather than a bare except, so Ctrl-C /
                    # SystemExit are not silently discarded.
                    pass
            workFlowArgs.outgroupFragmentIDs = outgroupIDs
            workFlowArgs.ingroupCoverageIDs = []
            if outgroup_fragment_found and len(outgroups) > 0:
                # one coverage file is imported per leaf, by index
                for i in range(len(leaves)):
                    workFlowArgs.ingroupCoverageIDs.append(
                        toil.importFile(
                            makeURL(get_input_path(
                                '.ig_coverage_{}'.format(i)))))

            halID = toil.start(
                Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput))

        # export the hal
        toil.exportFile(halID, makeURL(options.outputHal))
def make_align_job(options, toil):
    """Build (without starting) the Toil job that runs ``run_cactus_align``.

    The function creates a progressive cactus project in a temporary
    directory, imports the input sequences, the cactus config and the files
    listed in ``options.cigarsFile`` into the Toil file store via *toil*,
    applies the command-line driven config overrides, and wraps
    ``run_cactus_align`` in a job.

    Side effects on *options*: ``cactusDir`` is set to a new temp directory;
    ``seqFile`` is redirected when path overrides are given; ``root`` is set
    to the tree's root name when empty.

    Returns the wrapped job.

    Raises RuntimeError if ``options.acyclic`` names a genome that is not a
    leaf of the seqfile tree.
    """
    options.cactusDir = getTempDirectory()

    # apply path overrides.  this was necessary for wdl which doesn't take kindly to
    # text files of local paths (ie seqfile).  one way to fix would be to add support
    # for s3 paths and force wdl to use it.  a better way would be a more fundamental
    # interface shift away from files of paths throughout all of cactus
    if options.pathOverrides:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        tree = MultiCactusTree(seqFile.tree)
        tree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        for name, override in zip(options.pathOverrideNames,
                                  options.pathOverrides):
            seqFile.pathMap[name] = override
        override_seq = os.path.join(options.cactusDir, 'seqFile.override')
        with open(override_seq, 'w') as out_sf:
            out_sf.write(str(seqFile))
        options.seqFile = override_seq

    # default the root event to the root of the (fully named) seqfile tree
    if not options.root:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        mcTree = MultiCactusTree(seqFile.tree)
        mcTree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        options.root = mcTree.getRootName()

    if options.acyclic:
        seqFile = SeqFile(options.seqFile)
        tree = MultiCactusTree(seqFile.tree)
        leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
        if options.acyclic not in leaves:
            raise RuntimeError(
                "Genome specified with --acyclic, {}, not found in tree leaves"
                .format(options.acyclic))

    #to be consistent with all-in-one cactus, we make sure the project
    #isn't limiting itself to the subtree (todo: parameterize so root can
    #be passed through from prepare to blast/align)
    proj_options = copy.deepcopy(options)
    proj_options.root = None
    #Create the progressive cactus project (as we do in runCactusProgressive)
    projWrapper = ProjectWrapper(proj_options,
                                 proj_options.configFile,
                                 ignoreSeqPaths=options.root)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    # open up the experiment (as we do in ProgressiveUp.run)
    # note that we copy the path into the options here
    experimentFile = project.expMap[options.root]
    expXml = ET.parse(experimentFile).getroot()
    experiment = ExperimentWrapper(expXml)
    configPath = experiment.getConfigPath()
    # NOTE(review): the parsed result is unused; the call is kept so a missing
    # or malformed experiment config still fails at this point.
    configXml = ET.parse(configPath).getroot()

    tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
    leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
    outgroups = experiment.getOutgroupGenomes()
    genome_set = set(leaves + outgroups)

    # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
    def get_input_path(suffix=''):
        """Return the cigarsFile entry ending in *suffix*, else base path + suffix.

        The base path starts as the first entry and is replaced by any entry
        whose basename is a prefix of the current base's basename.
        """
        base_path = options.cigarsFile[0]
        for input_path in options.cigarsFile:
            if suffix and input_path.endswith(suffix):
                return input_path
            if os.path.basename(base_path).startswith(
                    os.path.basename(input_path)):
                base_path = input_path
        return base_path + suffix

    # import the outgroups
    outgroupIDs = []
    outgroup_fragment_found = False
    for i, outgroup in enumerate(outgroups):
        try:
            outgroupID = toil.importFile(
                makeURL(get_input_path('.og_fragment_{}'.format(i))))
            outgroupIDs.append(outgroupID)
            experiment.setSequenceID(outgroup, outgroupID)
            outgroup_fragment_found = True
            # NOTE(review): a failure of this assert is swallowed by the
            # handler below, leaving outgroup_fragment_found True with an
            # empty outgroupIDs - confirm this is intended.
            assert not options.pangenome
        except Exception:
            # Exception rather than a bare except, so Ctrl-C / SystemExit
            # still abort the run.
            # we assume that input is not coming from cactus blast, so we'll treat output
            # sequences normally and not go looking for fragments
            outgroupIDs = []
            break

    #import the sequences (that we need to align for the given event, ie leaves and outgroups)
    for genome, seq in list(project.inputSequenceMap.items()):
        if genome in leaves or (not outgroup_fragment_found
                                and genome in outgroups):
            if os.path.isdir(seq):
                # a directory of files is concatenated into one temp file
                tmpSeq = getTempFile()
                catFiles(
                    [os.path.join(seq, subSeq) for subSeq in os.listdir(seq)],
                    tmpSeq)
                seq = tmpSeq
            seq = makeURL(seq)

            logger.info("Importing {}".format(seq))
            experiment.setSequenceID(genome, toil.importFile(seq))

    if not outgroup_fragment_found:
        outgroupIDs = [
            experiment.getSequenceID(outgroup) for outgroup in outgroups
        ]

    # write back the experiment, as CactusWorkflowArguments wants a path
    experiment.writeXML(experimentFile)

    #import cactus config
    if options.configFile:
        cactusConfigID = toil.importFile(makeURL(options.configFile))
    else:
        cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
    project.setConfigID(cactusConfigID)

    project.syncToFileStore(toil)
    configNode = ET.parse(project.getConfigPath()).getroot()
    configWrapper = ConfigWrapper(configNode)
    configWrapper.substituteAllPredefinedConstantsWithLiterals()

    if options.singleCopySpecies:
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["alignmentFilter"] = "singleCopyEvent:{}".format(
                options.singleCopySpecies)

    if options.barMaskFilter:
        findRequiredNode(
            configWrapper.xmlRoot,
            "bar").attrib["partialOrderAlignmentMaskFilter"] = str(
                options.barMaskFilter)

    if options.pangenome:
        # turn off the megablock filter as it ruins non-all-to-all alignments
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["minimumBlockHomologySupport"] = "0"
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["minimumBlockDegreeToCheckSupport"] = "9999999999"
        # turn off mapq filtering
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["runMapQFiltering"] = "0"
        # more iterations here helps quite a bit to reduce underalignment
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["maxRecoverableChainsIterations"] = "50"
        # turn down minimum block degree to get a fat ancestor
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["minimumBlockDegree"] = "1"
        # turn on POA
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["partialOrderAlignment"] = "1"
        # save it
        if not options.batch:
            pg_file = options.outHal + ".pg-conf.xml"
            # for an s3 destination, write to a local temp file first and
            # upload it afterwards
            if pg_file.startswith('s3://'):
                pg_temp_file = getTempFile()
            else:
                pg_temp_file = pg_file
            configWrapper.writeXML(pg_temp_file)
            if pg_file.startswith('s3://'):
                write_s3(pg_temp_file,
                         pg_file,
                         region=get_aws_region(options.jobStore))
            logger.info("pangenome configuration overrides saved in {}".format(
                pg_file))

    workFlowArgs = CactusWorkflowArguments(options,
                                           experimentFile=experimentFile,
                                           configNode=configNode,
                                           seqIDMap=project.inputSequenceIDMap)

    #import the files that cactus-blast made
    workFlowArgs.alignmentsID = toil.importFile(makeURL(get_input_path()))
    workFlowArgs.secondaryAlignmentsID = None
    if not options.pafInput:
        try:
            workFlowArgs.secondaryAlignmentsID = toil.importFile(
                makeURL(get_input_path('.secondary')))
        except Exception:
            # secondary alignments are optional: leave the ID as None.
            # Exception rather than a bare except, so Ctrl-C / SystemExit are
            # not silently discarded.
            pass
    workFlowArgs.outgroupFragmentIDs = outgroupIDs
    workFlowArgs.ingroupCoverageIDs = []
    if outgroup_fragment_found and len(outgroups) > 0:
        # one coverage file is imported per leaf, by index
        for i in range(len(leaves)):
            workFlowArgs.ingroupCoverageIDs.append(
                toil.importFile(
                    makeURL(get_input_path('.ig_coverage_{}'.format(i)))))

    align_job = Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              checkpointInfo=options.checkpointInfo,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput,
                              pafSecondaries=options.usePafSecondaries,
                              doVG=options.outVG,
                              doGFA=options.outGFA,
                              delay=options.stagger,
                              eventNameAsID=options.eventNameAsID,
                              acyclicEvent=options.acyclic)
    return align_job