def createFileStructure(mcProj, expTemplate, configTemplate, options):
    """Write the project XML and one experiment/config XML pair per event.

    The project file is written to
    ``<options.path>/<options.name>_project.xml``.  For every entry of
    ``mcProj.expMap`` a deep copy of ``expTemplate`` is updated with the
    spanning tree of the event's children and outgroups and written to the
    mapped experiment path; a copy of the config template, with its reference
    name set to the event, is written to the experiment's config path.

    ``expTemplate.seqMap`` is modified in place: each event gets the entry
    ``<options.path>/<event>/<event>.fa``.
    """
    rootDir = options.path
    if not os.path.exists(rootDir):
        os.makedirs(rootDir)
    mcProj.writeXML(os.path.join(rootDir, "%s_project.xml" % options.name))

    # Register the sequence path of every event up front; the outgroup check
    # in the main loop looks names up in this map.
    seqMap = expTemplate.seqMap
    for event in mcProj.expMap:
        seqMap[event] = os.path.join(rootDir, event, event + '.fa')

    for event, expFile in mcProj.expMap.items():
        eventDir = os.path.join(rootDir, event)
        childNames = mcProj.entireTree.getChildNames(event)
        experiment = copy.deepcopy(expTemplate)

        # Get outgroups (only the name of each (name, distance) pair is used)
        ogNames = []
        wantOutgroups = configTemplate.getOutgroupStrategy() != 'none'
        if wantOutgroups and event in mcProj.outgroup.ogMap:
            for ogName, _ogDist in mcProj.outgroup.ogMap[event]:
                assert ogName in seqMap, \
                    "No sequence found for outgroup: %s" % ogName
                ogNames.append(ogName)

        # Get subtree connecting children + outgroups
        assert len(childNames) > 0
        spanningTree = mcProj.entireTree.extractSpanningTree(
            childNames + ogNames)
        experiment.updateTree(spanningTree, seqMap, ogNames)
        experiment.setConfigPath(
            os.path.join(eventDir, "%s_config.xml" % event))
        if not os.path.exists(eventDir):
            os.makedirs(eventDir)
        experiment.writeXML(expFile)

        eventConfig = ConfigWrapper(copy.deepcopy(configTemplate.xmlRoot))
        eventConfig.setReferenceName(event)
        eventConfig.writeXML(experiment.getConfigPath())
class ProjectWrapper:
    """Prepares the config and experiment templates of a progressive
    alignment and creates the project inside ``options.cactusDir``."""

    # name of the project sub-directory created inside the working directory
    alignmentDirName = 'progressiveAlignment'

    def __init__(self, options):
        self.options = options
        self.seqFile = SeqFile(options.seqFile)
        self.workingDir = options.cactusDir
        self.configWrapper = None
        self.expWrapper = None
        self.processConfig()
        self.processExperiment()

    def processConfig(self):
        """Load the config XML and switch on HAL and FASTA building."""
        configPath = self.options.configFile
        if configPath is None:
            # no config given: read in the default right out of cactus
            configPath = os.path.join(cactusRootPath(),
                                      "cactus_progressive_config.xml")
        log.info("Using config from path %s." % configPath)
        self.configWrapper = ConfigWrapper(ET.parse(configPath).getroot())
        # here we can go through the options and apply some to the config
        self.configWrapper.setBuildHal(True)
        self.configWrapper.setBuildFasta(True)

    def processExperiment(self):
        """Build the experiment template XML and create the working dir."""
        expXml = self.seqFile.toXMLElement()
        # create the cactus disk
        diskElem = ET.SubElement(expXml, "cactus_disk")
        dbType = self.options.database
        assert dbType in ("kyoto_tycoon", "tokyo_cabinet")
        dbConfElem = ET.SubElement(diskElem, "st_kv_database_conf")
        dbConfElem.set("type", dbType)
        ET.SubElement(dbConfElem, dbType)
        self.expWrapper = ExperimentWrapper(expXml)
        if not os.path.exists(self.workingDir):
            os.makedirs(self.workingDir)

    def writeXml(self):
        """Write config.xml and expTemplate.xml, then create the project."""
        assert os.path.isdir(self.workingDir)
        workDir = self.workingDir
        configPath = absSymPath(os.path.join(workDir, "config.xml"))
        expPath = absSymPath(os.path.join(workDir, "expTemplate.xml"))
        self.expWrapper.setConfigPath(configPath)
        self.configWrapper.writeXML(configPath)
        self.expWrapper.writeXML(expPath)

        projPath = os.path.join(workDir, ProjectWrapper.alignmentDirName)
        # No outgroups specified means None is passed on, i.e. assume the
        # default outgroup set
        seqOutgroups = self.seqFile.outgroups
        outgroupNames = seqOutgroups if len(seqOutgroups) != 0 else None
        runCreateMultiCactusProject(expPath, projPath, fixNames=0,
                                    outgroupNames=outgroupNames,
                                    root=self.options.root)
class ProjectWrapper:
    """Sets up the config and experiment templates for a progressive
    alignment and writes the project into ``options.cactusDir``."""

    # sub-directory of the working directory that holds the project
    alignmentDirName = 'progressiveAlignment'

    def __init__(self, options):
        self.options = options
        self.seqFile = SeqFile(options.seqFile)
        self.workingDir = options.cactusDir
        self.configWrapper = None
        self.expWrapper = None
        self.processConfig()
        self.processExperiment()

    def processConfig(self):
        """Parse the config file and enable HAL and FASTA output in it."""
        chosenPath = self.options.configFile
        if chosenPath is None:
            # fall back to the default config shipped in the cactus root
            chosenPath = os.path.join(cactusRootPath(),
                                      "cactus_progressive_config.xml")
        log.info("Using config from path %s." % chosenPath)
        rootNode = ET.parse(chosenPath).getroot()
        self.configWrapper = ConfigWrapper(rootNode)
        # here we can go through the options and apply some to the config
        self.configWrapper.setBuildHal(True)
        self.configWrapper.setBuildFasta(True)

    def processExperiment(self):
        """Create the experiment template and make sure workingDir exists."""
        expXml = self.seqFile.toXMLElement()
        # create the cactus disk
        dbKind = self.options.database
        diskNode = ET.SubElement(expXml, "cactus_disk")
        assert dbKind in ("kyoto_tycoon", "tokyo_cabinet")
        confNode = ET.SubElement(diskNode, "st_kv_database_conf")
        confNode.set("type", dbKind)
        ET.SubElement(confNode, dbKind)
        self.expWrapper = ExperimentWrapper(expXml)
        if not os.path.exists(self.workingDir):
            os.makedirs(self.workingDir)

    def writeXml(self):
        """Write the two template XML files and create the project."""
        assert os.path.isdir(self.workingDir)
        configPath = absSymPath(os.path.join(self.workingDir, "config.xml"))
        expPath = absSymPath(
            os.path.join(self.workingDir, "expTemplate.xml"))
        self.expWrapper.setConfigPath(configPath)
        self.configWrapper.writeXML(configPath)
        self.expWrapper.writeXML(expPath)

        projPath = os.path.join(self.workingDir,
                                ProjectWrapper.alignmentDirName)
        if len(self.seqFile.outgroups) != 0:
            chosenOutgroups = self.seqFile.outgroups
        else:
            # No outgroups specified, assume the default outgroup set
            chosenOutgroups = None
        runCreateMultiCactusProject(expPath, projPath, fixNames=0,
                                    outgroupNames=chosenOutgroups,
                                    root=self.options.root)
def cactusPrepare(options, project):
    """ annotate a SeqFile with ancestral names as well as paths for output sequences.

    Reads ``options.seqFile`` and ``options.configFile``, creates
    ``options.outSeqDir``, writes the annotated seqfile to
    ``options.outSeqFile`` and prints the plan returned by ``get_plan``.
    ``options.configFile`` and ``options.cactusOptions`` may be updated.

    Raises RuntimeError if the output sequence directory cannot be created.
    """
    # read the input
    seqFile = SeqFile(options.seqFile)
    configNode = ET.parse(options.configFile).getroot()
    config = ConfigWrapper(configNode)

    # prepare output sequence directory
    # todo: support remote (ie s3) output directory
    try:
        os.makedirs(options.outSeqDir)
    except OSError:
        # Typically the directory already exists.  Any real failure is
        # reported by the isdir() check just below.
        pass
    if not os.path.isdir(options.outSeqDir):
        raise RuntimeError('Unable to create output sequence directory \'{}\''.format(options.outSeqDir))
    if not os.access(options.outSeqDir, os.W_OK):
        logger.warning('Output sequence directory is not writeable: \'{}\''.format(options.outSeqDir))

    # hack the configfile to skip preprocessing and write it to the output dir
    if options.preprocessOnly:
        config.removePreprocessors()
        options.configFile = os.path.join(options.outSeqDir, 'config.xml')
        config.writeXML(options.configFile)

    # pass through the config file to the options
    # todo (don't like second hard-code check of .xml path)
    defaultConfig = os.path.join(cactusRootPath(), "cactus_progressive_config.xml")
    if options.configFile != defaultConfig:
        options.cactusOptions += ' --configFile {}'.format(options.configFile)

    # get the ancestor names
    tree = MultiCactusTree(seqFile.tree)
    tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())

    # make the output
    # NOTE: pathMap and outgroups are shared with seqFile (not copied), so the
    # path updates below are visible through seqFile.pathMap as well.
    outSeqFile = SeqFile()
    outSeqFile.tree = tree
    outSeqFile.pathMap = seqFile.pathMap
    outSeqFile.outgroups = seqFile.outgroups

    # update paths for preprocessed leaves or inferred ancestors
    for node in outSeqFile.tree.breadthFirstTraversal():
        name = outSeqFile.tree.getName(node)
        leaf = outSeqFile.tree.isLeaf(node)
        # leaves are always redirected; internal nodes only when they have no
        # input path and we are doing more than preprocessing
        if leaf or (name not in seqFile.pathMap and not options.preprocessOnly):
            out_basename = seqFile.pathMap[name] if name in seqFile.pathMap else '{}.fa'.format(name)
            outSeqFile.pathMap[name] = os.path.join(options.outSeqDir, os.path.basename(out_basename))

    # write the output
    with open(options.outSeqFile, 'w') as out_sf:
        out_sf.write(str(outSeqFile))

    # write the instructions
    print(get_plan(options, project, outSeqFile))
def setUp(self):
    """Select the batch system and write a temporary config with all modes on.

    Sets ``self.batchSystem``, ``self.useOutgroup``, ``self.doSelfAlignment``,
    ``self.tempDir`` and ``self.configFile``.
    """
    # Query the batch system once; fall back to singleMachine when unset.
    batchSystem = getBatchSystem()
    self.batchSystem = batchSystem if batchSystem is not None else "singleMachine"
    unittest.TestCase.setUp(self)
    self.useOutgroup = False
    self.doSelfAlignment = False
    # Load the config file, turn on the checks.
    defaultConfig = os.path.join(cactusRootPath(), "cactus_progressive_config.xml")
    configWrapper = ConfigWrapper(ET.parse(defaultConfig).getroot())
    configWrapper.turnAllModesOn()
    # The modified config is written into a fresh temp dir under the cwd.
    self.tempDir = getTempDirectory(os.getcwd())
    self.configFile = os.path.join(self.tempDir, "tempConfig.xml")
    configWrapper.writeXML(self.configFile)
def setUp(self):
    """Select the batch system and write a temporary config for the test.

    The default config is loaded, all modes are turned on and header checks
    are turned off before it is written to ``self.configFile``.  Also sets
    ``self.batchSystem``, ``self.useOutgroup``, ``self.doSelfAlignment`` and
    ``self.tempDir``.
    """
    # Query the batch system once; fall back to singleMachine when unset.
    batchSystem = getBatchSystem()
    self.batchSystem = batchSystem if batchSystem is not None else "singleMachine"
    unittest.TestCase.setUp(self)
    self.useOutgroup = False
    self.doSelfAlignment = False
    # Load the config file, turn on the checks.
    defaultConfig = os.path.join(cactusRootPath(), "cactus_progressive_config.xml")
    configWrapper = ConfigWrapper(ET.parse(defaultConfig).getroot())
    configWrapper.turnAllModesOn()
    configWrapper.turnOffHeaderChecks()
    # The modified config is written into a fresh temp dir under the cwd.
    self.tempDir = getTempDirectory(os.getcwd())
    self.configFile = os.path.join(self.tempDir, "tempConfig.xml")
    configWrapper.writeXML(self.configFile)
def createFileStructure(mcProj, expTemplate, configTemplate, options):
    """Write the project XML and one experiment/config XML pair per event.

    The project file is written to
    ``<options.path>/<options.name>_project.xml``.  For every entry of
    ``mcProj.expMap`` an experiment is created from the spanning tree of the
    event, its children and its outgroups, using the database configuration
    of ``expTemplate``.  The event is marked as reconstructed unless the
    template already holds a sequence ID for it.
    """
    rootDir = options.path
    if not os.path.exists(rootDir):
        os.makedirs(rootDir)
    mcProj.writeXML(os.path.join(rootDir, "%s_project.xml" % options.name))

    for event, expFile in list(mcProj.expMap.items()):
        eventDir = os.path.join(rootDir, event)
        childNames = mcProj.entireTree.getChildNames(event)

        # Get outgroups
        ogNames = []
        wantOutgroups = configTemplate.getOutgroupStrategy() != 'none'
        if wantOutgroups and event in mcProj.outgroup.ogMap:
            # Outgroup name is the first element of the ogMap tuples
            ogNames.extend(
                [ogEntry[0] for ogEntry in mcProj.outgroup.ogMap[event]])

        # the same genome list feeds both the tree and the experiment
        members = childNames + [event] + ogNames
        spanningTree = mcProj.entireTree.extractSpanningTree(members)
        experiment = ExperimentWrapper.createExperimentWrapper(
            NXNewick().writeString(spanningTree),
            members,
            databaseConf=expTemplate.confElem)
        experiment.setRootGenome(event)
        experiment.setOutgroupGenomes(ogNames)

        if not os.path.exists(eventDir):
            os.makedirs(eventDir)

        eventConfig = ConfigWrapper(copy.deepcopy(configTemplate.xmlRoot))
        # An event with a sequence ID in the template is an input, not
        # something to reconstruct.
        knownSeqID = expTemplate.getSequenceID(event)
        if knownSeqID:
            experiment.setRootReconstructed(False)
            experiment.setSequenceID(event, knownSeqID)
        else:
            experiment.setRootReconstructed(True)
        experiment.writeXML(expFile)
        eventConfig.writeXML(experiment.getConfigPath())
class ProjectWrapper:
    """Prepares the config and experiment templates of a progressive
    alignment and creates (or re-validates) the project in ``workingDir``."""

    # name of the project sub-directory created inside the working directory
    alignmentDirName = 'progressiveAlignment'

    def __init__(self, options, seqFile, workingDir):
        self.options = options
        self.seqFile = seqFile
        self.workingDir = workingDir
        self.configWrapper = None
        self.expWrapper = None
        self.processConfig()
        self.processExperiment()

    def processConfig(self):
        """Load the config XML and apply the command line options to it."""
        # read in the default right out of cactus
        if self.options.configFile is not None:
            configPath = self.options.configFile
        else:
            configPath = os.path.join(cactusRootPath(),
                                      "cactus_progressive_config.xml")
        configXml = ET.parse(configPath).getroot()
        self.configWrapper = ConfigWrapper(configXml)
        # here we can go through the options and apply some to the config
        self.configWrapper.setBuildHal(True)
        self.configWrapper.setBuildFasta(True)
        if self.options.outputMaf is not None:
            self.configWrapper.setBuildMaf(True)
            self.configWrapper.setJoinMaf(True)
        # pre-emptively turn down maxParallelSubtree for singleMachine
        # mode if not enough threads are provided to support it. Probably
        # need to do something for other ?combined? batch systems?
        if self.options.batchSystem == 'singleMachine' and \
                self.options.database == 'kyoto_tycoon':
            maxThreads = int(self.options.maxThreads)
            if maxThreads < self.configWrapper.getMaxParallelSubtrees() * 3:
                # floor division so the value stays an int on Python 3 too
                self.configWrapper.setMaxParallelSubtrees(
                    max(1, maxThreads // 3))
        # this is a little hack to effectively toggle back to the
        # non-progressive version of cactus (as published in Gen. Res. 2011)
        # from the high-level interface.
        if self.options.legacy is True:
            # sys.maxint only exists on Python 2
            self.configWrapper.setSubtreeSize(
                getattr(sys, "maxint", sys.maxsize))

    def processExperiment(self):
        """Build the experiment template and the sequence output directory."""
        expXml = self.seqFile.toXMLElement()
        # create the cactus disk
        cdElem = ET.SubElement(expXml, "cactus_disk")
        database = self.options.database
        assert database == "kyoto_tycoon" or database == "tokyo_cabinet"
        confElem = ET.SubElement(cdElem, "st_kv_database_conf")
        confElem.attrib["type"] = database
        ET.SubElement(confElem, database)
        self.expWrapper = ExperimentWrapper(expXml)

        if self.options.database == "kyoto_tycoon":
            self.expWrapper.setDbPort(str(self.options.ktPort))
            if self.options.ktHost is not None:
                self.expWrapper.setDbHost(self.options.ktHost)
            if self.options.ktType == 'memory':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(False)
            elif self.options.ktType == 'snapshot':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(True)
            else:
                assert self.options.ktType == 'disk'
                self.expWrapper.setDbInMemory(False)
                self.expWrapper.setDbSnapshot(False)
            # options.ktOpts is deliberately not passed on: sonlib doesn't
            # allow for spaces in attributes in the db conf, which renders
            # that option useless
            if self.options.ktCreateTuning is not None:
                self.expWrapper.setDbCreateTuningOptions(
                    self.options.ktCreateTuning)
            if self.options.ktOpenTuning is not None:
                self.expWrapper.setDbReadTuningOptions(
                    self.options.ktOpenTuning)

        # set the sequence output directory
        outSeqDir = os.path.join(self.workingDir, "sequenceData")
        if os.path.exists(outSeqDir) and self.options.overwrite:
            system("rm -rf %s" % outSeqDir)
        if not os.path.exists(outSeqDir):
            system("mkdir %s" % outSeqDir)
        self.expWrapper.setOutputSequenceDir(outSeqDir)

    def _createProjectCmd(self, expPath, projPath, fixNames):
        """Return the cactus_createMultiCactusProject.py command line."""
        cmd = "cactus_createMultiCactusProject.py %s %s --fixNames=%d" % (
            expPath, projPath, fixNames)
        if len(self.seqFile.outgroups) > 0:
            cmd += " --outgroupNames " + ",".join(self.seqFile.outgroups)
        if self.options.rootOutgroupDist:
            cmd += " --rootOutgroupDist %f" % self.options.rootOutgroupDist
            cmd += " --rootOutgroupPath %s" % self.options.rootOutgroupPath
        return cmd

    def writeXml(self):
        """Write the template XML files and create or reuse the project.

        Raises RuntimeError if a project already exists in the working
        directory and does not match the current input.
        """
        assert os.path.isdir(self.workingDir)
        configPath = os.path.abspath(
            os.path.join(self.workingDir, "config.xml"))
        expPath = os.path.abspath(
            os.path.join(self.workingDir, "expTemplate.xml"))
        self.expWrapper.setConfigPath(configPath)
        self.configWrapper.writeXML(configPath)
        self.expWrapper.writeXML(expPath)

        projPath = os.path.join(self.workingDir,
                                ProjectWrapper.alignmentDirName)
        if os.path.exists(projPath) and self.options.overwrite:
            system("rm -rf %s" % projPath)
        fixNames = 1 if self.options.outputMaf is True else 0

        if not os.path.exists(projPath):
            system(self._createProjectCmd(expPath, projPath, fixNames))
            return
        if not self.isSameAsExisting(expPath, projPath, fixNames):
            raise RuntimeError("Existing project %s not " % projPath +
                               "compatible with current input. Please "
                               "erase the working directory or rerun "
                               "with the --overwrite option to start "
                               "from scratch.")
        logPath = os.path.join(self.workingDir, 'cactus.log')
        with open(logPath, "a") as logFile:
            logFile.write("\nContinuing existing alignment. Use "
                          "--overwrite or erase the working directory to "
                          "force restart from scratch.\n")

    # create a project in a dummy directory. check if the
    # project xml is the same as the current project.
    # we do this to see if we should start fresh or try to
    # work with the existing project when the overwrite flag is off
    def isSameAsExisting(self, expPath, projPath, fixNames):
        """Return True if a freshly created project matches the one on disk.

        The comparison is line by line on the two project XML files, after
        replacing the temporary path with the existing one.  Returns False
        when ``projPath`` does not exist.
        """
        if not os.path.exists(projPath):
            return False
        oldPath = os.path.dirname(projPath + "/")
        tempPath = "%s_temp" % oldPath
        if os.path.exists(tempPath):
            system("rm -rf %s" % tempPath)
        system(self._createProjectCmd(expPath, tempPath, fixNames))
        projFilePathNew = os.path.join(
            tempPath, '%s_temp_project.xml' % self.alignmentDirName)
        projFilePathOld = os.path.join(
            oldPath, '%s_project.xml' % self.alignmentDirName)
        try:
            with open(projFilePathNew, "r") as newHandle:
                newFile = newHandle.readlines()
            with open(projFilePathOld, "r") as oldHandle:
                oldFile = oldHandle.readlines()
            return len(newFile) == len(oldFile) and all(
                newLine.replace(tempPath, oldPath) == oldLine
                for newLine, oldLine in zip(newFile, oldFile))
        finally:
            # always remove the dummy project, even if reading failed
            system("rm -rf %s" % tempPath)
class ProjectWrapper:
    """Prepares the config and experiment templates of a progressive
    alignment and creates (or re-validates) the project in ``workingDir``."""

    # name of the project sub-directory created inside the working directory
    alignmentDirName = 'progressiveAlignment'

    def __init__(self, options, seqFile, workingDir):
        self.options = options
        self.seqFile = seqFile
        self.workingDir = workingDir
        self.configWrapper = None
        self.expWrapper = None
        self.processConfig()
        self.processExperiment()

    def processConfig(self):
        """Load the config XML and apply the command line options to it."""
        # read in the default right out of cactus
        if self.options.configFile is not None:
            configPath = self.options.configFile
        else:
            configPath = os.path.join(cactusRootPath(),
                                      "cactus_progressive_config.xml")
        configXml = ET.parse(configPath).getroot()
        self.configWrapper = ConfigWrapper(configXml)
        # here we can go through the options and apply some to the config
        self.configWrapper.setBuildHal(True)
        self.configWrapper.setBuildFasta(True)
        if self.options.outputMaf is not None:
            self.configWrapper.setBuildMaf(True)
            self.configWrapper.setJoinMaf(True)
        # pre-emptively turn down maxParallelSubtree for singleMachine
        # mode if not enough threads are provided to support it. Probably
        # need to do something for other ?combined? batch systems?
        if self.options.batchSystem == 'singleMachine' and \
                self.options.database == 'kyoto_tycoon':
            maxThreads = int(self.options.maxThreads)
            if maxThreads < self.configWrapper.getMaxParallelSubtrees() * 3:
                # floor division so the value stays an int on Python 3 too
                self.configWrapper.setMaxParallelSubtrees(
                    max(1, maxThreads // 3))
        # this is a little hack to effectively toggle back to the
        # non-progressive version of cactus (as published in Gen. Res. 2011)
        # from the high-level interface.
        if self.options.legacy is True:
            # sys.maxint only exists on Python 2
            self.configWrapper.setSubtreeSize(
                getattr(sys, "maxint", sys.maxsize))

    def processExperiment(self):
        """Build the experiment template and the sequence output directory."""
        expXml = self.seqFile.toXMLElement()
        # create the cactus disk
        cdElem = ET.SubElement(expXml, "cactus_disk")
        database = self.options.database
        assert database == "kyoto_tycoon" or database == "tokyo_cabinet"
        confElem = ET.SubElement(cdElem, "st_kv_database_conf")
        confElem.attrib["type"] = database
        ET.SubElement(confElem, database)
        self.expWrapper = ExperimentWrapper(expXml)

        if self.options.database == "kyoto_tycoon":
            self.expWrapper.setDbPort(str(self.options.ktPort))
            if self.options.ktHost is not None:
                self.expWrapper.setDbHost(self.options.ktHost)
            if self.options.ktType == 'memory':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(False)
            elif self.options.ktType == 'snapshot':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(True)
            else:
                assert self.options.ktType == 'disk'
                self.expWrapper.setDbInMemory(False)
                self.expWrapper.setDbSnapshot(False)
            # options.ktOpts is deliberately not passed on: sonlib doesn't
            # allow for spaces in attributes in the db conf, which renders
            # that option useless
            if self.options.ktCreateTuning is not None:
                self.expWrapper.setDbCreateTuningOptions(
                    self.options.ktCreateTuning)
            if self.options.ktOpenTuning is not None:
                self.expWrapper.setDbReadTuningOptions(
                    self.options.ktOpenTuning)

        # set the sequence output directory
        outSeqDir = os.path.join(self.workingDir, "sequenceData")
        if os.path.exists(outSeqDir) and self.options.overwrite:
            system("rm -rf %s" % outSeqDir)
        if not os.path.exists(outSeqDir):
            system("mkdir %s" % outSeqDir)
        self.expWrapper.setOutputSequenceDir(outSeqDir)

    def _createProjectCmd(self, expPath, projPath, fixNames):
        """Return the cactus_createMultiCactusProject.py command line.

        Both paths are double-quoted so that paths containing spaces survive
        the shell.
        """
        cmd = "cactus_createMultiCactusProject.py \"%s\" \"%s\" --fixNames=%d" % (
            expPath, projPath, fixNames)
        if len(self.seqFile.outgroups) > 0:
            cmd += " --outgroupNames " + ",".join(self.seqFile.outgroups)
        if self.options.rootOutgroupDists:
            cmd += " --rootOutgroupDists %s" % self.options.rootOutgroupDists
            cmd += " --rootOutgroupPaths %s" % self.options.rootOutgroupPaths
        if self.options.root is not None:
            cmd += " --root %s" % self.options.root
        return cmd

    def writeXml(self):
        """Write the template XML files and create or reuse the project.

        Raises RuntimeError if a project already exists in the working
        directory and does not match the current input.
        """
        assert os.path.isdir(self.workingDir)
        configPath = absSymPath(os.path.join(self.workingDir, "config.xml"))
        expPath = absSymPath(
            os.path.join(self.workingDir, "expTemplate.xml"))
        self.expWrapper.setConfigPath(configPath)
        self.configWrapper.writeXML(configPath)
        self.expWrapper.writeXML(expPath)

        projPath = os.path.join(self.workingDir,
                                ProjectWrapper.alignmentDirName)
        if os.path.exists(projPath) and self.options.overwrite:
            system("rm -rf %s" % projPath)
        fixNames = 1 if self.options.outputMaf is True else 0

        if not os.path.exists(projPath):
            system(self._createProjectCmd(expPath, projPath, fixNames))
            return
        if not self.isSameAsExisting(expPath, projPath, fixNames):
            raise RuntimeError("Existing project %s not " % projPath +
                               "compatible with current input. Please "
                               "erase the working directory or rerun "
                               "with the --overwrite option to start "
                               "from scratch.")
        logPath = os.path.join(self.workingDir, 'cactus.log')
        with open(logPath, "a") as logFile:
            logFile.write("\nContinuing existing alignment. Use "
                          "--overwrite or erase the working directory to "
                          "force restart from scratch.\n")

    # create a project in a dummy directory. check if the
    # project xml is the same as the current project.
    # we do this to see if we should start fresh or try to
    # work with the existing project when the overwrite flag is off
    def isSameAsExisting(self, expPath, projPath, fixNames):
        """Return True if a freshly created project matches the one on disk.

        The comparison is line by line on the two project XML files, after
        replacing the temporary path with the existing one.  Returns False
        when ``projPath`` does not exist.
        """
        if not os.path.exists(projPath):
            return False
        oldPath = os.path.dirname(projPath + "/")
        tempPath = "%s_temp" % oldPath
        # Fix for relative directories
        if oldPath[0:2] == './':
            oldPath = oldPath[2:]
        if tempPath[0:2] == './':
            tempPath = tempPath[2:]
        if os.path.exists(tempPath):
            system("rm -rf %s" % tempPath)
        system(self._createProjectCmd(expPath, tempPath, fixNames))
        projFilePathNew = os.path.join(
            tempPath, '%s_temp_project.xml' % self.alignmentDirName)
        projFilePathOld = os.path.join(
            oldPath, '%s_project.xml' % self.alignmentDirName)
        try:
            with open(projFilePathNew, "r") as newHandle:
                newFile = newHandle.readlines()
            with open(projFilePathOld, "r") as oldHandle:
                oldFile = oldHandle.readlines()
            return len(newFile) == len(oldFile) and all(
                newLine.replace(tempPath, oldPath) == oldLine
                for newLine, oldLine in zip(newFile, oldFile))
        finally:
            # always remove the dummy project, even if reading failed
            system("rm -rf %s" % tempPath)
def cactusPrepare(options, project):
    """ annotate a SeqFile with ancestral names as well as paths for output sequences.

    Reads ``options.seqFile`` and ``options.configFile``; unless
    ``options.wdl`` is set, creates ``options.outDir``.  Writes the annotated
    seqfile to ``options.outSeqFile`` (when given) and prints the plan
    returned by ``get_plan``.  ``options.configFile`` and
    ``options.cactusOptions`` may be updated.

    Raises RuntimeError if the output directory cannot be created.
    """
    # read the input
    seqFile = SeqFile(options.seqFile)
    configNode = ET.parse(options.configFile).getroot()
    config = ConfigWrapper(configNode)

    if not options.wdl:
        # prepare output sequence directory
        # todo: support remote (ie s3) output directory
        try:
            os.makedirs(options.outDir)
        except OSError:
            # Typically the directory already exists.  Any real failure is
            # reported by the isdir() check just below.
            pass
        if not os.path.isdir(options.outDir):
            raise RuntimeError('Unable to create output sequence directory \'{}\''.format(options.outDir))
        if not os.access(options.outDir, os.W_OK):
            logger.warning('Output sequence directory is not writeable: \'{}\''.format(options.outDir))

    if options.preprocessOnly or options.gpu:
        if options.preprocessOnly:
            # hack the configfile to skip preprocessing and write it to the output dir
            config.removePreprocessors()
        if options.gpu:
            # hack the configfile to toggle on gpu lastz
            cafNode = findRequiredNode(config.xmlRoot, "caf")
            cafNode.attrib["gpuLastz"] = "true"
            # realigning doesn't mix well with lastz so we make sure it's off
            # https://github.com/ComparativeGenomicsToolkit/cactus/issues/271
            cafNode.attrib["realign"] = "0"
        options.configFile = os.path.join(options.outDir, 'config-prepared.xml')
        sys.stderr.write("configuration saved in {}\n".format(options.configFile))
        config.writeXML(options.configFile)

    # pass through the config file to the options
    # todo (don't like second hard-code check of .xml path)
    defaultConfig = os.path.join(cactusRootPath(), "cactus_progressive_config.xml")
    if options.configFile != defaultConfig and not options.wdl:
        options.cactusOptions += ' --configFile {}'.format(options.configFile)

    # get the ancestor names
    tree = MultiCactusTree(seqFile.tree)
    tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())

    # make the output (copies, so the input seqFile keeps its original paths)
    outSeqFile = SeqFile()
    outSeqFile.tree = tree
    outSeqFile.pathMap = copy.deepcopy(seqFile.pathMap)
    outSeqFile.outgroups = copy.deepcopy(seqFile.outgroups)

    # update paths for preprocessed leaves or inferred ancestors
    for node in outSeqFile.tree.breadthFirstTraversal():
        name = outSeqFile.tree.getName(node)
        leaf = outSeqFile.tree.isLeaf(node)
        # leaves are always redirected; internal nodes only when they have no
        # input path and we are doing more than preprocessing
        if leaf or (name not in seqFile.pathMap and not options.preprocessOnly):
            out_basename = seqFile.pathMap[name] if name in seqFile.pathMap else '{}.fa'.format(name)
            outSeqFile.pathMap[name] = os.path.join(options.outDir, os.path.basename(out_basename))
            if options.wdl:
                # uniquify name in wdl to prevent collisions
                outSeqFile.pathMap[name] += '.pp'

    # write the output
    if options.outSeqFile:
        with open(options.outSeqFile, 'w') as out_sf:
            out_sf.write(str(outSeqFile))

    # write the instructions
    print(get_plan(options, project, seqFile, outSeqFile))
def make_align_job(options, toil):
    """Import the inputs of a cactus-align run and return its root job.

    Creates a temporary ``options.cactusDir``, builds the progressive cactus
    project in it, imports the sequences, outgroup fragments, config and
    alignment files through ``toil.importFile`` and wraps
    ``run_cactus_align`` into a job.

    ``options`` is modified: ``cactusDir`` is always set, ``seqFile`` is
    replaced when path overrides are given and ``root`` is filled in from the
    tree when empty.

    Raises RuntimeError if ``options.acyclic`` names a genome that is not a
    leaf of the tree.
    """
    options.cactusDir = getTempDirectory()

    # apply path overrides. this was necessary for wdl which doesn't take kindly to
    # text files of local paths (ie seqfile). one way to fix would be to add support
    # for s3 paths and force wdl to use it. a better way would be a more fundamental
    # interface shift away from files of paths throughout all of cactus
    if options.pathOverrides:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        tree = MultiCactusTree(seqFile.tree)
        tree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        for name, override in zip(options.pathOverrideNames,
                                  options.pathOverrides):
            seqFile.pathMap[name] = override
        override_seq = os.path.join(options.cactusDir, 'seqFile.override')
        with open(override_seq, 'w') as out_sf:
            out_sf.write(str(seqFile))
        options.seqFile = override_seq

    if not options.root:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        mcTree = MultiCactusTree(seqFile.tree)
        mcTree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        options.root = mcTree.getRootName()

    if options.acyclic:
        seqFile = SeqFile(options.seqFile)
        tree = MultiCactusTree(seqFile.tree)
        leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
        if options.acyclic not in leaves:
            raise RuntimeError(
                "Genome specified with --acyclic, {}, not found in tree leaves"
                .format(options.acyclic))

    # to be consistent with all-in-one cactus, we make sure the project
    # isn't limiting itself to the subtree (todo: parameterize so root can
    # be passed through from prepare to blast/align)
    proj_options = copy.deepcopy(options)
    proj_options.root = None
    # Create the progressive cactus project (as we do in runCactusProgressive)
    projWrapper = ProjectWrapper(proj_options,
                                 proj_options.configFile,
                                 ignoreSeqPaths=options.root)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    # open up the experiment (as we do in ProgressiveUp.run)
    # note that we copy the path into the options here
    experimentFile = project.expMap[options.root]
    expXml = ET.parse(experimentFile).getroot()
    experiment = ExperimentWrapper(expXml)
    configPath = experiment.getConfigPath()
    # result unused, but parsing fails early if the experiment's config is
    # missing or malformed
    configXml = ET.parse(configPath).getroot()

    tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
    leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
    outgroups = experiment.getOutgroupGenomes()

    # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
    def get_input_path(suffix=''):
        base_path = options.cigarsFile[0]
        for input_path in options.cigarsFile:
            if suffix and input_path.endswith(suffix):
                return input_path
            if os.path.basename(base_path).startswith(
                    os.path.basename(input_path)):
                base_path = input_path
        return base_path + suffix

    # import the outgroups
    outgroupIDs = []
    outgroup_fragment_found = False
    for i, outgroup in enumerate(outgroups):
        # Only the import itself is best-effort.  The bookkeeping and the
        # pangenome assertion are outside the try so they are not swallowed.
        try:
            outgroupID = toil.importFile(
                makeURL(get_input_path('.og_fragment_{}'.format(i))))
        except Exception:
            # we assume that input is not coming from cactus blast, so we'll treat output
            # sequences normally and not go looking for fragments
            outgroupIDs = []
            break
        outgroupIDs.append(outgroupID)
        experiment.setSequenceID(outgroup, outgroupID)
        outgroup_fragment_found = True
        assert not options.pangenome

    # import the sequences (that we need to align for the given event, ie leaves and outgroups)
    for genome, seq in list(project.inputSequenceMap.items()):
        if genome in leaves or (not outgroup_fragment_found
                                and genome in outgroups):
            if os.path.isdir(seq):
                # a directory of sequence files is concatenated into one file
                tmpSeq = getTempFile()
                catFiles(
                    [os.path.join(seq, subSeq) for subSeq in os.listdir(seq)],
                    tmpSeq)
                seq = tmpSeq
            seq = makeURL(seq)

            logger.info("Importing {}".format(seq))
            experiment.setSequenceID(genome, toil.importFile(seq))

    if not outgroup_fragment_found:
        outgroupIDs = [
            experiment.getSequenceID(outgroup) for outgroup in outgroups
        ]

    # write back the experiment, as CactusWorkflowArguments wants a path
    experiment.writeXML(experimentFile)

    # import cactus config
    if options.configFile:
        cactusConfigID = toil.importFile(makeURL(options.configFile))
    else:
        cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
    project.setConfigID(cactusConfigID)

    project.syncToFileStore(toil)
    configNode = ET.parse(project.getConfigPath()).getroot()
    configWrapper = ConfigWrapper(configNode)
    configWrapper.substituteAllPredefinedConstantsWithLiterals()

    if options.singleCopySpecies:
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["alignmentFilter"] = "singleCopyEvent:{}".format(
                options.singleCopySpecies)

    if options.barMaskFilter:
        findRequiredNode(
            configWrapper.xmlRoot,
            "bar").attrib["partialOrderAlignmentMaskFilter"] = str(
                options.barMaskFilter)

    if options.pangenome:
        cafNode = findRequiredNode(configWrapper.xmlRoot, "caf")
        barNode = findRequiredNode(configWrapper.xmlRoot, "bar")
        # turn off the megablock filter as it ruins non-all-to-all alignments
        cafNode.attrib["minimumBlockHomologySupport"] = "0"
        cafNode.attrib["minimumBlockDegreeToCheckSupport"] = "9999999999"
        # turn off mapq filtering
        cafNode.attrib["runMapQFiltering"] = "0"
        # more iterations here helps quite a bit to reduce underalignment
        cafNode.attrib["maxRecoverableChainsIterations"] = "50"
        # turn down minimum block degree to get a fat ancestor
        barNode.attrib["minimumBlockDegree"] = "1"
        # turn on POA
        barNode.attrib["partialOrderAlignment"] = "1"
        # save it
        if not options.batch:
            pg_file = options.outHal + ".pg-conf.xml"
            is_s3 = pg_file.startswith('s3://')
            # s3 targets are written locally first, then uploaded
            pg_temp_file = getTempFile() if is_s3 else pg_file
            configWrapper.writeXML(pg_temp_file)
            if is_s3:
                write_s3(pg_temp_file,
                         pg_file,
                         region=get_aws_region(options.jobStore))
            logger.info("pangenome configuration overrides saved in {}".format(
                pg_file))

    workFlowArgs = CactusWorkflowArguments(options,
                                           experimentFile=experimentFile,
                                           configNode=configNode,
                                           seqIDMap=project.inputSequenceIDMap)

    # import the files that cactus-blast made
    workFlowArgs.alignmentsID = toil.importFile(makeURL(get_input_path()))
    workFlowArgs.secondaryAlignmentsID = None
    if not options.pafInput:
        try:
            workFlowArgs.secondaryAlignmentsID = toil.importFile(
                makeURL(get_input_path('.secondary')))
        except Exception:
            # the secondary alignments file is optional
            pass
    workFlowArgs.outgroupFragmentIDs = outgroupIDs
    workFlowArgs.ingroupCoverageIDs = []
    if outgroup_fragment_found and len(outgroups) > 0:
        # one coverage file per leaf, looked up by leaf index
        for i in range(len(leaves)):
            workFlowArgs.ingroupCoverageIDs.append(
                toil.importFile(
                    makeURL(get_input_path('.ig_coverage_{}'.format(i)))))

    align_job = Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              checkpointInfo=options.checkpointInfo,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput,
                              pafSecondaries=options.usePafSecondaries,
                              doVG=options.outVG,
                              doGFA=options.outGFA,
                              delay=options.stagger,
                              eventNameAsID=options.eventNameAsID,
                              acyclicEvent=options.acyclic)
    return align_job
def createFileStructure(mcProj, expTemplate, configTemplate, options):
    """Write the project XML and one experiment/config XML pair per event.

    The project file is written to
    ``<options.path>/<options.name>_project.xml``.  For every entry of
    ``mcProj.expMap`` a deep copy of ``expTemplate`` gets its own database
    directory and name, reference path and optional HAL outputs, and is
    updated with the spanning tree of the event's children and outgroups.
    With a "kyoto_tycoon" database each event also gets the template port
    plus a running offset.

    ``expTemplate.seqMap`` is modified in place: each event, and each
    outgroup not yet present, gets a sequence path entry.
    """
    rootDir = options.path
    if not os.path.exists(rootDir):
        os.makedirs(rootDir)
    mcProj.writeXML(os.path.join(rootDir, "%s_project.xml" % options.name))

    # Register the reference sequence path of every event up front.
    seqMap = expTemplate.seqMap
    for event in mcProj.expMap:
        seqMap[event] = os.path.join(rootDir, event, event + '.fa')

    nextPortOffset = 0
    for event, expFile in mcProj.expMap.items():
        eventDir = os.path.join(rootDir, event)
        childNames = mcProj.entireTree.getChildNames(event)
        experiment = copy.deepcopy(expTemplate)

        # Get outgroups, registering a path for those without a sequence yet
        ogNames = []
        wantOutgroups = configTemplate.getOutgroupStrategy() != 'none'
        if wantOutgroups and event in mcProj.outgroup.ogMap:
            for ogName, _ogDist in mcProj.outgroup.ogMap[event]:
                if ogName not in seqMap:
                    seqMap[ogName] = os.path.join(rootDir, ogName,
                                                  refFileName(ogName))
                ogNames.append(ogName)

        # Get subtree connecting children + outgroups
        assert len(childNames) > 0
        spanningTree = mcProj.entireTree.extractSpanningTree(
            childNames + ogNames)

        # The database lives under the template's db dir when one is set,
        # otherwise under the event directory.
        templateDbDir = expTemplate.getDbDir()
        if templateDbDir is None:
            dbRoot = eventDir
        else:
            dbRoot = os.path.abspath(templateDbDir)
        experiment.setDbDir(os.path.join(dbRoot, event, "%s_DB" % event))

        isKyotoTycoon = expTemplate.getDbType() == "kyoto_tycoon"
        if isKyotoTycoon and os.path.splitext(event)[1] != ".kch":
            experiment.setDbName("%s.kch" % event)
        else:
            experiment.setDbName(event)
        if isKyotoTycoon:
            # every event gets its own port, counted up from the template's
            experiment.setDbPort(expTemplate.getDbPort() + nextPortOffset)
            nextPortOffset += 1
            templateHost = expTemplate.getDbHost()
            if templateHost is not None:
                experiment.setDbHost(templateHost)

        experiment.setReferencePath(os.path.join(eventDir, event + '.fa'))
        if configTemplate.getBuildHal() == True:
            experiment.setHALPath(os.path.join(eventDir, "%s_hal.c2h" % event))
        if configTemplate.getBuildFasta() == True:
            experiment.setHALFastaPath(
                os.path.join(eventDir, "%s_hal.fa" % event))
        experiment.updateTree(spanningTree, seqMap, ogNames)
        experiment.setConfigPath(
            os.path.join(eventDir, "%s_config.xml" % event))

        dbDir = experiment.getDbDir()
        if not os.path.exists(dbDir):
            os.makedirs(dbDir)
        if not os.path.exists(eventDir):
            os.makedirs(eventDir)
        experiment.writeXML(expFile)

        eventConfig = ConfigWrapper(copy.deepcopy(configTemplate.xmlRoot))
        eventConfig.setReferenceName(event)
        eventConfig.verifyMinBlockDegree(experiment)
        eventConfig.writeXML(experiment.getConfigPath())