def createMCProject(tree, experiment, config, options):
    """Create a properly initialized MultiCactusProject.

    TODO: This should really all be in the constructor for MultiCactusProject.

    Args:
        tree: input species tree; wrapped into a MultiCactusTree.
        experiment: experiment wrapper providing genomes and sequence IDs.
        config: config wrapper providing project-level settings.
        options: parsed command-line options (reads .path, .root,
            .outgroupNames).

    Returns:
        The initialized MultiCactusProject.

    Raises:
        RuntimeError: if options.root names a node not present in the tree.
    """
    mcTree = MultiCactusTree(tree)
    mcTree.nameUnlabeledInternalNodes(config.getDefaultInternalNodePrefix())
    mcTree.computeSubtreeRoots()
    mcProj = MultiCactusProject()
    # record the input sequence ID for every genome that has a sequence
    for genome in experiment.getGenomesWithSequence():
        mcProj.inputSequenceMap[genome] = experiment.getSequenceID(genome)
    mcProj.mcTree = mcTree
    if config.getDoSelfAlignment():
        mcTree.addSelfEdges()
    # one experiment XML per subtree root, laid out under options.path
    for name in mcProj.mcTree.getSubtreeRootNames():
        expPath = "%s/%s/%s_experiment.xml" % (options.path, name, name)
        mcProj.expMap[name] = os.path.abspath(expPath)
    alignmentRootId = mcProj.mcTree.getRootId()
    if options.root is not None:
        # was a bare "except:", which would also swallow KeyboardInterrupt /
        # SystemExit; narrowed to Exception
        try:
            alignmentRootId = mcProj.mcTree.getNodeId(options.root)
        except Exception:
            raise RuntimeError("Specified root name %s not found in tree" %
                               options.root)
    fillInOutgroups(mcProj, options.outgroupNames, config, alignmentRootId)
    # if necessary, we reroot the tree at the specified alignment root id. all leaf genomes
    # that are no longer in the tree, but still used as outgroups, are moved into special fields
    # so that we can remember to, say, get their paths for preprocessing.
    specifyAlignmentRoot(mcProj, experiment, alignmentRootId)
    return mcProj
def extractOutput(workDir, outputHalFile, options):
    """Extract the alignment results out of the work directory.

    If options.outputMaf is set, the root MAF is moved out of the alignment
    directory first.  Then cactus2hal.py is run (in the project environment)
    to export the HAL file, with progress appended to cactus.log.

    Args:
        workDir: working directory containing the alignment subdirectory.
        outputHalFile: destination path for the exported HAL file.
        options: parsed options; only .outputMaf is read here.
    """
    if options.outputMaf is not None:
        mcProj = MultiCactusProject()
        mcProj.readXML(
            os.path.join(workDir, ProjectWrapper.alignmentDirName,
                         ProjectWrapper.alignmentDirName + "_project.xml"))
        rootName = mcProj.mcTree.getRootName()
        rootPath = os.path.join(workDir, ProjectWrapper.alignmentDirName,
                                rootName, rootName + '.maf')
        cmd = 'mv %s %s' % (rootPath, options.outputMaf)
        system(cmd)
    envFile = getEnvFilePath()
    logFile = os.path.join(workDir, 'cactus.log')
    pjPath = os.path.join(workDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    # use context managers so the log handle is closed even if write() raises
    with open(logFile, "a") as logHandle:
        logHandle.write("\n\n%s: Beginning HAL Export\n\n" % str(
            datetime.datetime.now()))
    cmd = '. %s && cactus2hal.py %s %s >> %s 2>&1' % (envFile, pjPath,
                                                      outputHalFile, logFile)
    system(cmd)
    with open(logFile, "a") as logHandle:
        logHandle.write("\n%s: Finished HAL Export \n" % str(
            datetime.datetime.now()))
def __pollKtServers(self):
    """Poll every kyoto tycoon server referenced by the project.

    Rebuilds self.curKtservers from the experiments listed in the project
    file, pinging each primary and secondary database.  If the live-server
    set is non-empty and unchanged since the previous poll, accumulate
    self.pollTime into self.sameKtserversTime; otherwise reset the baseline.
    """
    self.curKtservers = set()
    try:
        mc = MultiCactusProject()
        mc.readXML(self.projectPath)
        for eventName, expPath in mc.expMap.items():
            exp = ExperimentWrapper(ET.parse(expPath).getroot())
            # best-effort ping of the primary db; bare "except:" narrowed to
            # Exception so Ctrl-C is not swallowed
            try:
                if pingKtServer(exp):
                    self.curKtservers.add("%s_%s:%s" % (
                        eventName, exp.getDbHost(), str(exp.getDbPort())))
            except Exception:
                pass
            # best-effort ping of the secondary db, if one is configured
            try:
                secElem = exp.getSecondaryDBElem()
                if secElem is not None and pingKtServer(secElem):
                    self.curKtservers.add("%s_secondary_%s:%s" % (
                        eventName, secElem.getDbHost(),
                        str(secElem.getDbPort())))
            except Exception:
                pass
    except Exception:
        # any failure reading the project means we know of no live servers
        self.curKtservers = set()
    if len(self.prevKtservers) > 0 and len(self.curKtservers) > 0 and \
       self.curKtservers == self.prevKtservers:
        self.sameKtserversTime += self.pollTime
    else:
        self.prevKtservers = set(self.curKtservers)
        self.sameKtserversTime = 0
def __pollKtServers(self):
    """Poll every kyoto tycoon server referenced by the project.

    Rebuilds self.curKtservers from the experiments listed in the project
    file, pinging each primary and secondary database.  If the live-server
    set is non-empty and unchanged since the previous poll, accumulate
    self.pollTime into self.sameKtserversTime; otherwise reset the baseline.
    """
    self.curKtservers = set()
    try:
        mc = MultiCactusProject()
        mc.readXML(self.projectPath)
        for eventName, expPath in mc.expMap.items():
            exp = ExperimentWrapper(ET.parse(expPath).getroot())
            # best-effort ping of the primary db; bare "except:" narrowed to
            # Exception so Ctrl-C is not swallowed
            try:
                if pingKtServer(exp):
                    self.curKtservers.add(
                        "%s_%s:%s" % (eventName, exp.getDbHost(),
                                      str(exp.getDbPort())))
            except Exception:
                pass
            # best-effort ping of the secondary db, if one is configured
            try:
                secElem = exp.getSecondaryDBElem()
                if secElem is not None and pingKtServer(secElem):
                    self.curKtservers.add("%s_secondary_%s:%s" %
                                          (eventName, secElem.getDbHost(),
                                           str(secElem.getDbPort())))
            except Exception:
                pass
    except Exception:
        # any failure reading the project means we know of no live servers
        self.curKtservers = set()
    if len(self.prevKtservers) > 0 and len(self.curKtservers) > 0 and \
       self.curKtservers == self.prevKtservers:
        self.sameKtserversTime += self.pollTime
    else:
        self.prevKtservers = set(self.curKtservers)
        self.sameKtserversTime = 0
def main():
    """Test utility: compute an outgroup assignment for a project and draw
    the resulting DAG to a graphviz .dot file.

    Takes two positional arguments: the project XML path and the output
    .dot path.  Falls back to a plain-text dump of the outgroup map if
    graphviz output fails.
    """
    usage = "usage: %prog <project> <output graphviz .dot file>"
    description = "TEST: draw the outgroup DAG"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("--justLeaves", dest="justLeaves", action="store_true",
                      default=False, help="Assign only leaves as outgroups")
    parser.add_option("--threshold", dest="threshold", type='int',
                      default=None, help="greedy threshold")
    parser.add_option("--numOutgroups", dest="maxNumOutgroups",
                      help="Maximum number of outgroups to provide", type=int)
    parser.add_option("--dynamic", help="Use new dynamic programming"
                      " algorithm", action="store_true", default=False)
    options, args = parser.parse_args()
    if len(args) != 2:
        parser.print_help()
        raise RuntimeError("Wrong number of arguments")
    proj = MultiCactusProject()
    proj.readXML(args[0])
    if not options.dynamic:
        # greedy strategy, optionally restricted to leaf candidates
        outgroup = GreedyOutgroup()
        outgroup.importTree(proj.mcTree)
        if options.justLeaves:
            candidates = set(
                [proj.mcTree.getName(x) for x in proj.mcTree.getLeaves()])
        else:
            candidates = None
        outgroup.greedy(threshold=options.threshold, candidateSet=candidates,
                        candidateChildFrac=1.1,
                        maxNumOutgroups=options.maxNumOutgroups)
    else:
        # dynamic programming strategy needs the input sequence map
        outgroup = DynamicOutgroup()
        outgroup.importTree(proj.mcTree, proj.getInputSequenceMap())
        outgroup.compute(options.maxNumOutgroups)
    try:
        NX.drawing.nx_agraph.write_dot(outgroup.dag, args[1])
    except Exception as e:
        # graphviz/pygraphviz may be unavailable; dump a readable fallback
        print "NetworkX failed: %s" % str(e)
        print "Writing ogMap in non-graphviz format"
        with open(args[1], "w") as f:
            for node, ogs in outgroup.ogMap.items():
                f.write("%s -> %s\n" % (node, str(ogs)))
    return 0
def readProject(self, projectXmlPath):
    """Load a multi-cactus project and process every leaf genome's sequences.

    Reads the project XML at projectXmlPath, labels any unnamed internal
    tree nodes, resets self.nameMap, and feeds each non-directory sequence
    file of every leaf event to self.processSequence.
    """
    project = MultiCactusProject()
    project.readXML(projectXmlPath)
    project.mcTree.nameUnlabeledInternalNodes()
    self.nameMap = dict()
    tree = project.mcTree
    for leafId in tree.getLeaves():
        event = tree.getName(leafId)
        for seqFile in fileList(project.sequencePath(event)):
            # skip subdirectories; only plain files are processed
            if os.path.isdir(seqFile):
                continue
            self.processSequence(event, seqFile)
def main():
    """Test utility: compute an outgroup assignment for a project and draw
    the resulting DAG to a graphviz .dot file.

    Takes two positional arguments: the project XML path and the output
    .dot path.  Falls back to a plain-text dump of the outgroup map if
    graphviz output fails.
    """
    usage = "usage: %prog <project> <output graphviz .dot file>"
    description = "TEST: draw the outgroup DAG"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("--justLeaves", dest="justLeaves", action="store_true",
                      default=False, help="Assign only leaves as outgroups")
    parser.add_option("--threshold", dest="threshold", type='int',
                      default=None, help="greedy threshold")
    parser.add_option("--numOutgroups", dest="maxNumOutgroups",
                      help="Maximum number of outgroups to provide", type=int)
    parser.add_option("--dynamic", help="Use new dynamic programming"
                      " algorithm", action="store_true", default=False)
    options, args = parser.parse_args()
    if len(args) != 2:
        parser.print_help()
        raise RuntimeError("Wrong number of arguments")
    proj = MultiCactusProject()
    proj.readXML(args[0])
    if not options.dynamic:
        # greedy strategy, optionally restricted to leaf candidates
        outgroup = GreedyOutgroup()
        outgroup.importTree(proj.mcTree)
        if options.justLeaves:
            candidates = set([proj.mcTree.getName(x)
                              for x in proj.mcTree.getLeaves()])
        else:
            candidates = None
        outgroup.greedy(threshold=options.threshold, candidateSet=candidates,
                        candidateChildFrac=1.1,
                        maxNumOutgroups=options.maxNumOutgroups)
    else:
        # dynamic programming strategy needs the input sequence map
        outgroup = DynamicOutgroup()
        outgroup.importTree(proj.mcTree, proj.getInputSequenceMap())
        outgroup.compute(options.maxNumOutgroups)
    try:
        NX.drawing.nx_agraph.write_dot(outgroup.dag, args[1])
    except Exception as e:
        # graphviz/pygraphviz may be unavailable; dump a readable fallback
        print "NetworkX failed: %s" % str(e)
        print "Writing ogMap in non-graphviz format"
        with open(args[1], "w") as f:
            for node, ogs in outgroup.ogMap.items():
                f.write("%s -> %s\n" % (node, str(ogs)))
    return 0
def main():
    """Test utility: build an alignment schedule from a project file.

    Takes two positional arguments: the project XML path and the output
    schedule path.
    """
    parser = OptionParser(
        usage="usage: %prog <project> <output graphviz .dot file>",
        description="TEST: create schedule from project file")
    options, args = parser.parse_args()
    if len(args) != 2:
        parser.print_help()
        raise RuntimeError("Wrong number of arguments")
    project = MultiCactusProject()
    project.readXML(args[0])
    sched = Schedule()
    sched.loadProject(project)
    sched.compute()
    sched.writeToFile(args[1])
def addRow(self, catName, params, jobTreeStatsPath, mafCompPath,
           treeStatsPath, projPath):
    """Parse one run's statistics files and append a summary row to self.table.

    Silently does nothing unless all three stats files exist.

    Args:
        catName: category name (currently unused; kept for interface
            compatibility with callers).
        params: parameter object providing asRow().
        jobTreeStatsPath: path to the jobTree stats XML.
        mafCompPath: path to the mafComparator XML.
        treeStatsPath: path to the tree stats XML.
        projPath: optional project XML path; None skips growth stats input.
    """
    if not (os.path.isfile(jobTreeStatsPath) and os.path.isfile(mafCompPath)
            and os.path.isfile(treeStatsPath)):
        return
    mafXmlRoot = ET.parse(mafCompPath).getroot()
    jtXmlRoot = ET.parse(jobTreeStatsPath).getroot()
    tsXmlRoot = ET.parse(treeStatsPath).getroot()
    if projPath is not None:
        project = MultiCactusProject()
        project.readXML(projPath)
    else:
        project = None
    row = params.asRow()
    row.extend(self.__jtStats(jtXmlRoot))
    row.extend(self.__totalAggregate(mafXmlRoot))
    row.extend(self.__growthStats(project))
    row.extend(self.__cactusTreeStats(tsXmlRoot))
    row.extend(self.__speciesAggregate(mafXmlRoot))
    # removed unused local: rowstring = str(row)
    self.table.append(row)
def run(self):
    """Jobtree target entry point: launch a progressive cactus alignment.

    Loads the multi-cactus project named by self.args[0], adds a child
    target that preprocesses all input sequences, then sets a
    ProgressiveDown follow-on that walks the computed schedule starting at
    self.options.event (defaulting to the tree root).
    """
    #Load the multi-cactus project
    project = MultiCactusProject()
    project.readXML(self.args[0])
    #Create jobs to create the output sequences
    configNode = ET.parse(project.getConfigPath()).getroot()
    #This is necessary..
    ConfigWrapper(configNode).substituteAllPredefinedConstantsWithLiterals()
    #Create the preprocessor
    self.addChildTarget(CactusPreprocessor(
        project.getInputSequencePaths(),
        CactusPreprocessor.getOutputSequenceFiles(
            project.getInputSequencePaths(),
            project.getOutputSequenceDir()),
        configNode))
    #Now build the progressive-down target
    schedule = Schedule()
    schedule.loadProject(project)
    schedule.compute()
    # NOTE(review): '== None' should normally be 'is None'
    if self.options.event == None:
        self.options.event = project.mcTree.getRootName()
    assert self.options.event in project.expMap
    leafNames = [project.mcTree.getName(i)
                 for i in project.mcTree.getLeaves()]
    self.options.globalLeafEventSet = set(leafNames)
    self.setFollowOnTarget(ProgressiveDown(self.options, project,
                                           self.options.event, schedule))
def updateProject(path):
    """Rewrite a project file after its directory tree has been relocated.

    For every experiment referenced by the project XML at `path`, rebases
    the experiment path, db dir, reference path, HAL path and HAL fasta
    path onto the project's new base directory, updates the db host for
    kyoto tycoon experiments, backs up each experiment file to *.old, and
    finally writes the updated project XML back in place.

    Raises:
        RuntimeError: if a relocated experiment file cannot be found.
    """
    mcProj = MultiCactusProject()
    mcProj.readXML(path)
    # was: basePath, name = os.path.split(path) — 'name' was immediately
    # shadowed by the loop variable below, so only the dirname is needed
    basePath = os.path.dirname(path)
    for name, oldPath in mcProj.expMap.items():
        # rebase <basePath>/<last dir component>/<file name>
        fileName = os.path.basename(oldPath)
        dirName = os.path.dirname(oldPath).rpartition('/')[2]
        newPath = os.path.join(basePath, dirName, fileName)
        if not os.path.isfile(newPath):
            raise RuntimeError("Experiment file %s not found\n" % newPath)
        mcProj.expMap[name] = newPath
        exp = ExperimentWrapper(ET.parse(newPath).getroot())
        # each stored path keeps its tail starting at the event name and is
        # re-anchored under the new base directory
        oldDbDir = exp.getDbDir()
        if oldDbDir is not None:
            dbDirName = oldDbDir[oldDbDir.find(name):]
            newDbDir = os.path.join(basePath, dbDirName)
            exp.setDbDir(newDbDir)
        oldRefPath = exp.getReferencePath()
        if oldRefPath is not None:
            refName = oldRefPath[oldRefPath.find(name):]
            newRefPath = os.path.join(basePath, refName)
            exp.setReferencePath(newRefPath)
        oldHalPath = exp.getHALPath()
        if oldHalPath is not None:
            halName = oldHalPath[oldHalPath.find(name):]
            newHalPath = os.path.join(basePath, halName)
            exp.setHALPath(newHalPath)
        oldHalFastaPath = exp.getHALFastaPath()
        if oldHalFastaPath is not None:
            halFastaName = oldHalFastaPath[oldHalFastaPath.find(name):]
            newHalFastaPath = os.path.join(basePath, halFastaName)
            exp.setHALFastaPath(newHalFastaPath)
        # seems to have dissappeared from experiment?
        #oldMafPath = exp.getMAFPath()
        #if oldMafPath is not None:
        #    mafName = oldMafPath[oldMafPath.find(name):]
        #    newMafPath = os.path.join(basePath, mafName)
        #    exp.setMAFPath(newMafPath)
        if exp.getDbType() == "kyoto_tycoon":
            # ktservers will be restarted on this machine
            oldHostName = exp.getDbHost()
            if oldHostName is not None:
                newHostName = socket.gethostname()
                exp.setDbHost(newHostName)
        # keep a backup of the experiment file before overwriting it
        system("cp %s %s.old" % (newPath, newPath))
        exp.writeXML(newPath)
    mcProj.writeXML(path)
def main():
    """cactus-prepare entry point: decompose an alignment problem into
    suggested preprocessor and cactus commands.

    Builds a temporary progressive cactus project from the seq file and
    hands it to cactusPrepare along with the parsed options.
    """
    parser = ArgumentParser()
    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outSeqDir", help='Directory where the processed leaf sequence and ancestral sequences will be placed')
    parser.add_argument("outSeqFile", help="Path for annotated Seq file output")
    parser.add_argument("outputHal", type=str, help="Output HAL file")
    parser.add_argument("--configFile", default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--preprocessBatchSize", type=int, default=3,
                        help="size (number of genomes) of suggested preprocessing jobs")
    parser.add_argument("--jobStore", type=str, default="$JOBSTORE",
                        help="jobstore to use in suggested commands")
    parser.add_argument("--halOptions", type=str, default="--hdf5InMemory",
                        help="options for every hal command")
    parser.add_argument("--cactusOptions", type=str, default="--realTimeLogging --logInfo",
                        help="options for every cactus command")
    parser.add_argument("--preprocessOnly", action="store_true",
                        help="only decompose into preprocessor and cactus jobs")
    options = parser.parse_args()
    options.database = 'kyoto_tycoon'
    #todo support root option
    options.root = None
    # need to go through this garbage (copied from the main() in progressive_cactus) to
    # come up with the project
    options.cactusDir = getTempDirectory()
    #Create the progressive cactus project
    projWrapper = ProjectWrapper(options, options.configFile)
    projWrapper.writeXml()
    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)
    project = MultiCactusProject()
    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)
    project.readXML(pjPath)
    enableDumpStack()
    cactusPrepare(options, project)
def main():
    """Toil-based progressive cactus entry point.

    Parses workflow options, tunes several Toil defaults that interact
    badly with cactus, builds the progressive cactus project, imports the
    input sequences and config into the Toil file store, runs (or restarts)
    the preprocess-then-progressive-down workflow, and exports the
    resulting HAL file.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)
    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outputHal", type=str, help="Output HAL file")
    #Progressive Cactus Options
    parser.add_argument("--database", dest="database",
                        help="Database type: tokyo_cabinet or kyoto_tycoon"
                        " [default: %(default)s]",
                        default="kyoto_tycoon")
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=None)
    parser.add_argument(
        "--root", dest="root", help="Name of ancestral node (which"
        " must appear in NEWICK tree in <seqfile>) to use as a "
        "root for the alignment. Any genomes not below this node "
        "in the tree may be used as outgroups but will never appear"
        " in the output. If no root is specifed then the root"
        " of the tree is used. ", default=None)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest, locally-built docker container "
                        "rather than pulling from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)
    options = parser.parse_args()
    options.cactusDir = getTempDirectory()
    setupBinaries(options)
    setLoggingFromOptions(options)
    # Mess with some toil options to create useful defaults.
    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    # The default deadlockWait is currently 60 seconds. This can cause
    # issues if the database processes take a while to actually begin
    # after they're issued. Change it to at least an hour so that we
    # don't preemptively declare a deadlock.
    if options.deadlockWait is None or options.deadlockWait < 3600:
        options.deadlockWait = 3600
    if options.retryCount is None:
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5
    #Create the progressive cactus project
    projWrapper = ProjectWrapper(options)
    projWrapper.writeXml()
    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)
    project = MultiCactusProject()
    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)
    with Toil(options) as toil:
        importSingularityImage()
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            project.readXML(pjPath)
            #import the sequences
            seqIDs = []
            for seq in project.getInputSequencePaths():
                if os.path.isdir(seq):
                    # concatenate a directory of fastas into one temp file
                    tmpSeq = getTempFile()
                    catFiles([os.path.join(seq, subSeq)
                              for subSeq in os.listdir(seq)], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                seqIDs.append(toil.importFile(seq))
            project.setInputSequenceIDs(seqIDs)
            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            logger.info("Setting config id to: %s" % cactusConfigID)
            project.setConfigID(cactusConfigID)
            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()
            project.writeXML(pjPath)
            halID = toil.start(
                RunCactusPreprocessorThenProgressiveDown(
                    options, project,
                    memory=configWrapper.getDefaultMemory()))
        toil.exportFile(halID, makeURL(options.outputHal))
def main():
    """Append every experiment's cactus subtree into one HAL file.

    Traverses the project tree breadth-first (optionally from the subtree
    rooted at args['event']) and shells out to halAppendCactusSubtree for
    each experiment, forwarding the cache/chunk/deflate tuning flags from
    the parsed arguments.  Prints timing totals when done.
    """
    args = initParser()
    myProj = MultiCactusProject()
    myProj.readXML(args['cactus_project'])
    if not args['append']:
        # Overwrite existing hal
        print 'rm -f {0}'.format(args['HAL_file_path'])
        system('rm -f {0}'.format(args['HAL_file_path']))
    # some quick stats
    totalTime = time.time()
    totalAppendTime = 0
    # traverse tree to make sure we are going breadth-first
    tree = myProj.mcTree
    # find subtree if event specified
    event = args['event']
    rootNode = None
    if event is not None:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]
    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName in myProj.expMap:
            experimentFilePath = myProj.expMap[genomeName]
            experiment = ExperimentWrapper(
                ET.parse(experimentFilePath).getroot())
            outgroups = experiment.getOutgroupEvents()
            expTreeString = NXNewick().writeString(experiment.getTree())
            assert len(expTreeString) > 1
            assert experiment.getHALPath() is not None
            assert experiment.getHALFastaPath() is not None
            cmdline = "time halAppendCactusSubtree \'{0}\' \'{1}\' \'{2}\' \'{3}\'".format(
                experiment.getHALPath(), experiment.getHALFastaPath(),
                expTreeString, args['HAL_file_path'])
            # forward the optional tuning flags when they were supplied
            if len(outgroups) > 0:
                cmdline += " --outgroups {0}".format(",".join(outgroups))
            if args["cacheBytes"] is not None:
                cmdline += " --cacheBytes {0}".format(args["cacheBytes"])
            if args["cacheMDC"] is not None:
                cmdline += " --cacheMDC {0}".format(args["cacheMDC"])
            if args["cacheRDC"] is not None:
                cmdline += " --cacheRDC {0}".format(args["cacheRDC"])
            if args["cacheW0"] is not None:
                cmdline += " --cacheW0 {0}".format(args["cacheW0"])
            if args["chunk"] is not None:
                cmdline += " --chunk {0}".format(args["chunk"])
            if args["deflate"] is not None:
                cmdline += " --deflate {0}".format(args["deflate"])
            if args["inMemory"] is True:
                cmdline += " --inMemory"
            print cmdline
            appendTime = time.time()
            system(cmdline)
            appendTime = time.time() - appendTime
            totalAppendTime += appendTime
            # print "time of above command: {0:.2f}".format(appendTime)
    totalTime = time.time() - totalTime
    print "total time: {0:.2f} total halAppendCactusSubtree time: {1:.2f}".format(
        totalTime, totalAppendTime)
def runCactusBlastOnly(options):
    """Run only the trimming/blast phase of progressive cactus.

    Builds a progressive cactus project for the event named by
    options.root, imports just the sequences needed for that event (its
    leaves plus outgroups) into the Toil file store, runs the standalone
    CactusTrimmingBlastPhase, and exports the primary alignments (plus
    optional secondary alignments, outgroup fragments and ingroup
    coverages) next to options.outputFile.
    """
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            # NOTE(review): the restart branch binds alignmentID but the
            # export code below uses outWorkFlowArgs — confirm the restart
            # path is actually expected to produce exports.
            alignmentID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(options, options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()
            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)
            project = MultiCactusProject()
            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)
            project.readXML(pjPath)
            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            logger.info("Experiment {}".format(ET.tostring(expXml)))
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()
            seqIDMap = dict()
            # restrict to the genomes relevant to this event: the root's
            # children plus the experiment's outgroups
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(
                options.root)
            leaves = tree.getChildNames(tree.getRootName())
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)
            logger.info("Genomes in blastonly, {}: {}".format(
                options.root, list(genome_set)))
            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in genome_set:
                    if os.path.isdir(seq):
                        # concatenate a directory of fastas into one temp file
                        tmpSeq = getTempFile()
                        catFiles([os.path.join(seq, subSeq)
                                  for subSeq in os.listdir(seq)], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    project.inputSequenceIDMap[genome] = toil.importFile(seq)
                else:
                    # out-of-scope sequences will only cause trouble later on
                    del project.inputSequenceMap[genome]
            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)
            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()
            workFlowArgs = CactusWorkflowArguments(
                options, experimentFile=experimentFile, configNode=configNode,
                seqIDMap=project.inputSequenceIDMap)
            outWorkFlowArgs = toil.start(
                CactusTrimmingBlastPhase(standAlone=True,
                                         cactusWorkflowArguments=workFlowArgs,
                                         phaseName="trimBlast"))
            # export the alignments
            toil.exportFile(outWorkFlowArgs.alignmentsID,
                            makeURL(options.outputFile))
            # optional secondary alignments
            if outWorkFlowArgs.secondaryAlignmentsID:
                toil.exportFile(outWorkFlowArgs.secondaryAlignmentsID,
                                makeURL(options.outputFile) + '.secondary')
            # outgroup fragments and coverage are necessary for cactus-align, as the sequence names got changed in the above alignemnts
            for i, outgroupFragmentID in enumerate(
                    outWorkFlowArgs.outgroupFragmentIDs):
                toil.exportFile(
                    outgroupFragmentID,
                    makeURL(options.outputFile) + '.og_fragment_{}'.format(i))
            # cactus-align can recompute coverage on the fly, but we save them because we have them
            for i, ingroupCoverageID in enumerate(
                    outWorkFlowArgs.ingroupCoverageIDs):
                toil.exportFile(
                    ingroupCoverageID,
                    makeURL(options.outputFile) + '.ig_coverage_{}'.format(i))
def runCactusProgressive(options):
    """Run the full progressive cactus workflow under Toil.

    Builds the progressive cactus project, imports every input sequence and
    the cactus config into the Toil file store, runs (or restarts) the
    preprocess-then-progressive-down workflow, and exports the resulting
    HAL file to options.outputHal.
    """
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            #Create the progressive cactus project
            projWrapper = ProjectWrapper(options, options.configFile)
            projWrapper.writeXml()
            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)
            project = MultiCactusProject()
            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)
            project.readXML(pjPath)
            #import the sequences
            for genome, seq in list(project.inputSequenceMap.items()):
                if os.path.isdir(seq):
                    # concatenate a directory of fastas into one temp file
                    tmpSeq = getTempFile()
                    catFiles([os.path.join(seq, subSeq)
                              for subSeq in os.listdir(seq)], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                project.inputSequenceIDMap[genome] = toil.importFile(seq)
            #import cactus config
            # NOTE(review): unlike the main() variants in this file there is
            # no fallback to project.getConfigPath() here — this appears to
            # require options.configFile to be set; confirm with callers.
            cactusConfigID = toil.importFile(makeURL(options.configFile))
            project.setConfigID(cactusConfigID)
            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()
            project.writeXML(pjPath)
            halID = toil.start(
                RunCactusPreprocessorThenProgressiveDown(
                    options, project,
                    memory=configWrapper.getDefaultMemory()))
        toil.exportFile(halID, makeURL(options.outputHal))
def main():
    """Toil-based progressive cactus entry point (container-aware variant).

    Parses workflow options (including --containerImage), tunes several
    Toil defaults that interact badly with cactus, builds the progressive
    cactus project, imports the input sequences and config into the Toil
    file store, runs (or restarts) the workflow, and exports the HAL file.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)
    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outputHal", type=str, help="Output HAL file")
    #Progressive Cactus Options
    parser.add_argument("--database", dest="database",
                        help="Database type: tokyo_cabinet or kyoto_tycoon"
                        " [default: %(default)s]",
                        default="kyoto_tycoon")
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=None)
    parser.add_argument("--root", dest="root", help="Name of ancestral node (which"
                        " must appear in NEWICK tree in <seqfile>) to use as a "
                        "root for the alignment. Any genomes not below this node "
                        "in the tree may be used as outgroups but will never appear"
                        " in the output. If no root is specifed then the root"
                        " of the tree is used. ", default=None)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)
    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    # Mess with some toil options to create useful defaults.
    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    # The default deadlockWait is currently 60 seconds. This can cause
    # issues if the database processes take a while to actually begin
    # after they're issued. Change it to at least an hour so that we
    # don't preemptively declare a deadlock.
    if options.deadlockWait is None or options.deadlockWait < 3600:
        options.deadlockWait = 3600
    if options.retryCount is None:
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5
    with Toil(options) as toil:
        importSingularityImage()
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            #Create the progressive cactus project
            projWrapper = ProjectWrapper(options)
            projWrapper.writeXml()
            pjPath = os.path.join(options.cactusDir,
                                  ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)
            project = MultiCactusProject()
            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)
            project.readXML(pjPath)
            #import the sequences
            seqIDs = []
            print "Importing %s sequences" % (len(project.getInputSequencePaths()))
            for seq in project.getInputSequencePaths():
                if os.path.isdir(seq):
                    # concatenate a directory of fastas into one temp file
                    tmpSeq = getTempFile()
                    catFiles([os.path.join(seq, subSeq)
                              for subSeq in os.listdir(seq)], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                seqIDs.append(toil.importFile(seq))
            project.setInputSequenceIDs(seqIDs)
            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)
            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()
            project.writeXML(pjPath)
            halID = toil.start(RunCactusPreprocessorThenProgressiveDown(
                options, project, memory=configWrapper.getDefaultMemory()))
        toil.exportFile(halID, makeURL(options.outputHal))
def createMCProject(tree, experiment, config, options):
    """Create and initialize a MultiCactusProject (older variant that
    computes the outgroup assignment inline).

    Wraps the tree, registers per-subtree experiment paths, selects an
    outgroup strategy from the config ('greedy', 'greedyLeaves',
    'greedyPreference', 'dynamic' or 'none'), and reroots at options.root
    if given.

    Raises:
        RuntimeError: if options.root is not in the tree, or the outgroup
            strategy is unrecognized.
    """
    mcTree = MultiCactusTree(tree, config.getSubtreeSize())
    mcTree.nameUnlabeledInternalNodes(config.getDefaultInternalNodePrefix())
    mcTree.computeSubtreeRoots()
    mcProj = MultiCactusProject()
    mcProj.mcTree = mcTree
    # copy so later mutation of the experiment's list can't affect us
    mcProj.inputSequences = experiment.getSequences()[:]
    if config.getDoSelfAlignment():
        mcTree.addSelfEdges()
    # one experiment XML per subtree root, laid out under options.path
    for name in mcProj.mcTree.getSubtreeRootNames():
        expPath = "%s/%s/%s_experiment.xml" % (options.path, name, name)
        mcProj.expMap[name] = os.path.abspath(expPath)
    alignmentRootId = mcProj.mcTree.getRootId()
    if options.root is not None:
        try:
            alignmentRootId = mcProj.mcTree.getNodeId(options.root)
        except:
            raise RuntimeError("Specified root name %s not found in tree" %
                               options.root)
    mcProj.outgroup = None
    if config.getOutgroupStrategy() == 'greedy':
        # use the provided outgroup candidates, or use all outgroups
        # as candidates if none are given
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        mcProj.outgroup.greedy(
            threshold=config.getOutgroupThreshold(),
            candidateSet=options.outgroupNames,
            candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
            maxNumOutgroups=config.getMaxNumOutgroups())
    elif config.getOutgroupStrategy() == 'greedyLeaves':
        # use all leaves as outgroups, unless outgroup candidates are given
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        ogSet = options.outgroupNames
        if ogSet is None:
            ogSet = set(
                [mcProj.mcTree.getName(x) for x in mcProj.mcTree.getLeaves()])
        mcProj.outgroup.greedy(threshold=config.getOutgroupThreshold(),
                               candidateSet=ogSet,
                               candidateChildFrac=2.0,
                               maxNumOutgroups=config.getMaxNumOutgroups())
    elif config.getOutgroupStrategy() == 'greedyPreference':
        # prefer the provided outgroup candidates, if any, but use
        # other nodes as "filler" if we can't find enough.
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        mcProj.outgroup.greedy(
            threshold=config.getOutgroupThreshold(),
            candidateSet=options.outgroupNames,
            candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
            maxNumOutgroups=config.getMaxNumOutgroups())
        mcProj.outgroup.greedy(
            threshold=config.getOutgroupThreshold(),
            candidateSet=None,
            candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
            maxNumOutgroups=config.getMaxNumOutgroups())
    elif config.getOutgroupStrategy() == 'dynamic':
        # dynamic programming algorithm that exactly optimizes probability
        # that base in target node aligns to at least one base in the
        # outgroup set. Caveats are that it only returns leaves, and
        # the model used for optimization is super naive. Still, it does
        # some things better than greedy approaches such as properly account
        # for phylogenetic redundancy, as well as try to factor assembly
        # size/quality automatically.
        mcProj.outgroup = DynamicOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree,
                                   mcProj.getInputSequenceMap(),
                                   alignmentRootId,
                                   candidateSet=options.outgroupNames)
        mcProj.outgroup.compute(maxNumOutgroups=config.getMaxNumOutgroups())
    elif config.getOutgroupStrategy() != 'none':
        raise RuntimeError("Could not understand outgroup strategy %s" %
                           config.getOutgroupStrategy())
    # if necessary, we reroot the tree at the specified alignment root id. all leaf genomes
    # that are no longer in the tree, but still used as outgroups, are moved into special fields
    # so that we can remember to, say, get their paths for preprocessing.
    specifyAlignmentRoot(mcProj, alignmentRootId)
    return mcProj
def main():
    """Append each subtree alignment of a MultiCactusProject into one HAL file.

    Reads the project XML named by the parsed arguments, optionally removes
    any pre-existing HAL file (unless --append), then walks the project tree
    breadth-first and runs halAppendCactusSubtree for every event that has an
    experiment, forwarding the various HDF5 cache/chunk options.
    """
    args = initParser()
    myProj = MultiCactusProject()
    myProj.readXML(args['cactus_project'])

    if not args['append']:
        # Overwrite existing hal
        # (was a Python-2 `print` statement; converted to print() for
        # consistency with the rest of this Python-3 file)
        print('rm -f {0}'.format(args['HAL_file_path']))
        system('rm -f {0}'.format(args['HAL_file_path']))

    # some quick stats
    totalTime = time.time()
    totalAppendTime = 0

    # traverse tree to make sure we are going breadth-first
    tree = myProj.mcTree

    # find subtree if event specified
    event = args['event']
    rootNode = None
    if event is not None:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]

    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName in myProj.expMap:
            experimentFilePath = myProj.expMap[genomeName]
            print(experimentFilePath)
            experiment = ExperimentWrapper(ET.parse(experimentFilePath).getroot())

            outgroups = experiment.getOutgroupEvents()
            expTreeString = NXNewick().writeString(experiment.getTree(onlyThisSubtree=True))
            assert len(expTreeString) > 1
            assert experiment.getHALPath() is not None
            assert experiment.getHALFastaPath() is not None

            # NOTE(review): cmdline is run through a shell via system(); paths
            # are assumed not to contain quotes — verify inputs are trusted.
            cmdline = "time halAppendCactusSubtree \'{0}\' \'{1}\' \'{2}\' \'{3}\'".format(
                experiment.getHALPath(), experiment.getHALFastaPath(),
                expTreeString, args['HAL_file_path'])
            if len(outgroups) > 0:
                cmdline += " --outgroups {0}".format(",".join(outgroups))
            if args["cacheBytes"] is not None:
                cmdline += " --cacheBytes {0}".format(args["cacheBytes"])
            if args["cacheMDC"] is not None:
                cmdline += " --cacheMDC {0}".format(args["cacheMDC"])
            if args["cacheRDC"] is not None:
                cmdline += " --cacheRDC {0}".format(args["cacheRDC"])
            if args["cacheW0"] is not None:
                cmdline += " --cacheW0 {0}".format(args["cacheW0"])
            if args["chunk"] is not None:
                cmdline += " --chunk {0}".format(args["chunk"])
            if args["deflate"] is not None:
                cmdline += " --deflate {0}".format(args["deflate"])
            if args["inMemory"] is True:
                cmdline += " --inMemory"

            print(cmdline)
            appendTime = time.time()
            system(cmdline)
            appendTime = time.time() - appendTime
            totalAppendTime += appendTime
            # print("time of above command: {0:.2f}".format(appendTime))

    totalTime = time.time() - totalTime
    print("total time: {0:.2f} total halAppendCactusSubtree time: {1:.2f}".format(
        totalTime, totalAppendTime))
def main():
    """Entry point for cactus-prepare (WDL/command-list variant).

    Parses command-line options, normalizes/validates them (output paths,
    WDL-only constraints, per-phase core/mem/disk defaults, GPU type),
    builds a progressive cactus project under a temp directory, and hands
    it to cactusPrepare().
    """
    parser = ArgumentParser()
    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("--outDir", help='Directory where the processed leaf sequence and ancestral sequences will be placed.'
                        ' Required when not using --wdl')
    parser.add_argument("--outSeqFile", help="Path for annotated Seq file output [default: outDir/seqFile]")
    parser.add_argument("--outHal", help="Output HAL file [default: outDir/out.hal]")
    parser.add_argument("--wdl", action="store_true", help="output wdl workflow instead of list of commands")
    parser.add_argument("--noLocalInputs", action="store_true", help="dont embed local input paths in WDL script (as they will need"
                        " to be respecified when running on Terra")
    parser.add_argument("--configFile", default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--preprocessBatchSize", type=int, default=3, help="size (number of genomes) of suggested preprocessing jobs")
    parser.add_argument("--jobStore", type=str, default="./jobstore", help="base directory of jobStores to use in suggested commands")
    parser.add_argument("--halOptions", type=str, default="--hdf5InMemory", help="options for every hal command")
    parser.add_argument("--cactusOptions", type=str, default="--realTimeLogging --logInfo", help="options for every cactus command")
    parser.add_argument("--preprocessOnly", action="store_true", help="only decompose into preprocessor and cactus jobs")
    parser.add_argument("--dockerImage", type=str, help="docker image to use as wdl runtime")
    parser.add_argument("--gpu", action="store_true", help="use gpu-enabled lastz in cactus-blast")
    parser.add_argument("--gpuType", default="nvidia-tesla-v100", help="GPU type (to set in WDL runtime parameters)")
    # BUG FIX: was declared without type=int, so a user-supplied value parsed
    # as str and the later `options.gpuCount > 1` comparison raised TypeError.
    parser.add_argument("--gpuCount", type=int, default=1, help="GPU count (to set in WDL runtime parameters)")
    parser.add_argument("--nvidiaDriver", default="440.64.00", help="Nvidia driver version")
    parser.add_argument("--gpuZone", default="us-central1-c", help="zone used for gpu task")
    parser.add_argument("--zone", default="us-west2-a", help="zone used for all but gpu tasks")
    parser.add_argument("--defaultCores", type=int, help="Number of cores for each job unless otherwise specified")
    parser.add_argument("--preprocessCores", type=int, help="Number of cores for each cactus-preprocess job")
    parser.add_argument("--blastCores", type=int, help="Number of cores for each cactus-blast job")
    parser.add_argument("--alignCores", type=int, help="Number of cores for each cactus-align job")
    parser.add_argument("--defaultMem", type=float, help="Memory in GB for each job unless otherwise specified")
    parser.add_argument("--preprocessMem", type=float, help="Memory in GB for each cactus-preprocess job")
    parser.add_argument("--blastMem", type=float, help="Memory in GB for each cactus-blast job")
    parser.add_argument("--alignMem", type=float, help="Memory in GB for each cactus-align job")
    parser.add_argument("--defaultDisk", type=int, help="Disk in GB for each job unless otherwise specified")
    parser.add_argument("--preprocessDisk", type=int, help="Disk in GB for each cactus-preprocess job")
    parser.add_argument("--blastDisk", type=int, help="Disk in GB for each cactus-blast job")
    parser.add_argument("--alignDisk", type=int, help="Disk in GB for each cactus-align job")
    parser.add_argument("--halAppendDisk", type=int, help="Disk in GB for each halAppendSubtree job")
    parser.add_argument("--preprocessPreemptible", type=int, help="Preemptible in GB for each cactus-preprocess job [default=2]", default=2)
    parser.add_argument("--blastPreemptible", type=int, help="Preemptible in GB for each cactus-blast job [default=1]", default=1)
    parser.add_argument("--alignPreemptible", type=int, help="Preemptible in GB for each cactus-align job [default=1]", default=1)
    parser.add_argument("--halAppendPreemptible", type=int, help="Preemptible in GB for each halAppendSubtree job [default=1]", default=1)

    options = parser.parse_args()
    options.database = 'kyoto_tycoon'
    #todo support root option
    options.root = None

    if not options.wdl:
        if not options.outDir:
            raise RuntimeError("--outDir option required when not using --wdl")
        if not options.outSeqFile:
            options.outSeqFile = os.path.join(options.outDir, os.path.basename(options.seqFile))
            if os.path.abspath(options.seqFile) == os.path.abspath(options.outSeqFile):
                options.outSeqFile += '.1'

    if (not options.wdl or not options.gpu) and (options.gpuCount > 1 or options.gpuType != "nvidia-tesla-v100"):
        raise RuntimeError("--gpuType and gpuCount can only be used with --wdl --gpu")

    if not options.outHal:
        options.outHal = os.path.join(options.outDir if options.outDir else '', 'out.hal')

    if options.wdl:
        if options.preprocessBatchSize != 1:
            if options.preprocessBatchSize != 3:
                # hacky way to only warn for non-default
                sys.stderr.write("Warning: --preprocessBatchSize reset to 1 for --wdl support\n")
            options.preprocessBatchSize = 1
        # wdl handles output file structure
        if options.outDir:
            sys.stderr.write("Warning: --outDir option ignored with --wdl\n")
        options.outDir = "."
        if options.outSeqFile:
            sys.stderr.write("Warning: --outSeqFile option ignored with --wdl\n")
            options.outSeqFile = None
        if options.preprocessOnly:
            raise RuntimeError('--preprocessOnly cannot be used in conjunction with --wdl')
        if not options.dockerImage:
            options.dockerImage = getDockerImage()

    # apply defaults
    if options.defaultCores:
        if not options.preprocessCores:
            options.preprocessCores = options.defaultCores
        if not options.blastCores:
            options.blastCores = options.defaultCores
        if not options.alignCores:
            options.alignCores = options.defaultCores
    if options.defaultMem:
        if not options.preprocessMem:
            options.preprocessMem = options.defaultMem
        if not options.blastMem:
            options.blastMem = options.defaultMem
        if not options.alignMem:
            options.alignMem = options.defaultMem
    if not options.alignCores or options.alignCores == 1:
        if options.alignCores == 1:
            sys.stderr.write("Warning: --alignCores changed from 1 to 2\n")
        options.alignCores = 2
    if options.defaultDisk:
        if not options.preprocessDisk:
            options.preprocessDisk = options.defaultDisk
        if not options.blastDisk:
            options.blastDisk = options.defaultDisk
        if not options.alignDisk:
            options.alignDisk = options.defaultDisk
        if not options.halAppendDisk:
            options.halAppendDisk = options.defaultDisk

    # https://cromwell.readthedocs.io/en/stable/RuntimeAttributes/#gpucount-gputype-and-nvidiadriverversion
    # note: k80 not included as WGA_GPU doesn't run on it.
    acceptable_gpus = ['nvidia-tesla-v100', 'nvidia-tesla-p100', 'nvidia-tesla-p4', 'nvidia-tesla-t4']
    if options.gpuType not in acceptable_gpus:
        raise RuntimeError('--gpuType {} not supported by Terra. Acceptable types are {}'.format(
            options.gpuType, acceptable_gpus))

    # need to go through this garbage (copied from the main() in progressive_cactus) to
    # come up with the project
    options.cactusDir = getTempDirectory()
    #Create the progressive cactus project
    projWrapper = ProjectWrapper(options, options.configFile)
    projWrapper.writeXml()

    # used to unique jobstore
    options.jobStoreCount = 0

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()
    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)
    project.readXML(pjPath)

    enableDumpStack()
    cactusPrepare(options, project)
def main(toil_mode=False):
    """Entry point for cactus-prepare (toil-aware variant).

    When ``toil_mode`` is True, Toil runner options are added and the
    WDL-only options are suppressed; otherwise the WDL/command-list options
    are exposed. Options are parsed and normalized, a progressive cactus
    project is built under a temp directory, and cactusPrepare() is invoked.
    """
    parser = ArgumentParser()
    if toil_mode:
        Job.Runner.addToilOptions(parser)
        parser.add_argument("--latest", dest="latest", action="store_true",
                            help="Use the latest version of the docker container "
                            "rather than pulling one matching this version of cactus")
        parser.add_argument("--containerImage", dest="containerImage", default=None,
                            help="Use the the specified pre-built containter image "
                            "rather than pulling one from quay.io")
        parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                            help="The way to run the Cactus binaries (at top level; use --cactusOpts to set it in nested calls)",
                            default=None)
    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("--outDir", help='Directory where the processed leaf sequence and ancestral sequences will be placed.'
                        ' Required when not using --wdl')
    parser.add_argument("--outSeqFile", help="Path for annotated Seq file output [default: outDir/seqFile]")
    parser.add_argument("--outHal", help="Output HAL file [default: outDir/out.hal]", required=toil_mode)
    if not toil_mode:
        parser.add_argument("--wdl", action="store_true", help="output wdl workflow instead of list of commands")
        parser.add_argument("--noLocalInputs", action="store_true", help="dont embed local input paths in WDL script (as they will need"
                            " to be respecified when running on Terra")
        parser.add_argument("--jobStore", type=str, default="./jobstore", help="base directory of jobStores to use in suggested commands")
    parser.add_argument("--configFile", default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--preprocessBatchSize", type=int, default=3, help="size (number of genomes) of suggested preprocessing jobs")
    parser.add_argument("--halOptions", type=str, default="--hdf5InMemory", help="options for every hal command")
    parser.add_argument("--cactusOptions", type=str, default="--realTimeLogging --logInfo --retryCount 0", help="options for every cactus command")
    parser.add_argument("--preprocessOnly", action="store_true", help="only decompose into preprocessor and cactus jobs")
    parser.add_argument("--dockerImage", type=str, help="docker image to use as wdl runtime")
    parser.add_argument("--gpu", action="store_true", help="use gpu-enabled lastz in cactus-blast")
    parser.add_argument("--gpuType", default="nvidia-tesla-v100", help="GPU type (to set in WDL runtime parameters)")
    # BUG FIX: was declared without type=int, so a user-supplied value parsed
    # as str and the later `options.gpuCount > 1` comparison raised TypeError.
    parser.add_argument("--gpuCount", type=int, default=1, help="GPU count (to set in WDL runtime parameters)")
    parser.add_argument("--nvidiaDriver", default="440.64.00", help="Nvidia driver version")
    parser.add_argument("--gpuZone", default="us-central1-c", help="zone used for gpu task")
    parser.add_argument("--zone", default="us-west2-a", help="zone used for all but gpu tasks")
    if not toil_mode:
        parser.add_argument("--defaultCores", type=int, help="Number of cores for each job unless otherwise specified")
    parser.add_argument("--preprocessCores", type=int, help="Number of cores for each cactus-preprocess job")
    parser.add_argument("--blastCores", type=int, help="Number of cores for each cactus-blast job")
    parser.add_argument("--alignCores", type=int, help="Number of cores for each cactus-align job")
    if not toil_mode:
        parser.add_argument("--defaultMemory", type=human2bytesN,
                            help="Memory for each job unless otherwise specified. "
                            "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--preprocessMemory", type=human2bytesN,
                        help="Memory for each cactus-preprocess job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--blastMemory", type=human2bytesN,
                        help="Memory for each cactus-blast job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--alignMemory", type=human2bytesN,
                        help="Memory for each cactus-align job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    if not toil_mode:
        parser.add_argument("--defaultDisk", type=human2bytesN,
                            help="Disk for each job unless otherwise specified. "
                            "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--preprocessDisk", type=human2bytesN,
                        help="Disk for each cactus-preprocess job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--blastDisk", type=human2bytesN,
                        help="Disk for each cactus-blast job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--alignDisk", type=human2bytesN,
                        help="Disk for each cactus-align job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--halAppendDisk", type=human2bytesN,
                        help="Disk for each halAppendSubtree job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--preprocessPreemptible", type=int,
                        help="Preemptible attempt count for each cactus-preprocess job [default=2]", default=2)
    parser.add_argument("--blastPreemptible", type=int,
                        help="Preemptible attempt count for each cactus-blast job [default=1]", default=1)
    parser.add_argument("--alignPreemptible", type=int,
                        help="Preemptible attempt count for each cactus-align job [default=1]", default=1)
    parser.add_argument("--halAppendPreemptible", type=int,
                        help="Preemptible attempt count for each halAppendSubtree job [default=1]", default=1)
    parser.add_argument("--database", choices=["kyoto_tycoon", "redis"],
                        help="The type of database", default="kyoto_tycoon")

    options = parser.parse_args()
    #todo support root option
    options.root = None

    if toil_mode:
        options.wdl = False
        options.noLocalInputs = False
        options.outDir = '.'
        setupBinaries(options)
        # need to avoid nested container calls, so set toil-inside-toil jobs to local by default
        if "--binariesMode" not in options.cactusOptions:
            options.cactusOptions += " --binariesMode local"
        if options.jobStore.startswith('aws'):
            if not options.outHal.startswith('s3://'):
                raise RuntimeError("--outHal must be s3:// address when using s3 job store")
            if not has_s3:
                raise RuntimeError("S3 support requires toil to be installed with [aws]")
    options.toil = toil_mode

    if not options.wdl and not options.toil:
        if not options.outDir:
            raise RuntimeError("--outDir option required when not using --wdl")
        if not options.outSeqFile:
            options.outSeqFile = os.path.join(options.outDir, os.path.basename(options.seqFile))
            if os.path.abspath(options.seqFile) == os.path.abspath(options.outSeqFile):
                options.outSeqFile += '.1'

    if (not options.wdl or not options.gpu) and (options.gpuCount > 1 or options.gpuType != "nvidia-tesla-v100"):
        raise RuntimeError("--gpuType and gpuCount can only be used with --wdl --gpu")

    if not options.outHal:
        options.outHal = os.path.join(options.outDir if options.outDir else '', 'out.hal')

    if options.wdl:
        # wdl handles output file structure
        if options.outDir:
            sys.stderr.write("Warning: --outDir option ignored with --wdl\n")
        options.outDir = "."
        if options.outSeqFile:
            sys.stderr.write("Warning: --outSeqFile option ignored with --wdl\n")
            options.outSeqFile = None
        if options.preprocessOnly:
            raise RuntimeError('--preprocessOnly cannot be used in conjunction with --wdl')
        if not options.dockerImage:
            options.dockerImage = getDockerImage()

    # apply defaults
    if options.defaultCores:
        if not options.preprocessCores:
            options.preprocessCores = options.defaultCores
        if not options.blastCores:
            options.blastCores = options.defaultCores
        if not options.alignCores:
            options.alignCores = options.defaultCores
    if options.defaultMemory:
        if not options.preprocessMemory:
            options.preprocessMemory = options.defaultMemory
        if not options.blastMemory:
            options.blastMemory = options.defaultMemory
        if not options.alignMemory:
            options.alignMemory = options.defaultMemory
    if not options.alignCores or options.alignCores == 1:
        if options.alignCores == 1:
            sys.stderr.write("Warning: --alignCores changed from 1 to 2\n")
        options.alignCores = 2
    if options.defaultDisk:
        if not options.preprocessDisk:
            options.preprocessDisk = options.defaultDisk
        if not options.blastDisk:
            options.blastDisk = options.defaultDisk
        if not options.alignDisk:
            options.alignDisk = options.defaultDisk
        if not options.halAppendDisk:
            options.halAppendDisk = options.defaultDisk

    # todo: no reason not to support non-1 batch size, but mirror wdl logic for now
    if options.toil:
        if options.preprocessBatchSize != 1:
            if options.preprocessBatchSize != 3:
                # hacky way to only warn for non-default
                sys.stderr.write("Warning: --preprocessBatchSize reset to 1 for --wdl support\n")
            options.preprocessBatchSize = 1
        # todo: could also support this
        assert not options.preprocessOnly

    # https://cromwell.readthedocs.io/en/stable/RuntimeAttributes/#gpucount-gputype-and-nvidiadriverversion
    # note: k80 not included as WGA_GPU doesn't run on it.
    acceptable_gpus = ['nvidia-tesla-v100', 'nvidia-tesla-p100', 'nvidia-tesla-p4', 'nvidia-tesla-t4']
    if options.gpuType not in acceptable_gpus:
        raise RuntimeError('--gpuType {} not supported by Terra. Acceptable types are {}'.format(
            options.gpuType, acceptable_gpus))

    # need to go through this garbage (copied from the main() in progressive_cactus) to
    # come up with the project
    options.cactusDir = getTempDirectory()
    #Create the progressive cactus project
    projWrapper = ProjectWrapper(options, options.configFile)
    projWrapper.writeXml()

    # used to unique jobstore
    options.jobStoreCount = 0

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()
    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)
    project.readXML(pjPath)

    enableDumpStack()
    cactusPrepare(options, project)
def createMCProject(tree, experiment, config, options):
    """Create and initialize a MultiCactusProject from an experiment.

    Like the other createMCProject variant in this file, but also records
    the experiment's output sequence directory on the project. Builds the
    multi-cactus tree, maps subtree roots to experiment XML paths, resolves
    the alignment root, computes outgroups per the configured strategy, and
    reroots the tree at the alignment root.

    Raises RuntimeError if ``options.root`` is not in the tree or the
    outgroup strategy is unknown.
    """
    mcTree = MultiCactusTree(tree, config.getSubtreeSize())
    mcTree.nameUnlabeledInternalNodes(config.getDefaultInternalNodePrefix())
    mcTree.computeSubtreeRoots()
    mcProj = MultiCactusProject()
    mcProj.mcTree = mcTree
    mcProj.inputSequences = experiment.getSequences()[:]
    mcProj.outputSequenceDir = experiment.getOutputSequenceDir()
    if config.getDoSelfAlignment():
        mcTree.addSelfEdges()
    for name in mcProj.mcTree.getSubtreeRootNames():
        expPath = "%s/%s/%s_experiment.xml" % (options.path, name, name)
        mcProj.expMap[name] = os.path.abspath(expPath)
    alignmentRootId = mcProj.mcTree.getRootId()
    if options.root is not None:
        try:
            alignmentRootId = mcProj.mcTree.getNodeId(options.root)
        except Exception:
            # (dropped the unused `as e` binding)
            raise RuntimeError("Specified root name %s not found in tree" % options.root)

    mcProj.outgroup = None
    if config.getOutgroupStrategy() == 'greedy':
        # use the provided outgroup candidates, or use all outgroups
        # as candidates if none are given
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        mcProj.outgroup.greedy(threshold=config.getOutgroupThreshold(),
                               candidateSet=options.outgroupNames,
                               candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
                               maxNumOutgroups=config.getMaxNumOutgroups())
    elif config.getOutgroupStrategy() == 'greedyLeaves':
        # use all leaves as outgroups, unless outgroup candidates are given
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        ogSet = options.outgroupNames
        if ogSet is None:
            ogSet = set([mcProj.mcTree.getName(x) for x in mcProj.mcTree.getLeaves()])
        mcProj.outgroup.greedy(threshold=config.getOutgroupThreshold(),
                               candidateSet=ogSet,
                               candidateChildFrac=2.0,
                               maxNumOutgroups=config.getMaxNumOutgroups())
    elif config.getOutgroupStrategy() == 'greedyPreference':
        # prefer the provided outgroup candidates, if any, but use
        # other nodes as "filler" if we can't find enough.
        mcProj.outgroup = GreedyOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, alignmentRootId)
        mcProj.outgroup.greedy(threshold=config.getOutgroupThreshold(),
                               candidateSet=options.outgroupNames,
                               candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
                               maxNumOutgroups=config.getMaxNumOutgroups())
        # second pass with no candidate restriction fills remaining slots
        mcProj.outgroup.greedy(threshold=config.getOutgroupThreshold(),
                               candidateSet=None,
                               candidateChildFrac=config.getOutgroupAncestorQualityFraction(),
                               maxNumOutgroups=config.getMaxNumOutgroups())
    elif config.getOutgroupStrategy() == 'dynamic':
        # dynamic programming algorithm that exactly optimizes probability
        # that base in target node aligns to at least one base in the
        # outgroup set. Caveats are that it only returns leaves, and
        # the model used for optimization is super naive. Still, it does
        # some things better than greedy approaches such as properly account
        # for phylogenetic redundancy, as well as try to factor assembly
        # size/quality automatically.
        mcProj.outgroup = DynamicOutgroup()
        mcProj.outgroup.importTree(mcProj.mcTree, mcProj.getInputSequenceMap(),
                                   alignmentRootId, candidateSet=options.outgroupNames)
        mcProj.outgroup.compute(maxNumOutgroups=config.getMaxNumOutgroups())
    elif config.getOutgroupStrategy() != 'none':
        raise RuntimeError("Could not understand outgroup strategy %s" %
                           config.getOutgroupStrategy())

    # if necessary, we reroot the tree at the specified alignment root id. all leaf genomes
    # that are no longer in the tree, but still used as outgroups, are moved into special fields
    # so that we can remember to, say, get their paths for preprocessing.
    specifyAlignmentRoot(mcProj, alignmentRootId)
    return mcProj
def runCactusBlastOnly(options):
    """Run only the trimming/blast phase of cactus inside a Toil workflow.

    Builds a progressive cactus project for the requested root event,
    imports the relevant sequences and config into the Toil file store,
    runs CactusTrimmingBlastPhase, and exports the resulting alignments
    (plus optional secondary alignments, outgroup fragments, and ingroup
    coverage files) next to ``options.outputFile``.
    """
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            # BUG FIX: the restart result was previously bound to
            # `alignmentID`, leaving `outWorkFlowArgs` undefined for the
            # export code below and crashing every --restart run.
            outWorkFlowArgs = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides. this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile). one way to fix would be to add support
            # for s3 paths and force wdl to use it. a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames, options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir, 'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #to be consistent with all-in-one cactus, we make sure the project
            #isn't limiting itself to the subtree (todo: parameterize so root can
            #be passed through from prepare to blast/align)
            proj_options = copy.deepcopy(options)
            proj_options.root = None
            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(proj_options, proj_options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()
            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)
            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            logger.info("Experiment {}".format(ET.tostring(expXml)))
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()

            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
            leaves = tree.getChildNames(tree.getRootName())
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)
            logger.info("Genomes in blastonly, {}: {}".format(options.root, list(genome_set)))
            # (removed a leftover debug print of project.inputSequenceMap)

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in genome_set:
                    if os.path.isdir(seq):
                        # concatenate a directory of fasta files into one temp file
                        tmpSeq = getTempFile()
                        catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    project.inputSequenceIDMap[genome] = toil.importFile(seq)
                else:
                    # out-of-scope sequences will only cause trouble later on
                    del project.inputSequenceMap[genome]

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            workFlowArgs = CactusWorkflowArguments(options,
                                                   experimentFile=experimentFile,
                                                   configNode=configNode,
                                                   seqIDMap=project.inputSequenceIDMap)

            outWorkFlowArgs = toil.start(
                CactusTrimmingBlastPhase(standAlone=True,
                                         cactusWorkflowArguments=workFlowArgs,
                                         phaseName="trimBlast"))

        # export the alignments
        toil.exportFile(outWorkFlowArgs.alignmentsID, makeURL(options.outputFile))
        # optional secondary alignments
        if outWorkFlowArgs.secondaryAlignmentsID:
            toil.exportFile(outWorkFlowArgs.secondaryAlignmentsID,
                            makeURL(options.outputFile) + '.secondary')
        # outgroup fragments and coverage are necessary for cactus-align,
        # as the sequence names got changed in the above alignemnts
        for i, outgroupFragmentID in enumerate(outWorkFlowArgs.outgroupFragmentIDs):
            toil.exportFile(outgroupFragmentID,
                            makeURL(options.outputFile) + '.og_fragment_{}'.format(i))
        # cactus-align can recompute coverage on the fly, but we save them because we have them
        for i, ingroupCoverageID in enumerate(outWorkFlowArgs.ingroupCoverageIDs):
            toil.exportFile(ingroupCoverageID,
                            makeURL(options.outputFile) + '.ig_coverage_{}'.format(i))
def runCactusAfterBlastOnly(options):
    """Run the cactus alignment phase using pre-computed blast output.

    Reconstructs the project/experiment for ``options.root``, imports the
    sequences and the alignment files produced by cactus-blast (or plain
    alignments when --nonBlastInput is set), runs run_cactus_align via
    Toil, and exports the resulting HAL to ``options.outputHal``.
    """
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            # BUG FIX: the restart result was previously bound to
            # `alignmentID`, leaving `halID` undefined for the export
            # below and crashing every --restart run.
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(options, options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()
            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)
            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()

            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
            leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)

            # import the outgroups
            outgroupIDs = []
            cactus_blast_input = not options.nonBlastInput
            for i, outgroup in enumerate(outgroups):
                try:
                    outgroupID = toil.importFile(makeURL(options.blastOutput) +
                                                 '.og_fragment_{}'.format(i))
                    outgroupIDs.append(outgroupID)
                    experiment.setSequenceID(outgroup, outgroupID)
                except Exception:
                    # was a bare `except:`; narrowed so SystemExit/
                    # KeyboardInterrupt are not swallowed
                    if cactus_blast_input:
                        raise
                    # we assume that input is not coming from cactus blast, so we'll treat output
                    # sequences normally and not go looking for fragments
                    outgroupIDs = []
                    break

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in leaves or (not cactus_blast_input and genome in outgroups):
                    if os.path.isdir(seq):
                        # concatenate a directory of fasta files into one temp file
                        tmpSeq = getTempFile()
                        catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    experiment.setSequenceID(genome, toil.importFile(seq))

            if not cactus_blast_input:
                outgroupIDs = [experiment.getSequenceID(outgroup) for outgroup in outgroups]

            # write back the experiment, as CactusWorkflowArguments wants a path
            experiment.writeXML(experimentFile)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            workFlowArgs = CactusWorkflowArguments(options,
                                                   experimentFile=experimentFile,
                                                   configNode=configNode,
                                                   seqIDMap=project.inputSequenceIDMap)

            #import the files that cactus-blast made
            workFlowArgs.alignmentsID = toil.importFile(makeURL(options.blastOutput))
            try:
                workFlowArgs.secondaryAlignmentsID = toil.importFile(
                    makeURL(options.blastOutput) + '.secondary')
            except Exception:
                # secondary alignments are optional; fall back to None
                workFlowArgs.secondaryAlignmentsID = None
            workFlowArgs.outgroupFragmentIDs = outgroupIDs
            workFlowArgs.ingroupCoverageIDs = []
            if cactus_blast_input and len(outgroups) > 0:
                for i in range(len(leaves)):
                    workFlowArgs.ingroupCoverageIDs.append(
                        toil.importFile(makeURL(options.blastOutput) +
                                        '.ig_coverage_{}'.format(i)))

            halID = toil.start(Job.wrapJobFn(run_cactus_align, configWrapper,
                                             workFlowArgs, project, cactus_blast_input))

        # export the hal
        toil.exportFile(halID, makeURL(options.outputHal))
def runCactusAfterBlastOnly(options):
    """Run the "align" stage of progressive cactus on precomputed alignment
    input (``options.cigarsFile``), then export the resulting HAL file.

    Newer variant of the same-named function above: input files are located
    via ``get_input_path`` suffix lookup over ``options.cigarsFile`` instead
    of a single ``options.blastOutput`` prefix, and it supports path
    overrides, pangenome config tweaks, and PAF input.
    """
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            # apply path overrides. this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile). one way to fix would be to add support
            # for s3 paths and force wdl to use it. a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(
                    prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames,
                                          options.pathOverrides):
                    seqFile.pathMap[name] = override
                # rewrite the seqfile with the overridden paths and point
                # options at the rewritten copy
                override_seq = os.path.join(options.cactusDir, 'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #to be consistent with all-in-one cactus, we make sure the project
            #isn't limiting itself to the subtree (todo: parameterize so root can
            #be passed through from prepare to blast/align)
            proj_options = copy.deepcopy(options)
            proj_options.root = None
            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(proj_options, proj_options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()
            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)
            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()

            # NOTE(review): seqIDMap and genome_set are computed but not used
            # below — possibly vestigial; confirm before removing
            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(
                options.root)
            leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)

            # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
            def get_input_path(suffix=''):
                # prefer an explicitly-given file ending in `suffix`;
                # otherwise fall back to the shortest-named base path + suffix
                base_path = options.cigarsFile[0]
                for input_path in options.cigarsFile:
                    if suffix and input_path.endswith(suffix):
                        return input_path
                    if os.path.basename(base_path).startswith(
                            os.path.basename(input_path)):
                        base_path = input_path
                return base_path + suffix

            # import the outgroups
            outgroupIDs = []
            outgroup_fragment_found = False
            for i, outgroup in enumerate(outgroups):
                try:
                    outgroupID = toil.importFile(
                        makeURL(get_input_path('.og_fragment_{}'.format(i))))
                    outgroupIDs.append(outgroupID)
                    experiment.setSequenceID(outgroup, outgroupID)
                    outgroup_fragment_found = True
                    # fragments should never exist in pangenome mode
                    assert not options.pangenome
                except:
                    # we assume that input is not coming from cactus blast, so we'll treat output
                    # sequences normally and not go looking for fragments
                    outgroupIDs = []
                    break

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in leaves or (not outgroup_fragment_found
                                        and genome in outgroups):
                    if os.path.isdir(seq):
                        # a directory of fasta pieces: concatenate into one
                        # temp file before importing
                        tmpSeq = getTempFile()
                        catFiles([os.path.join(seq, subSeq)
                                  for subSeq in os.listdir(seq)], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    experiment.setSequenceID(genome, toil.importFile(seq))

            if not outgroup_fragment_found:
                outgroupIDs = [experiment.getSequenceID(outgroup)
                               for outgroup in outgroups]

            # write back the experiment, as CactusWorkflowArguments wants a path
            experiment.writeXML(experimentFile)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            if options.pangenome:
                # turn off the megablock filter as it ruins non-all-to-all alignments
                configWrapper.disableCafMegablockFilter()
                # the recoverable chains parameter does not seem to play nicely with star-like alignments either
                #configWrapper.disableRecoverableChains()

            workFlowArgs = CactusWorkflowArguments(options,
                                                   experimentFile=experimentFile,
                                                   configNode=configNode,
                                                   seqIDMap=project.inputSequenceIDMap)

            #import the files that cactus-blast made
            workFlowArgs.alignmentsID = toil.importFile(makeURL(get_input_path()))
            workFlowArgs.secondaryAlignmentsID = None
            if not options.pafInput:
                try:
                    workFlowArgs.secondaryAlignmentsID = toil.importFile(
                        makeURL(get_input_path('.secondary')))
                except:
                    # secondary alignments are optional; absence is tolerated
                    pass
            workFlowArgs.outgroupFragmentIDs = outgroupIDs
            workFlowArgs.ingroupCoverageIDs = []
            if outgroup_fragment_found and len(outgroups) > 0:
                # one coverage file per leaf genome
                for i in range(len(leaves)):
                    workFlowArgs.ingroupCoverageIDs.append(
                        toil.importFile(
                            makeURL(get_input_path(
                                '.ig_coverage_{}'.format(i)))))

            halID = toil.start(
                Job.wrapJobFn(run_cactus_align, configWrapper, workFlowArgs,
                              project, doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput))

        # export the hal
        toil.exportFile(halID, makeURL(options.outputHal))
def make_align_job(options, toil):
    """Build (but do not start) the ``run_cactus_align`` Toil job.

    Sets up the MultiCactusProject under a temp dir, imports sequences,
    alignments, outgroup fragments and coverage files into the Toil job
    store, applies command-line config overrides (single-copy filter, bar
    mask filter, pangenome tweaks) to the config XML, and returns the
    wrapped align job ready for ``toil.start``.

    Mutates ``options`` in place (cactusDir, seqFile, root).
    """
    options.cactusDir = getTempDirectory()

    # apply path overrides. this was necessary for wdl which doesn't take kindly to
    # text files of local paths (ie seqfile). one way to fix would be to add support
    # for s3 paths and force wdl to use it. a better way would be a more fundamental
    # interface shift away from files of paths throughout all of cactus
    if options.pathOverrides:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        tree = MultiCactusTree(seqFile.tree)
        tree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        for name, override in zip(options.pathOverrideNames,
                                  options.pathOverrides):
            seqFile.pathMap[name] = override
        # rewrite the seqfile with overridden paths and use the copy
        override_seq = os.path.join(options.cactusDir, 'seqFile.override')
        with open(override_seq, 'w') as out_sf:
            out_sf.write(str(seqFile))
        options.seqFile = override_seq

    # default the alignment root to the tree root when not given
    if not options.root:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        mcTree = MultiCactusTree(seqFile.tree)
        mcTree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        options.root = mcTree.getRootName()

    # validate --acyclic early: it must name a leaf genome
    if options.acyclic:
        seqFile = SeqFile(options.seqFile)
        tree = MultiCactusTree(seqFile.tree)
        leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
        if options.acyclic not in leaves:
            raise RuntimeError(
                "Genome specified with --acyclic, {}, not found in tree leaves"
                .format(options.acyclic))

    #to be consistent with all-in-one cactus, we make sure the project
    #isn't limiting itself to the subtree (todo: parameterize so root can
    #be passed through from prepare to blast/align)
    proj_options = copy.deepcopy(options)
    proj_options.root = None
    #Create the progressive cactus project (as we do in runCactusProgressive)
    projWrapper = ProjectWrapper(proj_options, proj_options.configFile,
                                 ignoreSeqPaths=options.root)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()
    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)
    project.readXML(pjPath)

    # open up the experiment (as we do in ProgressiveUp.run)
    # note that we copy the path into the options here
    experimentFile = project.expMap[options.root]
    expXml = ET.parse(experimentFile).getroot()
    experiment = ExperimentWrapper(expXml)
    configPath = experiment.getConfigPath()
    configXml = ET.parse(configPath).getroot()

    # NOTE(review): seqIDMap and genome_set are computed but not used below —
    # possibly vestigial; confirm before removing
    seqIDMap = dict()
    tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
    leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
    outgroups = experiment.getOutgroupGenomes()
    genome_set = set(leaves + outgroups)

    # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
    def get_input_path(suffix=''):
        # prefer an explicitly-given file ending in `suffix`; otherwise fall
        # back to the shortest-named base path + suffix
        base_path = options.cigarsFile[0]
        for input_path in options.cigarsFile:
            if suffix and input_path.endswith(suffix):
                return input_path
            if os.path.basename(base_path).startswith(
                    os.path.basename(input_path)):
                base_path = input_path
        return base_path + suffix

    # import the outgroups
    outgroupIDs = []
    outgroup_fragment_found = False
    for i, outgroup in enumerate(outgroups):
        try:
            outgroupID = toil.importFile(
                makeURL(get_input_path('.og_fragment_{}'.format(i))))
            outgroupIDs.append(outgroupID)
            experiment.setSequenceID(outgroup, outgroupID)
            outgroup_fragment_found = True
            # fragments should never exist in pangenome mode
            assert not options.pangenome
        except:
            # we assume that input is not coming from cactus blast, so we'll treat output
            # sequences normally and not go looking for fragments
            outgroupIDs = []
            break

    #import the sequences (that we need to align for the given event, ie leaves and outgroups)
    for genome, seq in list(project.inputSequenceMap.items()):
        if genome in leaves or (not outgroup_fragment_found
                                and genome in outgroups):
            if os.path.isdir(seq):
                # a directory of fasta pieces: concatenate into one temp file
                tmpSeq = getTempFile()
                catFiles([os.path.join(seq, subSeq)
                          for subSeq in os.listdir(seq)], tmpSeq)
                seq = tmpSeq
            seq = makeURL(seq)
            logger.info("Importing {}".format(seq))
            experiment.setSequenceID(genome, toil.importFile(seq))

    if not outgroup_fragment_found:
        outgroupIDs = [experiment.getSequenceID(outgroup)
                       for outgroup in outgroups]

    # write back the experiment, as CactusWorkflowArguments wants a path
    experiment.writeXML(experimentFile)

    #import cactus config
    if options.configFile:
        cactusConfigID = toil.importFile(makeURL(options.configFile))
    else:
        cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
    project.setConfigID(cactusConfigID)

    project.syncToFileStore(toil)
    configNode = ET.parse(project.getConfigPath()).getroot()
    configWrapper = ConfigWrapper(configNode)
    configWrapper.substituteAllPredefinedConstantsWithLiterals()

    if options.singleCopySpecies:
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["alignmentFilter"] = "singleCopyEvent:{}".format(
                options.singleCopySpecies)

    if options.barMaskFilter:
        findRequiredNode(
            configWrapper.xmlRoot,
            "bar").attrib["partialOrderAlignmentMaskFilter"] = str(
                options.barMaskFilter)

    if options.pangenome:
        # turn off the megablock filter as it ruins non-all-to-all alignments
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["minimumBlockHomologySupport"] = "0"
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["minimumBlockDegreeToCheckSupport"] = "9999999999"
        # turn off mapq filtering
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["runMapQFiltering"] = "0"
        # more iterations here helps quite a bit to reduce underalignment
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["maxRecoverableChainsIterations"] = "50"
        # turn down minimum block degree to get a fat ancestor
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["minimumBlockDegree"] = "1"
        # turn on POA
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["partialOrderAlignment"] = "1"
        # save it
        if not options.batch:
            pg_file = options.outHal + ".pg-conf.xml"
            # write to a temp file first when the destination is s3, then
            # upload
            if pg_file.startswith('s3://'):
                pg_temp_file = getTempFile()
            else:
                pg_temp_file = pg_file
            configWrapper.writeXML(pg_temp_file)
            if pg_file.startswith('s3://'):
                write_s3(pg_temp_file, pg_file,
                         region=get_aws_region(options.jobStore))
            logger.info("pangenome configuration overrides saved in {}".format(
                pg_file))

    workFlowArgs = CactusWorkflowArguments(options,
                                           experimentFile=experimentFile,
                                           configNode=configNode,
                                           seqIDMap=project.inputSequenceIDMap)

    #import the files that cactus-blast made
    workFlowArgs.alignmentsID = toil.importFile(makeURL(get_input_path()))
    workFlowArgs.secondaryAlignmentsID = None
    if not options.pafInput:
        try:
            workFlowArgs.secondaryAlignmentsID = toil.importFile(
                makeURL(get_input_path('.secondary')))
        except:
            # secondary alignments are optional; absence is tolerated
            pass
    workFlowArgs.outgroupFragmentIDs = outgroupIDs
    workFlowArgs.ingroupCoverageIDs = []
    if outgroup_fragment_found and len(outgroups) > 0:
        # one coverage file per leaf genome
        for i in range(len(leaves)):
            workFlowArgs.ingroupCoverageIDs.append(
                toil.importFile(
                    makeURL(get_input_path('.ig_coverage_{}'.format(i)))))

    align_job = Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              checkpointInfo=options.checkpointInfo,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput,
                              pafSecondaries=options.usePafSecondaries,
                              doVG=options.outVG,
                              doGFA=options.outGFA,
                              delay=options.stagger,
                              eventNameAsID=options.eventNameAsID,
                              acyclicEvent=options.acyclic)
    return align_job