def cactusPrepare(options, project):
    """Annotate a SeqFile with ancestral names and output sequence paths.

    Reads the input seqfile and config, makes sure the output sequence
    directory exists, names any unlabeled internal tree nodes, rewrites the
    sequence paths so they point into the output directory, writes the
    resulting seqfile to ``options.outSeqFile`` and prints the plan returned
    by ``get_plan``.

    Args:
        options: Options object. Attributes read here: ``seqFile``,
            ``configFile``, ``outSeqDir``, ``preprocessOnly``,
            ``cactusOptions`` and ``outSeqFile``. ``configFile`` and
            ``cactusOptions`` may be modified in place.
        project: Passed through unchanged to ``get_plan``.

    Raises:
        RuntimeError: If ``options.outSeqDir`` is not a directory after the
            attempt to create it.
    """
    # read the input
    seqFile = SeqFile(options.seqFile)
    configNode = ET.parse(options.configFile).getroot()
    config = ConfigWrapper(configNode)

    # prepare output sequence directory
    # todo: support remote (ie s3) output directory
    try:
        os.makedirs(options.outSeqDir)
    except OSError:
        # Best effort only (the directory may already exist); the isdir
        # check below is what reports a real failure.  Catching OSError
        # rather than everything lets KeyboardInterrupt/SystemExit through.
        pass
    if not os.path.isdir(options.outSeqDir):
        raise RuntimeError('Unable to create output sequence directory \'{}\''.format(options.outSeqDir))
    if not os.access(options.outSeqDir, os.W_OK):
        logger.warning('Output sequence directory is not writeable: \'{}\''.format(options.outSeqDir))

    # hack the configfile to skip preprocessing and write it to the output dir
    if options.preprocessOnly:
        config.removePreprocessors()
        options.configFile = os.path.join(options.outSeqDir, 'config.xml')
        config.writeXML(options.configFile)

    # pass through the config file to the options
    # todo (don't like second hard-code check of .xml path)
    if options.configFile != os.path.join(cactusRootPath(), "cactus_progressive_config.xml"):
        options.cactusOptions += ' --configFile {}'.format(options.configFile)

    # get the ancestor names
    tree = MultiCactusTree(seqFile.tree)
    tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())

    # make the output
    # NOTE: pathMap and outgroups are assigned by reference, so the path
    # updates below are made on the same object seqFile.pathMap refers to.
    outSeqFile = SeqFile()
    outSeqFile.tree = tree
    outSeqFile.pathMap = seqFile.pathMap
    outSeqFile.outgroups = seqFile.outgroups

    # update paths for preprocessed leaves or inferred ancestors
    for node in outSeqFile.tree.breadthFirstTraversal():
        name = outSeqFile.tree.getName(node)
        leaf = outSeqFile.tree.isLeaf(node)
        # Leaves are always redirected; internal nodes only when they have no
        # input path and we are not in preprocess-only mode.
        if leaf or (name not in seqFile.pathMap and not options.preprocessOnly):
            out_basename = seqFile.pathMap[name] if name in seqFile.pathMap else '{}.fa'.format(name)
            outSeqFile.pathMap[name] = os.path.join(options.outSeqDir, os.path.basename(out_basename))

    # write the output
    with open(options.outSeqFile, 'w') as out_sf:
        out_sf.write(str(outSeqFile))

    # write the instructions
    print(get_plan(options, project, outSeqFile))
def cactusPrepare(options, project):
    """Annotate a SeqFile with ancestral names and output sequence paths.

    Reads the input seqfile and config, optionally rewrites the config
    (preprocess-only and/or gpu mode), names any unlabeled internal tree
    nodes, rewrites the sequence paths so they point into
    ``options.outDir``, optionally writes the resulting seqfile to
    ``options.outSeqFile`` and prints the plan returned by ``get_plan``.

    Args:
        options: Options object. Attributes read here: ``seqFile``,
            ``configFile``, ``wdl``, ``outDir``, ``preprocessOnly``, ``gpu``,
            ``cactusOptions`` and ``outSeqFile``. ``configFile`` and
            ``cactusOptions`` may be modified in place.
        project: Passed through unchanged to ``get_plan``.

    Raises:
        RuntimeError: If not in wdl mode and ``options.outDir`` is not a
            directory after the attempt to create it.
    """
    # read the input
    seqFile = SeqFile(options.seqFile)
    configNode = ET.parse(options.configFile).getroot()
    config = ConfigWrapper(configNode)

    if not options.wdl:
        # prepare output sequence directory
        # todo: support remote (ie s3) output directory
        try:
            os.makedirs(options.outDir)
        except OSError:
            # Best effort only (the directory may already exist); the isdir
            # check below is what reports a real failure.  Catching OSError
            # rather than everything lets KeyboardInterrupt/SystemExit through.
            pass
        if not os.path.isdir(options.outDir):
            raise RuntimeError('Unable to create output sequence directory \'{}\''.format(options.outDir))
        if not os.access(options.outDir, os.W_OK):
            logger.warning('Output sequence directory is not writeable: \'{}\''.format(options.outDir))

    # NOTE(review): this step also runs in wdl mode, where the directory
    # checks above are skipped -- presumably the caller guarantees outDir
    # exists in that case; confirm.
    if options.preprocessOnly or options.gpu:
        if options.preprocessOnly:
            # hack the configfile to skip preprocessing and write it to the output dir
            config.removePreprocessors()
        if options.gpu:
            # hack the configfile to toggle on gpu lastz
            cafNode = findRequiredNode(config.xmlRoot, "caf")
            cafNode.attrib["gpuLastz"] = "true"
            # realigning doesn't mix well with lastz so we make sure it's off
            # https://github.com/ComparativeGenomicsToolkit/cactus/issues/271
            cafNode.attrib["realign"] = "0"
        options.configFile = os.path.join(options.outDir, 'config-prepared.xml')
        sys.stderr.write("configuration saved in {}\n".format(options.configFile))
        config.writeXML(options.configFile)

    # pass through the config file to the options
    # todo (don't like second hard-code check of .xml path)
    if options.configFile != os.path.join(cactusRootPath(), "cactus_progressive_config.xml") and not options.wdl:
        options.cactusOptions += ' --configFile {}'.format(options.configFile)

    # get the ancestor names
    tree = MultiCactusTree(seqFile.tree)
    tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())

    # make the output; deep copies keep the input seqFile untouched, since it
    # is still handed to get_plan below alongside the rewritten one
    outSeqFile = SeqFile()
    outSeqFile.tree = tree
    outSeqFile.pathMap = copy.deepcopy(seqFile.pathMap)
    outSeqFile.outgroups = copy.deepcopy(seqFile.outgroups)

    # update paths for preprocessed leaves or inferred ancestors
    for node in outSeqFile.tree.breadthFirstTraversal():
        name = outSeqFile.tree.getName(node)
        leaf = outSeqFile.tree.isLeaf(node)
        # Leaves are always redirected; internal nodes only when they have no
        # input path and we are not in preprocess-only mode.
        if leaf or (name not in seqFile.pathMap and not options.preprocessOnly):
            out_basename = seqFile.pathMap[name] if name in seqFile.pathMap else '{}.fa'.format(name)
            outSeqFile.pathMap[name] = os.path.join(options.outDir, os.path.basename(out_basename))
            if options.wdl:
                # uniquify name in wdl to prevent collisions
                outSeqFile.pathMap[name] += '.pp'

    # write the output
    if options.outSeqFile:
        with open(options.outSeqFile, 'w') as out_sf:
            out_sf.write(str(outSeqFile))

    # write the instructions
    print(get_plan(options, project, seqFile, outSeqFile))