def cactusPrepare(options, project):
    """Annotate a SeqFile with ancestral names and output sequence paths.

    Reads ``options.seqFile`` and ``options.configFile``, makes sure
    ``options.outSeqDir`` exists, names the unlabeled internal tree nodes,
    writes the annotated seqfile to ``options.outSeqFile`` and prints the plan
    returned by ``get_plan``.

    Side effects on ``options``: when ``preprocessOnly`` is set, ``configFile``
    is redirected to a rewritten ``config.xml`` in the output directory; when
    the config in use is not the default one, a ``--configFile`` flag is
    appended to ``cactusOptions``.

    Raises:
        RuntimeError: if the output sequence directory does not exist after
            the attempt to create it.
    """
    # local import keeps this block independent of the file-level import list
    import copy

    # read the input
    seqFile = SeqFile(options.seqFile)
    configNode = ET.parse(options.configFile).getroot()
    config = ConfigWrapper(configNode)

    # prepare output sequence directory
    # todo: support remote (ie s3) output directory
    try:
        os.makedirs(options.outSeqDir)
    except OSError:
        # already exists or cannot be created: the isdir check below decides
        pass
    if not os.path.isdir(options.outSeqDir):
        raise RuntimeError('Unable to create output sequence directory \'{}\''.format(options.outSeqDir))
    if not os.access(options.outSeqDir, os.W_OK):
        logger.warning('Output sequence directory is not writeable: \'{}\''.format(options.outSeqDir))

    # hack the configfile to skip preprocessing and write it to the output dir
    if options.preprocessOnly:
        config.removePreprocessors()
        options.configFile = os.path.join(options.outSeqDir, 'config.xml')
        config.writeXML(options.configFile)

    # pass through the config file to the options
    # todo (don't like second hard-code check of .xml path)
    if options.configFile != os.path.join(cactusRootPath(), "cactus_progressive_config.xml"):
        options.cactusOptions += ' --configFile {}'.format(options.configFile)

    # get the ancestor names
    tree = MultiCactusTree(seqFile.tree)
    tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())

    # make the output; copy the containers so that rewriting the output paths
    # below does not also rewrite the input SeqFile
    outSeqFile = SeqFile()
    outSeqFile.tree = tree
    outSeqFile.pathMap = copy.deepcopy(seqFile.pathMap)
    outSeqFile.outgroups = copy.deepcopy(seqFile.outgroups)

    # update paths for preprocessed leaves or inferred ancestors
    for node in outSeqFile.tree.breadthFirstTraversal():
        name = outSeqFile.tree.getName(node)
        leaf = outSeqFile.tree.isLeaf(node)
        # leaves are always redirected; internal nodes only when they have no
        # input path and we are not in preprocess-only mode
        if leaf or (name not in seqFile.pathMap and not options.preprocessOnly):
            out_basename = seqFile.pathMap[name] if name in seqFile.pathMap else '{}.fa'.format(name)
            outSeqFile.pathMap[name] = os.path.join(options.outSeqDir, os.path.basename(out_basename))

    # write the output
    with open(options.outSeqFile, 'w') as out_sf:
        out_sf.write(str(outSeqFile))

    # write the instructions
    print(get_plan(options, project, outSeqFile))
def add_genome_to_seqfile(seqfile_path, fasta_path, name):
    """Register a genome in an existing seqfile, rewriting the file in place.

    The seqfile at ``seqfile_path`` is loaded. If no tree node is already
    called ``name``, a new node is attached as a child of node 0 (the original
    comment describes this as branching off the root) with branch length
    ``SeqFile.branchLen``. The path map entry for ``name`` is then set to
    ``fasta_path`` and the seqfile is written back to ``seqfile_path``.
    """
    seq_file = SeqFile(seqfile_path)
    tree = seq_file.tree

    # scan for an existing node with this name, tracking the largest node id
    # seen so far so that a fresh id can be chosen if the name is absent
    highest_id = 0
    for node_id in tree.preOrderTraversal():
        highest_id = max(highest_id, node_id)
        if tree.getName(node_id) == name:
            break
    else:
        # name not in the tree yet: add it under node 0
        new_id = highest_id + 1
        tree.nxDg.add_edge(0, new_id)
        tree.setName(new_id, name)
        tree.setWeight(0, new_id, SeqFile.branchLen)

    # add the sequence to the map
    seq_file.pathMap[name] = fasta_path

    # write the seq file back to disk
    with open(seqfile_path, 'w') as handle:
        handle.write(str(seq_file))
def get_asms_from_seqfile(seqFile, workflow):
    """Import every sequence listed in a seqfile into the workflow.

    Args:
        seqFile: path of the seqfile to load (passed to ``SeqFile``).
        workflow: object providing ``importFile``; in this file it is the
            ``Toil`` context.

    Returns:
        An ordered dict mapping each name in the seqfile's path map to the
        value returned by ``workflow.importFile`` for that entry's URL. The
        order matches the seqfile's path map.
    """
    parsed = SeqFile(seqFile)
    path_by_name = col.OrderedDict(parsed.pathMap)
    print(path_by_name)
    return col.OrderedDict(
        (asm_name, workflow.importFile(makeURL(asm_path)))
        for asm_name, asm_path in path_by_name.items())
def runCactusAfterBlastOnly(options):
    """Run the alignment workflow on pre-computed alignments and export the HAL.

    On a fresh run this builds the progressive project in a temporary
    ``options.cactusDir``, imports the input sequences and the alignment files
    named in ``options.cigarsFile`` into the Toil job store, and starts
    ``run_cactus_align``. With ``options.restart`` the existing workflow is
    resumed instead. Either way the result is exported to
    ``options.outputHal``.

    Side effects on ``options``: ``cactusDir`` is set, and ``seqFile`` is
    redirected to a rewritten copy when path overrides are given.
    """
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides.  this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile).  one way to fix would be to add support
            # for s3 paths and force wdl to use it.  a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(
                    prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames,
                                          options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir,
                                            'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #to be consistent with all-in-one cactus, we make sure the project
            #isn't limiting itself to the subtree (todo: parameterize so root can
            #be passed through from prepare to blast/align)
            proj_options = copy.deepcopy(options)
            proj_options.root = None
            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(proj_options,
                                         proj_options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            # result is unused below; the parse is kept because it fails early
            # if the experiment's config cannot be read
            configXml = ET.parse(configPath).getroot()

            tree = MultiCactusTree(experiment.getTree()).extractSubTree(
                options.root)
            leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
            outgroups = experiment.getOutgroupGenomes()

            # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
            def get_input_path(suffix=''):
                """Pick the input path for ``suffix`` from options.cigarsFile.

                An entry that already ends with a non-empty ``suffix`` is
                returned as-is. Otherwise the base path is used with
                ``suffix`` appended. The base path starts as the first entry
                and is replaced by any entry whose basename is a prefix of the
                current base's basename.
                """
                base_path = options.cigarsFile[0]
                for input_path in options.cigarsFile:
                    if suffix and input_path.endswith(suffix):
                        return input_path
                    if os.path.basename(base_path).startswith(
                            os.path.basename(input_path)):
                        base_path = input_path
                return base_path + suffix

            # import the outgroups
            outgroupIDs = []
            outgroup_fragment_found = False
            for i, outgroup in enumerate(outgroups):
                try:
                    outgroupID = toil.importFile(
                        makeURL(get_input_path('.og_fragment_{}'.format(i))))
                    outgroupIDs.append(outgroupID)
                    experiment.setSequenceID(outgroup, outgroupID)
                    outgroup_fragment_found = True
                    # NOTE(review): a failure of this assertion is caught by
                    # the handler below rather than propagated
                    assert not options.pangenome
                except Exception:
                    # we assume that input is not coming from cactus blast, so we'll treat output
                    # sequences normally and not go looking for fragments
                    # (Exception rather than a bare except, so that Ctrl-C and
                    # SystemExit are not swallowed)
                    outgroupIDs = []
                    break

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in leaves or (not outgroup_fragment_found
                                        and genome in outgroups):
                    if os.path.isdir(seq):
                        # a directory of fasta files is concatenated into one
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    experiment.setSequenceID(genome, toil.importFile(seq))

            if not outgroup_fragment_found:
                outgroupIDs = [
                    experiment.getSequenceID(outgroup)
                    for outgroup in outgroups
                ]

            # write back the experiment, as CactusWorkflowArguments wants a path
            experiment.writeXML(experimentFile)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            if options.pangenome:
                # turn off the megablock filter as it ruins non-all-to-all alignments
                configWrapper.disableCafMegablockFilter()
                # the recoverable chains parameter does not seem to play nicely with star-like alignments either
                #configWrapper.disableRecoverableChains()

            workFlowArgs = CactusWorkflowArguments(
                options,
                experimentFile=experimentFile,
                configNode=configNode,
                seqIDMap=project.inputSequenceIDMap)

            #import the files that cactus-blast made
            workFlowArgs.alignmentsID = toil.importFile(
                makeURL(get_input_path()))
            workFlowArgs.secondaryAlignmentsID = None
            if not options.pafInput:
                try:
                    workFlowArgs.secondaryAlignmentsID = toil.importFile(
                        makeURL(get_input_path('.secondary')))
                except Exception:
                    # secondary alignments are optional: leave the ID as None
                    pass
            workFlowArgs.outgroupFragmentIDs = outgroupIDs
            workFlowArgs.ingroupCoverageIDs = []
            if outgroup_fragment_found and len(outgroups) > 0:
                for i in range(len(leaves)):
                    workFlowArgs.ingroupCoverageIDs.append(
                        toil.importFile(
                            makeURL(get_input_path(
                                '.ig_coverage_{}'.format(i)))))

            halID = toil.start(
                Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput))

        # export the hal
        toil.exportFile(halID, makeURL(options.outputHal))
def main():
    """Command-line entry point for the reference-based alignment workflow.

    Parses options via ``get_options``, applies any seqfile path overrides,
    imports the assemblies, runs (or restarts) ``run_cactus_reference_align``
    and exports the primary and secondary alignments next to
    ``options.outputFile``. With ``options.debug_export`` the per-assembly
    mapping files are additionally exported into ``options.debug_export_dir``.

    Raises:
        RuntimeError: if ``--pathOverrides`` and ``--pathOverrideNames`` are
            not given together with the same number of values.
    """
    options = get_options()

    with Toil(options) as workflow:
        setupBinaries(options)
        importSingularityImage(options)

        ## Preprocessing:
        if options.pathOverrides or options.pathOverrideNames:
            if not options.pathOverrides or not options.pathOverrideNames or \
               len(options.pathOverrideNames) != len(options.pathOverrides):
                raise RuntimeError(
                    'same number of values must be passed to --pathOverrides and --pathOverrideNames'
                )

        # apply path overrides.  this was necessary for wdl which doesn't take kindly to
        # text files of local paths (ie seqfile).  one way to fix would be to add support
        # for s3 paths and force wdl to use it.  a better way would be a more fundamental
        # interface shift away from files of paths throughout all of cactus
        if options.pathOverrides:
            seqFile = SeqFile(options.seqFile)
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            tree = MultiCactusTree(seqFile.tree)
            tree.nameUnlabeledInternalNodes(
                prefix=config.getDefaultInternalNodePrefix())
            for name, override in zip(options.pathOverrideNames,
                                      options.pathOverrides):
                seqFile.pathMap[name] = override
            override_seq = os.path.join(options.cactusDir, 'seqFile.override')
            with open(override_seq, 'w') as out_sf:
                out_sf.write(str(seqFile))
            options.seqFile = override_seq

        # Import asms; by default, prepends unique IDs in the technique used in cactus-blast.
        asms = get_asms_from_seqfile(options.seqFile, workflow)

        ## Perform alignments:
        if not workflow.options.restart:
            alignments = workflow.start(
                Job.wrapJobFn(run_cactus_reference_align, asms, options.refID,
                              options.debug_export, options.dipcall_bed_filter,
                              options.dipcall_vcf_filter))
        else:
            alignments = workflow.restart()

        if options.debug_export:
            # first, ensure the debug dir exists.
            if not os.path.isdir(options.debug_export_dir):
                os.makedirs(options.debug_export_dir)

            print(alignments)
            # Then return value is: (all_primary, all_secondary, ref_mappings, primary_mappings, secondary_mappings)
            # Each (index, suffix) pair selects one of the per-assembly dicts.
            # NOTE(review): ".cigar.secondry" looks like a typo for
            # ".cigar.secondary"; kept as-is in case something reads that name.
            debug_exports = ((2, ".paf"), (3, ".cigar"), (4, ".cigar.secondry"))
            for index, suffix in debug_exports:
                for asm, mapping_file in alignments[index].items():
                    # write into the debug directory created above; previously
                    # these files landed in the current working directory
                    out_path = os.path.join(options.debug_export_dir,
                                            "mappings_for_" + asm + suffix)
                    workflow.exportFile(mapping_file,
                                        'file://' + os.path.abspath(out_path))

        ## Save alignments:
        if options.dipcall_vcf_filter:
            # this is substantially less restrictive than the dipcall_bed_filter.
            dipcall_filtered = workflow.start(
                Job.wrapJobFn(apply_dipcall_vcf_filter, alignments[0]))
            workflow.exportFile(dipcall_filtered, makeURL(options.outputFile))
            workflow.exportFile(
                alignments[1],
                makeURL(options.outputFile + ".unfiltered.secondary"))
        else:
            workflow.exportFile(alignments[0], makeURL(options.outputFile))
            workflow.exportFile(alignments[1],
                                makeURL(options.outputFile + ".secondary"))
def runCactusGraphMap(options):
    """Map the seqfile's genomes to a minigraph GFA and export the PAF.

    On a fresh run the genomes (all seqfile entries except the minigraph
    assembly) and the GFA are imported and ``minigraph_workflow`` is started.
    With ``options.restart`` the existing workflow is resumed and its result
    is unpacked the same way. The PAF is exported to ``options.outputPAF``.
    When a graph fasta was produced it is exported to ``options.outputFasta``,
    and the seqfile at ``options.seqFile`` is updated in place to reference it.

    Raises:
        RuntimeError: if the minigraph assembly is not in the seqfile and
            ``--outputFasta`` was not given.
    """
    with Toil(options) as toil:
        importSingularityImage(options)

        #load cactus config
        # (done before the restart check: the graph event name is needed
        # after the workflow finishes on both the fresh and the restart path)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        config.substituteAllPredefinedConstantsWithLiterals()

        # get the minigraph "virtual" assembly name
        graph_event = getOptionalAttrib(findRequiredNode(
            configNode, "refgraph"),
                                        "assemblyName",
                                        default="__MINIGRAPH_SEQUENCES__")

        #Run the workflow
        if options.restart:
            # the root job returns (paf_id, gfa_fa_id), exactly as on start;
            # previously these names were left unbound on restart
            paf_id, gfa_fa_id = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides.  this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile).  one way to fix would be to add support
            # for s3 paths and force wdl to use it.  a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                # parsed separately so the substituted config above is left alone
                override_config = ConfigWrapper(
                    ET.parse(options.configFile).getroot())
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(
                    prefix=override_config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames,
                                          options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir,
                                            'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            # load the seqfile
            seqFile = SeqFile(options.seqFile)

            logger.info("Genomes for graphmap, {}".format(seqFile.pathMap))

            if not options.outputFasta and graph_event not in seqFile.pathMap:
                raise RuntimeError(
                    "{} assembly not found in seqfile so it must be specified with --outputFasta"
                    .format(graph_event))

            #import the graph
            gfa_id = toil.importFile(makeURL(options.minigraphGFA))

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            seqIDMap = {}
            for genome, seq in seqFile.pathMap.items():
                if genome != graph_event:
                    if os.path.isdir(seq):
                        # a directory of fasta files is concatenated into one
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    seqIDMap[genome] = toil.importFile(seq)

            # run the workflow
            paf_id, gfa_fa_id = toil.start(
                Job.wrapJobFn(minigraph_workflow, options, config, seqIDMap,
                              gfa_id, graph_event))

        #export the paf
        toil.exportFile(paf_id, makeURL(options.outputPAF))
        if gfa_fa_id:
            toil.exportFile(gfa_fa_id, makeURL(options.outputFasta))

        # update the input seqfile (in place!)
        # only when an output fasta path was given: without one there is
        # nothing to register
        if options.outputFasta:
            add_genome_to_seqfile(options.seqFile,
                                  makeURL(options.outputFasta), graph_event)
def cactusPrepare(options, project):
    """Annotate a SeqFile with ancestral names and output sequence paths.

    Reads ``options.seqFile`` and ``options.configFile``, names the unlabeled
    internal tree nodes, points leaf (and, outside preprocess-only mode,
    path-less ancestor) entries at files under ``options.outDir``, optionally
    writes the annotated seqfile to ``options.outSeqFile`` and prints the plan
    returned by ``get_plan``.

    Side effects on ``options``: when ``preprocessOnly`` or ``gpu`` is set,
    ``configFile`` is redirected to a rewritten ``config-prepared.xml`` in
    ``outDir``; outside WDL mode a ``--configFile`` flag is appended to
    ``cactusOptions`` when the config in use is not the default one.

    Raises:
        RuntimeError: outside WDL mode, if the output directory does not exist
            after the attempt to create it.
    """
    # read the input
    seqFile = SeqFile(options.seqFile)
    configNode = ET.parse(options.configFile).getroot()
    config = ConfigWrapper(configNode)

    if not options.wdl:
        # prepare output sequence directory
        # todo: support remote (ie s3) output directory
        try:
            os.makedirs(options.outDir)
        except OSError:
            # already exists or cannot be created: the isdir check decides
            pass
        if not os.path.isdir(options.outDir):
            raise RuntimeError('Unable to create output sequence directory \'{}\''.format(options.outDir))
        if not os.access(options.outDir, os.W_OK):
            logger.warning('Output sequence directory is not writeable: \'{}\''.format(options.outDir))

    if options.preprocessOnly or options.gpu:
        if options.preprocessOnly:
            # hack the configfile to skip preprocessing and write it to the output dir
            config.removePreprocessors()
        if options.gpu:
            # hack the configfile to toggle on gpu lastz
            cafNode = findRequiredNode(config.xmlRoot, "caf")
            cafNode.attrib["gpuLastz"] = "true"
            # realigning doesn't mix well with lastz so we make sure it's off
            # https://github.com/ComparativeGenomicsToolkit/cactus/issues/271
            cafNode.attrib["realign"] = "0"
        options.configFile = os.path.join(options.outDir, 'config-prepared.xml')
        sys.stderr.write("configuration saved in {}\n".format(options.configFile))
        config.writeXML(options.configFile)

    # pass through the config file to the options
    # todo (don't like second hard-code check of .xml path)
    if options.configFile != os.path.join(cactusRootPath(), "cactus_progressive_config.xml") and not options.wdl:
        options.cactusOptions += ' --configFile {}'.format(options.configFile)

    # get the ancestor names
    tree = MultiCactusTree(seqFile.tree)
    tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())

    # make the output (copies, so the input SeqFile passed to get_plan below
    # keeps its original paths)
    outSeqFile = SeqFile()
    outSeqFile.tree = tree
    outSeqFile.pathMap = copy.deepcopy(seqFile.pathMap)
    outSeqFile.outgroups = copy.deepcopy(seqFile.outgroups)

    # update paths for preprocessed leaves or inferred ancestors
    for node in outSeqFile.tree.breadthFirstTraversal():
        name = outSeqFile.tree.getName(node)
        leaf = outSeqFile.tree.isLeaf(node)
        # leaves are always redirected; internal nodes only when they have no
        # input path and we are not in preprocess-only mode
        if leaf or (name not in seqFile.pathMap and not options.preprocessOnly):
            out_basename = seqFile.pathMap[name] if name in seqFile.pathMap else '{}.fa'.format(name)
            outSeqFile.pathMap[name] = os.path.join(options.outDir, os.path.basename(out_basename))
            if options.wdl:
                # uniquify name in wdl to prevent collisions
                outSeqFile.pathMap[name] += '.pp'

    # write the output
    if options.outSeqFile:
        with open(options.outSeqFile, 'w') as out_sf:
            out_sf.write(str(outSeqFile))

    # write the instructions
    print(get_plan(options, project, seqFile, outSeqFile))
def runCactusGraphMapSplit(options):
    """Split a minigraph GFA / PAF (and the input sequences) by reference contig.

    On a fresh run the GFA, the PAF and every leaf genome of the seqfile are
    imported and ``graphmap_split_workflow`` is started. With
    ``options.restart`` the existing workflow is resumed. The workflow output
    is then handed to ``export_split_data`` together with ``options.outDir``
    and the config.

    Raises:
        RuntimeError: if the minigraph assembly name, or the genome given with
            ``--reference``, is not a leaf of the seqfile tree.
    """
    with Toil(options) as toil:
        importSingularityImage(options)

        #load cactus config
        # (done before the restart check: export_split_data needs the config
        # on the restart path too, where it used to be unbound)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        config.substituteAllPredefinedConstantsWithLiterals()

        #Run the workflow
        if options.restart:
            wf_output = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # load up the contigs if any
            ref_contigs = set(options.refContigs)
            # todo: use import?
            if options.refContigsFile:
                with open(options.refContigsFile, 'r') as rc_file:
                    for line in rc_file:
                        stripped = line.strip()
                        if stripped:
                            # the contig name is the first whitespace-separated token
                            ref_contigs.add(stripped.split()[0])

            if options.otherContig:
                assert options.otherContig not in ref_contigs

            # get the minigraph "virtual" assembly name
            graph_event = getOptionalAttrib(findRequiredNode(
                configNode, "graphmap"),
                                            "assemblyName",
                                            default="_MINIGRAPH_")

            # load the seqfile
            seqFile = SeqFile(options.seqFile)

            #import the graph
            gfa_id = toil.importFile(makeURL(options.minigraphGFA))

            #import the paf
            paf_id = toil.importFile(makeURL(options.graphmapPAF))

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            seqIDMap = {}
            leaves = {
                seqFile.tree.getName(node)
                for node in seqFile.tree.getLeaves()
            }

            if graph_event not in leaves:
                raise RuntimeError(
                    "Minigraph name {} not found in seqfile".format(
                        graph_event))
            if options.reference and options.reference not in leaves:
                raise RuntimeError(
                    "Name given with --reference {} not found in seqfile".
                    format(options.reference))

            for genome, seq in seqFile.pathMap.items():
                if genome in leaves:
                    if os.path.isdir(seq):
                        # a directory of fasta files is concatenated into one
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    logger.info("Importing {}".format(seq))
                    # keep the URL alongside the imported file ID
                    seqIDMap[genome] = (seq, toil.importFile(seq))

            # run the workflow
            wf_output = toil.start(
                Job.wrapJobFn(graphmap_split_workflow, options, config,
                              seqIDMap, gfa_id, options.minigraphGFA, paf_id,
                              options.graphmapPAF, ref_contigs,
                              options.otherContig))

        #export the split data
        export_split_data(toil, wf_output[0], wf_output[1], wf_output[2:],
                          options.outDir, config)
def main():
    """Command-line entry point for preprocessing input sequences.

    Input is given either as a pair of seqfiles (positional ``inSeqFile`` and
    ``outSeqFile``, optionally restricted with ``--inputNames``) or directly as
    matching ``--inPaths`` / ``--outPaths`` lists; the two modes cannot be
    mixed. The resulting input and output path lists are passed to
    ``stageWorkflow``.

    Raises:
        RuntimeError: on an inconsistent combination of input options, or if
            a requested genome name is missing from either seqfile.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("inSeqFile", type=str, nargs='?', default=None, help = "Input Seq file")
    parser.add_argument("outSeqFile", type=str, nargs='?', default=None, help = "Output Seq file (ex generated with cactus-prepare)")
    parser.add_argument("--configFile", default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--inputNames", nargs='*', help='input genome names (not paths) to preprocess (all leaves from Input Seq file if none specified)')
    parser.add_argument("--inPaths", nargs='*', help='Space-separated list of input fasta paths (to be used in place of --inSeqFile')
    parser.add_argument("--outPaths", nargs='*', help='Space-separated list of output fasta paths (one for each inPath, used in place of --outSeqFile)')
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # we have two modes: operate directly on paths or rely on the seqfiles.  they cannot be mixed
    if options.inSeqFile or options.outSeqFile:
        if not options.inSeqFile or not options.outSeqFile or options.inPaths or options.outPaths:
            raise RuntimeError('--inSeqFile must be used in conjunction with --outSeqFile and not with --inPaths nor --outPaths')
    elif options.inPaths or options.outPaths:
        if not options.inPaths or not options.outPaths or options.inSeqFile or options.outSeqFile or options.inputNames:
            raise RuntimeError('--inPaths must be used in conjunction with --outPaths and not with --inSeqFile, --outSeqFile nor --inputNames')
        if len(options.inPaths) != len(options.outPaths):
            raise RuntimeError('--inPaths and --outPaths must have the same number of arguments')
    else:
        raise RuntimeError('--inSeqFile/--outSeqFile/--inputNames or --inPaths/--outPaths required to specify input')

    inSeqPaths = []
    outSeqPaths = []

    # mine the paths out of the seqfiles
    if options.inSeqFile:
        inSeqFile = SeqFile(options.inSeqFile)
        outSeqFile = SeqFile(options.outSeqFile)

        inNames = options.inputNames
        if not inNames:
            inNames = [inSeqFile.tree.getName(node) for node in inSeqFile.tree.getLeaves()]

        for inName in inNames:
            if inName not in inSeqFile.pathMap or inName not in outSeqFile.pathMap:
                # (the format argument used to be a misspelled name, which
                # raised NameError instead of this error)
                raise RuntimeError('{} not present in input and output Seq files'.format(inName))
            inPath = inSeqFile.pathMap[inName]
            outPath = outSeqFile.pathMap[inName]
            if os.path.isdir(inPath):
                # directory input: mirror every contained file into outPath
                try:
                    os.makedirs(outPath)
                except OSError:
                    # already exists or cannot be created: checked just below
                    pass
                assert os.path.isdir(inPath) == os.path.isdir(outPath)
                seqNames = os.listdir(inPath)
                inSeqPaths += [os.path.join(inPath, seqPath) for seqPath in seqNames]
                outSeqPaths += [os.path.join(outPath, seqPath) for seqPath in seqNames]
            else:
                inSeqPaths += [inPath]
                outSeqPaths += [outPath]

    # we got path names directly from the command line
    else:
        inSeqPaths = options.inPaths
        outSeqPaths = options.outPaths

    with Toil(options) as toil:
        stageWorkflow(outputSequenceDir=None, configFile=options.configFile, inputSequences=inSeqPaths, toil=toil, restart=options.restart, outputSequences=outSeqPaths)
def make_align_job(options, toil):
    """Build the ``run_cactus_align`` job for the event in ``options.root``.

    Creates the progressive project in a temporary ``options.cactusDir``,
    imports the input sequences and the alignment files named in
    ``options.cigarsFile`` through ``toil``, applies the command-line config
    overrides (single-copy filter, bar mask filter, pangenome settings) and
    wraps everything in a job.

    Side effects on ``options``: ``cactusDir`` is set; ``seqFile`` is
    redirected to a rewritten copy when path overrides are given; ``root``
    defaults to the tree's root name when unset. In pangenome mode (unless
    ``options.batch``) the modified config is also saved next to
    ``options.outHal``.

    Returns:
        The wrapped ``run_cactus_align`` job (not yet started).

    Raises:
        RuntimeError: if the genome given with ``--acyclic`` is not a leaf of
            the seqfile tree.
    """
    options.cactusDir = getTempDirectory()

    # apply path overrides.  this was necessary for wdl which doesn't take kindly to
    # text files of local paths (ie seqfile).  one way to fix would be to add support
    # for s3 paths and force wdl to use it.  a better way would be a more fundamental
    # interface shift away from files of paths throughout all of cactus
    if options.pathOverrides:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        tree = MultiCactusTree(seqFile.tree)
        tree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        for name, override in zip(options.pathOverrideNames,
                                  options.pathOverrides):
            seqFile.pathMap[name] = override
        override_seq = os.path.join(options.cactusDir, 'seqFile.override')
        with open(override_seq, 'w') as out_sf:
            out_sf.write(str(seqFile))
        options.seqFile = override_seq

    # default the root event to the root of the (named) seqfile tree
    if not options.root:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        mcTree = MultiCactusTree(seqFile.tree)
        mcTree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        options.root = mcTree.getRootName()

    if options.acyclic:
        seqFile = SeqFile(options.seqFile)
        tree = MultiCactusTree(seqFile.tree)
        leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
        if options.acyclic not in leaves:
            raise RuntimeError(
                "Genome specified with --acyclic, {}, not found in tree leaves"
                .format(options.acyclic))

    #to be consistent with all-in-one cactus, we make sure the project
    #isn't limiting itself to the subtree (todo: parameterize so root can
    #be passed through from prepare to blast/align)
    proj_options = copy.deepcopy(options)
    proj_options.root = None
    #Create the progressive cactus project (as we do in runCactusProgressive)
    projWrapper = ProjectWrapper(proj_options,
                                 proj_options.configFile,
                                 ignoreSeqPaths=options.root)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    # open up the experiment (as we do in ProgressiveUp.run)
    # note that we copy the path into the options here
    experimentFile = project.expMap[options.root]
    expXml = ET.parse(experimentFile).getroot()
    experiment = ExperimentWrapper(expXml)
    configPath = experiment.getConfigPath()
    # result is unused below; the parse is kept because it fails early if the
    # experiment's config cannot be read
    configXml = ET.parse(configPath).getroot()

    tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
    leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
    outgroups = experiment.getOutgroupGenomes()

    # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
    def get_input_path(suffix=''):
        """Pick the input path for ``suffix`` from options.cigarsFile.

        An entry that already ends with a non-empty ``suffix`` is returned
        as-is. Otherwise the base path is used with ``suffix`` appended. The
        base path starts as the first entry and is replaced by any entry whose
        basename is a prefix of the current base's basename.
        """
        base_path = options.cigarsFile[0]
        for input_path in options.cigarsFile:
            if suffix and input_path.endswith(suffix):
                return input_path
            if os.path.basename(base_path).startswith(
                    os.path.basename(input_path)):
                base_path = input_path
        return base_path + suffix

    # import the outgroups
    outgroupIDs = []
    outgroup_fragment_found = False
    for i, outgroup in enumerate(outgroups):
        try:
            outgroupID = toil.importFile(
                makeURL(get_input_path('.og_fragment_{}'.format(i))))
            outgroupIDs.append(outgroupID)
            experiment.setSequenceID(outgroup, outgroupID)
            outgroup_fragment_found = True
            # NOTE(review): a failure of this assertion is caught by the
            # handler below rather than propagated
            assert not options.pangenome
        except Exception:
            # we assume that input is not coming from cactus blast, so we'll treat output
            # sequences normally and not go looking for fragments
            # (Exception rather than a bare except, so that Ctrl-C and
            # SystemExit are not swallowed)
            outgroupIDs = []
            break

    #import the sequences (that we need to align for the given event, ie leaves and outgroups)
    for genome, seq in list(project.inputSequenceMap.items()):
        if genome in leaves or (not outgroup_fragment_found
                                and genome in outgroups):
            if os.path.isdir(seq):
                # a directory of fasta files is concatenated into one
                tmpSeq = getTempFile()
                catFiles(
                    [os.path.join(seq, subSeq) for subSeq in os.listdir(seq)],
                    tmpSeq)
                seq = tmpSeq
            seq = makeURL(seq)

            logger.info("Importing {}".format(seq))
            experiment.setSequenceID(genome, toil.importFile(seq))

    if not outgroup_fragment_found:
        outgroupIDs = [
            experiment.getSequenceID(outgroup) for outgroup in outgroups
        ]

    # write back the experiment, as CactusWorkflowArguments wants a path
    experiment.writeXML(experimentFile)

    #import cactus config
    if options.configFile:
        cactusConfigID = toil.importFile(makeURL(options.configFile))
    else:
        cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
    project.setConfigID(cactusConfigID)

    project.syncToFileStore(toil)
    configNode = ET.parse(project.getConfigPath()).getroot()
    configWrapper = ConfigWrapper(configNode)
    configWrapper.substituteAllPredefinedConstantsWithLiterals()

    if options.singleCopySpecies:
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["alignmentFilter"] = "singleCopyEvent:{}".format(
                options.singleCopySpecies)

    if options.barMaskFilter:
        findRequiredNode(
            configWrapper.xmlRoot,
            "bar").attrib["partialOrderAlignmentMaskFilter"] = str(
                options.barMaskFilter)

    if options.pangenome:
        # turn off the megablock filter as it ruins non-all-to-all alignments
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["minimumBlockHomologySupport"] = "0"
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["minimumBlockDegreeToCheckSupport"] = "9999999999"
        # turn off mapq filtering
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["runMapQFiltering"] = "0"
        # more iterations here helps quite a bit to reduce underalignment
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["maxRecoverableChainsIterations"] = "50"
        # turn down minimum block degree to get a fat ancestor
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["minimumBlockDegree"] = "1"
        # turn on POA
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["partialOrderAlignment"] = "1"
        # save it
        if not options.batch:
            pg_file = options.outHal + ".pg-conf.xml"
            # for an s3 destination, write to a local temp file first and
            # upload it afterwards
            if pg_file.startswith('s3://'):
                pg_temp_file = getTempFile()
            else:
                pg_temp_file = pg_file
            configWrapper.writeXML(pg_temp_file)
            if pg_file.startswith('s3://'):
                write_s3(pg_temp_file,
                         pg_file,
                         region=get_aws_region(options.jobStore))
            logger.info("pangenome configuration overrides saved in {}".format(
                pg_file))

    workFlowArgs = CactusWorkflowArguments(options,
                                           experimentFile=experimentFile,
                                           configNode=configNode,
                                           seqIDMap=project.inputSequenceIDMap)

    #import the files that cactus-blast made
    workFlowArgs.alignmentsID = toil.importFile(makeURL(get_input_path()))
    workFlowArgs.secondaryAlignmentsID = None
    if not options.pafInput:
        try:
            workFlowArgs.secondaryAlignmentsID = toil.importFile(
                makeURL(get_input_path('.secondary')))
        except Exception:
            # secondary alignments are optional: leave the ID as None
            pass
    workFlowArgs.outgroupFragmentIDs = outgroupIDs
    workFlowArgs.ingroupCoverageIDs = []
    if outgroup_fragment_found and len(outgroups) > 0:
        for i in range(len(leaves)):
            workFlowArgs.ingroupCoverageIDs.append(
                toil.importFile(
                    makeURL(get_input_path('.ig_coverage_{}'.format(i)))))

    align_job = Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              checkpointInfo=options.checkpointInfo,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput,
                              pafSecondaries=options.usePafSecondaries,
                              doVG=options.outVG,
                              doGFA=options.outGFA,
                              delay=options.stagger,
                              eventNameAsID=options.eventNameAsID,
                              acyclicEvent=options.acyclic)
    return align_job
def main():
    """Command-line entry point for preprocessing the genomes of a seqfile.

    Takes an input seqfile and an output seqfile (as generated by
    cactus-prepare), optionally restricted to ``--inputNames``, collects the
    matching input and output fasta paths and passes them to
    ``stageWorkflow``.

    Raises:
        RuntimeError: if a requested genome name is missing from either
            seqfile.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("seqFile", help="Input Seq file")
    parser.add_argument(
        "outSeqFile",
        help="Output Seq file (ex generated with cactus-prepare)")
    parser.add_argument("--configFile",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))
    parser.add_argument(
        "--inputNames",
        nargs='*',
        help=
        'input genome names (not paths) to preprocess (all leaves from Input Seq file if none specified)'
    )
    parser.add_argument(
        "--latest",
        dest="latest",
        action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument(
        "--containerImage",
        dest="containerImage",
        default=None,
        help="Use the the specified pre-built containter image "
        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    inSeqFile = SeqFile(options.seqFile)
    outSeqFile = SeqFile(options.outSeqFile)

    inNames = options.inputNames
    if not inNames:
        # default to every leaf of the input tree
        inNames = [
            inSeqFile.tree.getName(node)
            for node in inSeqFile.tree.getLeaves()
        ]

    inSeqPaths = []
    outSeqPaths = []
    for inName in inNames:
        if inName not in inSeqFile.pathMap or inName not in outSeqFile.pathMap:
            # (the format argument used to be a misspelled name, which raised
            # NameError instead of this error)
            raise RuntimeError(
                '{} not present in input and output Seq files'.format(inName))
        inPath = inSeqFile.pathMap[inName]
        outPath = outSeqFile.pathMap[inName]
        if os.path.isdir(inPath):
            # directory input: mirror every contained file into outPath
            try:
                os.makedirs(outPath)
            except OSError:
                # already exists or cannot be created: checked just below
                pass
            assert os.path.isdir(inPath) == os.path.isdir(outPath)
            seqNames = os.listdir(inPath)
            inSeqPaths += [os.path.join(inPath, seqPath) for seqPath in seqNames]
            outSeqPaths += [os.path.join(outPath, seqPath) for seqPath in seqNames]
        else:
            inSeqPaths += [inPath]
            outSeqPaths += [outPath]

    with Toil(options) as toil:
        stageWorkflow(outputSequenceDir=None,
                      configFile=options.configFile,
                      inputSequences=inSeqPaths,
                      toil=toil,
                      restart=options.restart,
                      outputSequences=outSeqPaths)
def main():
    """Command-line entry point: preprocess sequences from Seq files or paths.

    Two mutually exclusive input modes are supported:

    * Seq-file mode: positional ``inSeqFile`` and ``outSeqFile`` (optionally
      narrowed with ``--inputNames`` / ``--ignore``); paths are looked up in
      the two files' path maps and directories are expanded to their files.
    * Path mode: ``--inPaths`` and ``--outPaths`` of equal length.

    The resulting parallel path lists, together with the masking options,
    are passed to ``stageWorkflow`` inside a ``Toil`` context.

    Raises:
        RuntimeError: on an invalid option combination, on a genome name
            missing from either Seq file, or when an output directory cannot
            be created.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("inSeqFile",
                        type=str,
                        nargs='?',
                        default=None,
                        help="Input Seq file")
    parser.add_argument(
        "outSeqFile",
        type=str,
        nargs='?',
        default=None,
        help="Output Seq file (ex generated with cactus-prepare)")
    parser.add_argument("--configFile",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))
    parser.add_argument(
        "--inputNames",
        nargs='*',
        help='input genome names (not paths) to preprocess (all leaves from Input Seq file if none specified)')
    parser.add_argument(
        "--inPaths",
        nargs='*',
        help='Space-separated list of input fasta paths (to be used in place of --inSeqFile')
    parser.add_argument(
        "--outPaths",
        nargs='*',
        help='Space-separated list of output fasta paths (one for each inPath, used in place of --outSeqFile)')
    parser.add_argument("--maskAlpha",
                        action='store_true',
                        help='Use dna-brnn instead of lastz for repeatmasking')
    parser.add_argument(
        "--clipAlpha",
        action='store_true',
        help='use dna-brnn instead of lastz for repeatmasking. Also, clip sequence using given minimum length instead of softmasking')
    parser.add_argument(
        "--ignore",
        nargs='*',
        help='Space-separate list of genomes from inSeqFile to ignore',
        default=[])
    parser.add_argument(
        "--maskPAF",
        type=str,
        help='Incorporate coverage gaps from given PAF when masking. Only implemented for dna-brnn masking')
    parser.add_argument(
        "--brnnCores",
        type=int,
        help='Specify number of cores for each dna-brnn job (overriding default value from the config)')
    parser.add_argument(
        "--latest",
        dest="latest",
        action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument(
        "--containerImage",
        dest="containerImage",
        default=None,
        help="Use the the specified pre-built containter image "
        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # we have two modes: operate directly on paths or rely on the seqfiles.
    # they cannot be mixed
    if options.inSeqFile or options.outSeqFile:
        if not options.inSeqFile or not options.outSeqFile or options.inPaths or options.outPaths:
            raise RuntimeError(
                '--inSeqFile must be used in conjunction with --outSeqFile and not with --inPaths nor --outPaths'
            )
    elif options.inPaths or options.outPaths:
        if not options.inPaths or not options.outPaths or options.inSeqFile or options.outSeqFile:
            raise RuntimeError(
                '--inPaths must be used in conjunction with --outPaths and not with --inSeqFile nor --outSeqFile'
            )
        if len(options.inPaths) != len(options.outPaths):
            raise RuntimeError(
                '--inPaths and --outPaths must have the same number of arguments'
            )
    else:
        raise RuntimeError(
            '--inSeqFile/--outSeqFile/--inputNames or --inPaths/--outPaths required to specify input'
        )
    if options.maskAlpha and options.clipAlpha:
        raise RuntimeError(
            '--maskAlpha and --clipAlpha cannot be used together')
    if options.clipAlpha:
        # clipping implies dna-brnn masking
        options.maskAlpha = True
    if options.maskPAF and not options.inputNames and not options.inSeqFile:
        raise RuntimeError(
            '--maskPAF requires event names specified wither with an input seqfile or with --inputNames'
        )
    # NOTE(review): --clipAlpha is a store_true flag, so it is False (never
    # None) when absent and this check cannot fire. Left as-is on purpose:
    # enabling it would reject --ignore invocations that work today.
    if options.ignore and options.clipAlpha is None:
        raise RuntimeError('--ignore can only be used with --clipAlpha')

    inSeqPaths = []
    outSeqPaths = []
    inNames = options.inputNames
    eventNames = []

    # load cactus config
    configNode = ET.parse(options.configFile).getroot()
    # we never want to preprocess minigraph sequences
    graph_event = getOptionalAttrib(findRequiredNode(configNode, "graphmap"),
                                    "assemblyName",
                                    default="_MINIGRAPH_")
    options.ignore.append(graph_event)

    # mine the paths out of the seqfiles
    if options.inSeqFile:
        inSeqFile = SeqFile(options.inSeqFile)
        outSeqFile = SeqFile(options.outSeqFile)

        # Default to every leaf of the input tree when no names were given.
        if not inNames:
            inNames = [
                inSeqFile.tree.getName(node)
                for node in inSeqFile.tree.getLeaves()
            ]

        for inName in inNames:
            if inName in options.ignore:
                # "convenience" functionality: we let the --ignore option
                # update the output seqfile to reflect the fact that we're
                # not touching the original input
                outSeqFile.pathMap[inName] = inSeqFile.pathMap[inName]
                continue
            if inName not in inSeqFile.pathMap or inName not in outSeqFile.pathMap:
                raise RuntimeError(
                    '{} not present in input and output Seq files'.format(
                        inName))
            inPath = inSeqFile.pathMap[inName]
            outPath = outSeqFile.pathMap[inName]
            if os.path.isdir(inPath):
                # A directory input maps to a directory output holding one
                # file per input file; fail loudly if it can't be created
                # (previously a bare except hid the error behind an assert).
                try:
                    os.makedirs(outPath, exist_ok=True)
                except OSError as err:
                    raise RuntimeError(
                        'Unable to create output directory \'{}\''.format(
                            outPath)) from err
                # List once so input and output stay index-aligned.
                seqNames = os.listdir(inPath)
                inSeqPaths += [
                    os.path.join(inPath, seqName) for seqName in seqNames
                ]
                outSeqPaths += [
                    os.path.join(outPath, seqName) for seqName in seqNames
                ]
            else:
                inSeqPaths += [inPath]
                outSeqPaths += [outPath]
            eventNames.append(inName)

        # options.ignore always holds at least graph_event at this point, so
        # the output seqfile is rewritten on every seqfile-mode run.
        if options.ignore:
            # see comment above
            with open(options.outSeqFile, 'w') as outSF:
                outSF.write(str(outSeqFile))

    # we got path names directly from the command line
    else:
        # NOTE(review): --inputNames is not forwarded as event names in this
        # mode even though the --maskPAF validation accepts it; confirm
        # whether stageWorkflow needs them here.
        inSeqPaths = options.inPaths
        outSeqPaths = options.outPaths

    with Toil(options) as toil:
        stageWorkflow(outputSequenceDir=None,
                      configFile=options.configFile,
                      inputSequences=inSeqPaths,
                      toil=toil,
                      restart=options.restart,
                      outputSequences=outSeqPaths,
                      maskAlpha=options.maskAlpha,
                      clipAlpha=options.clipAlpha,
                      maskPAF=options.maskPAF,
                      inputEventNames=eventNames,
                      brnnCores=options.brnnCores)
def runCactusBlastOnly(options):
    """Run the stand-alone trimming-blast phase and export its results.

    On a fresh run this builds the progressive project from
    ``options.seqFile`` (after applying any ``options.pathOverrides``),
    imports the sequences of the leaves and outgroups of ``options.root``
    into the Toil file store, and starts ``CactusTrimmingBlastPhase``. On
    ``options.restart`` the interrupted workflow is resumed instead.

    In both cases the returned workflow arguments are used to export:

    * the alignments to ``options.outputFile``,
    * optional secondary alignments to ``<outputFile>.secondary``,
    * outgroup fragments to ``<outputFile>.og_fragment_<i>``,
    * ingroup coverage to ``<outputFile>.ig_coverage_<i>``.

    Args:
        options: parsed command-line options; ``cactusDir`` and possibly
            ``seqFile`` are overwritten on a fresh run.
    """
    with Toil(options) as toil:
        importSingularityImage(options)
        # Run the workflow
        if options.restart:
            # Bind the restarted workflow's result to the name the export
            # code below reads; it was previously stored in an unused
            # variable, so a restarted run could not export its outputs.
            outWorkFlowArgs = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides. this was necessary for wdl which doesn't
            # take kindly to text files of local paths (ie seqfile). one way
            # to fix would be to add support for s3 paths and force wdl to
            # use it. a better way would be a more fundamental interface
            # shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(
                    prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames,
                                          options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir,
                                            'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            # to be consistent with all-in-one cactus, we make sure the
            # project isn't limiting itself to the subtree (todo:
            # parameterize so root can be passed through from prepare to
            # blast/align)
            proj_options = copy.deepcopy(options)
            proj_options.root = None
            # Create the progressive cactus project (as we do in
            # runCactusProgressive)
            projWrapper = ProjectWrapper(proj_options,
                                         proj_options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            logger.info("Experiment {}".format(ET.tostring(expXml)))
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            # The parsed tree is not used below; the parse is kept so an
            # unreadable experiment config still fails at this point.
            configXml = ET.parse(configPath).getroot()

            tree = MultiCactusTree(experiment.getTree()).extractSubTree(
                options.root)
            leaves = tree.getChildNames(tree.getRootName())
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)
            logger.info("Genomes in blastonly, {}: {}".format(
                options.root, list(genome_set)))

            print(str(project.inputSequenceMap))

            # import the sequences (that we need to align for the given
            # event, ie leaves and outgroups). Iterate over a snapshot
            # because out-of-scope entries are deleted from the map.
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in genome_set:
                    if os.path.isdir(seq):
                        # a directory of files is concatenated into one
                        # temporary file before import
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    project.inputSequenceIDMap[genome] = toil.importFile(seq)
                else:
                    # out-of-scope sequences will only cause trouble later on
                    del project.inputSequenceMap[genome]

            # import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            workFlowArgs = CactusWorkflowArguments(
                options,
                experimentFile=experimentFile,
                configNode=configNode,
                seqIDMap=project.inputSequenceIDMap)

            outWorkFlowArgs = toil.start(
                CactusTrimmingBlastPhase(standAlone=True,
                                         cactusWorkflowArguments=workFlowArgs,
                                         phaseName="trimBlast"))

        # export the alignments
        toil.exportFile(outWorkFlowArgs.alignmentsID,
                        makeURL(options.outputFile))
        # optional secondary alignments
        if outWorkFlowArgs.secondaryAlignmentsID:
            toil.exportFile(outWorkFlowArgs.secondaryAlignmentsID,
                            makeURL(options.outputFile) + '.secondary')
        # outgroup fragments and coverage are necessary for cactus-align, as
        # the sequence names got changed in the above alignments
        for i, outgroupFragmentID in enumerate(
                outWorkFlowArgs.outgroupFragmentIDs):
            toil.exportFile(
                outgroupFragmentID,
                makeURL(options.outputFile) + '.og_fragment_{}'.format(i))
        # cactus-align can recompute coverage on the fly, but we save them
        # because we have them
        for i, ingroupCoverageID in enumerate(
                outWorkFlowArgs.ingroupCoverageIDs):
            toil.exportFile(
                ingroupCoverageID,
                makeURL(options.outputFile) + '.ig_coverage_{}'.format(i))