def merge_pafs(job, paf_file_id_map):
    """ merge up some pafs """
    paf_paths = [job.fileStore.readGlobalFile(paf_id) for paf_id in paf_file_id_map.values()]
    merged_path = job.fileStore.getLocalTempFile()
    catFiles(paf_paths, merged_path)
    return job.fileStore.writeGlobalFile(merged_path)
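# A minimal wiring sketch, assuming Toil's promise API (job.addChildJobFn / .rv());
# 'merge_mapping_pafs' is an illustrative name, not part of cactus.
def merge_mapping_pafs(job, paf_file_id_map):
    # schedule merge_pafs as a child job and return a promise for the merged PAF's FileID
    return job.addChildJobFn(merge_pafs, paf_file_id_map).rv()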
def runCactusProgressive(options):
    with Toil(options) as toil:
        importSingularityImage()

        # Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # Create the progressive cactus project
            projWrapper = ProjectWrapper(options)
            projWrapper.writeXml()

            pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # import the sequences
            for genome, seq in project.inputSequenceMap.items():
                if os.path.isdir(seq):
                    tmpSeq = getTempFile()
                    catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                project.inputSequenceIDMap[genome] = toil.importFile(seq)

            # import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            project.writeXML(pjPath)
            halID = toil.start(RunCactusPreprocessorThenProgressiveDown(
                options, project, memory=configWrapper.getDefaultMemory()))

        toil.exportFile(halID, makeURL(options.outputHal))
def combine_ref_contig_splits(job, original_ref_entry, remap_ref_entry):
    """ combine fa and paf files for splits for a ref contig """
    work_dir = job.fileStore.getLocalTempDir()
    # combine the event fas
    for event in original_ref_entry['fa']:
        if event in remap_ref_entry['fa']:
            orig_fa_path = os.path.join(work_dir, event + '.fa')
            remap_fa_path = os.path.join(work_dir, event + '.remap.fa')
            new_fa_path = os.path.join(work_dir, event + '.combine.fa')
            job.fileStore.readGlobalFile(original_ref_entry['fa'][event], orig_fa_path, mutable=True)
            job.fileStore.readGlobalFile(remap_ref_entry['fa'][event], remap_fa_path, mutable=True)
            catFiles([orig_fa_path, remap_fa_path], new_fa_path)
            original_ref_entry['fa'][event] = job.fileStore.writeGlobalFile(new_fa_path)
            os.remove(orig_fa_path)
            os.remove(remap_fa_path)
    return original_ref_entry
def split_gfa(job, config, gfa_id, paf_ids, ref_contigs, other_contig, reference_event, mask_bed_id):
    """ Use rgfa-split to divide a GFA and PAF into chromosomes.  The GFA must be minigraph
    rGFA output using the desired reference. """
    if not paf_ids:
        # we can bypass when, ex, doing second pass on ambiguous sequences but none are present
        return [None, None]

    work_dir = job.fileStore.getLocalTempDir()
    gfa_path = os.path.join(work_dir, "mg.gfa")
    paf_path = os.path.join(work_dir, "mg.paf")
    out_prefix = os.path.join(work_dir, "split_")
    bed_path = os.path.join(work_dir, "mask.bed")
    log_path = os.path.join(work_dir, "split.log")
    if mask_bed_id:
        job.fileStore.readGlobalFile(mask_bed_id, bed_path)

    if gfa_id:
        job.fileStore.readGlobalFile(gfa_id, gfa_path)

    paf_paths = []
    for i, paf_id in enumerate(paf_ids):
        paf_paths.append('{}.{}'.format(paf_path, i) if len(paf_ids) > 1 else paf_path)
        job.fileStore.readGlobalFile(paf_id, paf_paths[-1])
    if len(paf_paths) > 1:
        catFiles(paf_paths, paf_path)

    # get the minigraph "virtual" assembly name
    graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "assemblyName", default="_MINIGRAPH_")
    # and look up its unique id prefix.  this will be needed to pick its contigs out of the list
    mg_id = graph_event

    # get the specificity filters
    query_coverage = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQueryCoverage", default="0")
    small_query_coverage = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQuerySmallCoverage", default="0")
    small_coverage_threshold = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQuerySmallThreshold", default="0")
    query_uniqueness = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQueryUniqueness", default="0")
    max_gap = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "maxGap", default="0")
    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "ambiguousName", default="_AMBIGUOUS_")

    cmd = ['rgfa-split',
           '-p', paf_path,
           '-b', out_prefix,
           '-n', query_coverage,
           '-N', small_query_coverage,
           '-T', small_coverage_threshold,
           '-Q', query_uniqueness,
           '-P', max_gap,
           '-a', amb_name,
           '-L', log_path]
    if gfa_id:
        cmd += ['-g', gfa_path, '-G']
    if other_contig:
        cmd += ['-o', other_contig]
    if reference_event:
        cmd += ['-r', 'id={}|'.format(reference_event)]
    if mask_bed_id:
        cmd += ['-B', bed_path]
    min_mapq = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minMAPQ")
    if min_mapq:
        cmd += ['-A', min_mapq]

    for contig in ref_contigs:
        cmd += ['-c', contig]

    cactus_call(parameters=cmd, work_dir=work_dir)

    output_id_map = {}
    for out_name in os.listdir(work_dir):
        file_name, ext = os.path.splitext(out_name)
        if file_name.startswith(os.path.basename(out_prefix)) and ext in [".gfa", ".paf", ".fa_contigs"]:
            name = file_name[len(os.path.basename(out_prefix)):]
            if name not in output_id_map:
                output_id_map[name] = {}
            output_id_map[name][ext[1:]] = job.fileStore.writeGlobalFile(os.path.join(work_dir, out_name))

    return output_id_map, job.fileStore.writeGlobalFile(log_path)
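# Illustrative only: the graphmap_split attributes read above would come from a cactus
# config XML element shaped roughly like this (attribute values are made-up placeholders):
#
#   <graphmap_split minQueryCoverage="0.65" minQuerySmallCoverage="0"
#                   minQuerySmallThreshold="0" minQueryUniqueness="2"
#                   maxGap="0" ambiguousName="_AMBIGUOUS_"/>
#
# The returned output_id_map is keyed on contig name, then file type, e.g.
#   {'chr1': {'gfa': FileID, 'paf': FileID, 'fa_contigs': FileID}, ...}
# A hypothetical consumer sketch ('export_split_pafs' and 'out_dir' are illustrative names):
def export_split_pafs(toil, output_id_map, out_dir):
    # export each per-chromosome PAF produced by split_gfa
    for contig_name, file_ids in output_id_map.items():
        if 'paf' in file_ids:
            toil.exportFile(file_ids['paf'], makeURL(os.path.join(out_dir, contig_name + '.paf')))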
def cat_beds(job, bed_ids):
    """ concatenate BED files into a single file """
    in_beds = [job.fileStore.readGlobalFile(bed_id) for bed_id in bed_ids]
    out_bed = job.fileStore.getLocalTempFile()
    catFiles(in_beds, out_bed)
    return job.fileStore.writeGlobalFile(out_bed)
def runCactusGraphMapSplit(options):
    with Toil(options) as toil:
        importSingularityImage(options)

        # Run the workflow
        if options.restart:
            wf_output = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # load cactus config
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            config.substituteAllPredefinedConstantsWithLiterals()

            # load up the contigs if any
            ref_contigs = set(options.refContigs)
            # todo: use import?
            if options.refContigsFile:
                with open(options.refContigsFile, 'r') as rc_file:
                    for line in rc_file:
                        if len(line.strip()):
                            ref_contigs.add(line.strip().split()[0])

            if options.otherContig:
                assert options.otherContig not in ref_contigs

            # get the minigraph "virtual" assembly name
            graph_event = getOptionalAttrib(findRequiredNode(configNode, "graphmap"), "assemblyName", default="_MINIGRAPH_")

            # load the seqfile
            seqFile = SeqFile(options.seqFile)

            # import the graph
            gfa_id = toil.importFile(makeURL(options.minigraphGFA))

            # import the paf
            paf_id = toil.importFile(makeURL(options.graphmapPAF))

            # import the sequences (that we need to align for the given event, ie leaves and outgroups)
            seqIDMap = {}
            leaves = set([seqFile.tree.getName(node) for node in seqFile.tree.getLeaves()])

            if graph_event not in leaves:
                raise RuntimeError("Minigraph name {} not found in seqfile".format(graph_event))
            if options.reference and options.reference not in leaves:
                raise RuntimeError("Name given with --reference {} not found in seqfile".format(options.reference))

            for genome, seq in seqFile.pathMap.items():
                if genome in leaves:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    logger.info("Importing {}".format(seq))
                    seqIDMap[genome] = (seq, toil.importFile(seq))

            # run the workflow
            wf_output = toil.start(Job.wrapJobFn(graphmap_split_workflow, options, config, seqIDMap,
                                                 gfa_id, options.minigraphGFA,
                                                 paf_id, options.graphmapPAF,
                                                 ref_contigs, options.otherContig))

        # export the split data
        export_split_data(toil, wf_output[0], wf_output[1], wf_output[2:], options.outDir, config)
def make_align_job(options, toil):
    options.cactusDir = getTempDirectory()

    # apply path overrides.  this was necessary for wdl which doesn't take kindly to
    # text files of local paths (ie seqfile).  one way to fix would be to add support
    # for s3 paths and force wdl to use it.  a better way would be a more fundamental
    # interface shift away from files of paths throughout all of cactus
    if options.pathOverrides:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        tree = MultiCactusTree(seqFile.tree)
        tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())
        for name, override in zip(options.pathOverrideNames, options.pathOverrides):
            seqFile.pathMap[name] = override
        override_seq = os.path.join(options.cactusDir, 'seqFile.override')
        with open(override_seq, 'w') as out_sf:
            out_sf.write(str(seqFile))
        options.seqFile = override_seq

    if not options.root:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        mcTree = MultiCactusTree(seqFile.tree)
        mcTree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())
        options.root = mcTree.getRootName()

    if options.acyclic:
        seqFile = SeqFile(options.seqFile)
        tree = MultiCactusTree(seqFile.tree)
        leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
        if options.acyclic not in leaves:
            raise RuntimeError("Genome specified with --acyclic, {}, not found in tree leaves".format(options.acyclic))

    # to be consistent with all-in-one cactus, we make sure the project
    # isn't limiting itself to the subtree (todo: parameterize so root can
    # be passed through from prepare to blast/align)
    proj_options = copy.deepcopy(options)
    proj_options.root = None

    # Create the progressive cactus project (as we do in runCactusProgressive)
    projWrapper = ProjectWrapper(proj_options, proj_options.configFile, ignoreSeqPaths=options.root)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    # open up the experiment (as we do in ProgressiveUp.run)
    # note that we copy the path into the options here
    experimentFile = project.expMap[options.root]
    expXml = ET.parse(experimentFile).getroot()
    experiment = ExperimentWrapper(expXml)
    configPath = experiment.getConfigPath()
    configXml = ET.parse(configPath).getroot()

    seqIDMap = dict()
    tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
    leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
    outgroups = experiment.getOutgroupGenomes()
    genome_set = set(leaves + outgroups)

    # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
    def get_input_path(suffix=''):
        base_path = options.cigarsFile[0]
        for input_path in options.cigarsFile:
            if suffix and input_path.endswith(suffix):
                return input_path
            if os.path.basename(base_path).startswith(os.path.basename(input_path)):
                base_path = input_path
        return base_path + suffix

    # import the outgroups
    outgroupIDs = []
    outgroup_fragment_found = False
    for i, outgroup in enumerate(outgroups):
        try:
            outgroupID = toil.importFile(makeURL(get_input_path('.og_fragment_{}'.format(i))))
            outgroupIDs.append(outgroupID)
            experiment.setSequenceID(outgroup, outgroupID)
            outgroup_fragment_found = True
            assert not options.pangenome
        except Exception:
            # we assume that input is not coming from cactus blast, so we'll treat output
            # sequences normally and not go looking for fragments
            outgroupIDs = []
            break

    # import the sequences (that we need to align for the given event, ie leaves and outgroups)
    for genome, seq in list(project.inputSequenceMap.items()):
        if genome in leaves or (not outgroup_fragment_found and genome in outgroups):
            if os.path.isdir(seq):
                tmpSeq = getTempFile()
                catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                seq = tmpSeq
            seq = makeURL(seq)
            logger.info("Importing {}".format(seq))
            experiment.setSequenceID(genome, toil.importFile(seq))

    if not outgroup_fragment_found:
        outgroupIDs = [experiment.getSequenceID(outgroup) for outgroup in outgroups]

    # write back the experiment, as CactusWorkflowArguments wants a path
    experiment.writeXML(experimentFile)

    # import cactus config
    if options.configFile:
        cactusConfigID = toil.importFile(makeURL(options.configFile))
    else:
        cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
    project.setConfigID(cactusConfigID)

    project.syncToFileStore(toil)
    configNode = ET.parse(project.getConfigPath()).getroot()
    configWrapper = ConfigWrapper(configNode)
    configWrapper.substituteAllPredefinedConstantsWithLiterals()

    if options.singleCopySpecies:
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["alignmentFilter"] = \
            "singleCopyEvent:{}".format(options.singleCopySpecies)

    if options.barMaskFilter:
        findRequiredNode(configWrapper.xmlRoot, "bar").attrib["partialOrderAlignmentMaskFilter"] = \
            str(options.barMaskFilter)

    if options.pangenome:
        # turn off the megablock filter as it ruins non-all-to-all alignments
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["minimumBlockHomologySupport"] = "0"
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["minimumBlockDegreeToCheckSupport"] = "9999999999"
        # turn off mapq filtering
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["runMapQFiltering"] = "0"
        # more iterations here helps quite a bit to reduce underalignment
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["maxRecoverableChainsIterations"] = "50"
        # turn down minimum block degree to get a fat ancestor
        findRequiredNode(configWrapper.xmlRoot, "bar").attrib["minimumBlockDegree"] = "1"
        # turn on POA
        findRequiredNode(configWrapper.xmlRoot, "bar").attrib["partialOrderAlignment"] = "1"
        # save it
        if not options.batch:
            pg_file = options.outHal + ".pg-conf.xml"
            if pg_file.startswith('s3://'):
                pg_temp_file = getTempFile()
            else:
                pg_temp_file = pg_file
            configWrapper.writeXML(pg_temp_file)
            if pg_file.startswith('s3://'):
                write_s3(pg_temp_file, pg_file, region=get_aws_region(options.jobStore))
            logger.info("pangenome configuration overrides saved in {}".format(pg_file))

    workFlowArgs = CactusWorkflowArguments(options, experimentFile=experimentFile,
                                           configNode=configNode, seqIDMap=project.inputSequenceIDMap)

    # import the files that cactus-blast made
    workFlowArgs.alignmentsID = toil.importFile(makeURL(get_input_path()))
    workFlowArgs.secondaryAlignmentsID = None
    if not options.pafInput:
        try:
            workFlowArgs.secondaryAlignmentsID = toil.importFile(makeURL(get_input_path('.secondary')))
        except Exception:
            pass
    workFlowArgs.outgroupFragmentIDs = outgroupIDs
    workFlowArgs.ingroupCoverageIDs = []
    if outgroup_fragment_found and len(outgroups) > 0:
        for i in range(len(leaves)):
            workFlowArgs.ingroupCoverageIDs.append(
                toil.importFile(makeURL(get_input_path('.ig_coverage_{}'.format(i)))))

    align_job = Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              checkpointInfo=options.checkpointInfo,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput,
                              pafSecondaries=options.usePafSecondaries,
                              doVG=options.outVG,
                              doGFA=options.outGFA,
                              delay=options.stagger,
                              eventNameAsID=options.eventNameAsID,
                              acyclicEvent=options.acyclic)
    return align_job
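# A minimal driver sketch, assuming the same Toil setup as the run* entry points above;
# the name 'run_align_standalone' is illustrative, not cactus API.
def run_align_standalone(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        if options.restart:
            results = toil.restart()
        else:
            align_job = make_align_job(options, toil)
            results = toil.start(align_job)
        return results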
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outputHal", type=str, help="Output HAL file")

    # Progressive Cactus Options
    parser.add_argument("--database", dest="database",
                        help="Database type: tokyo_cabinet or kyoto_tycoon"
                        " [default: %(default)s]",
                        default="kyoto_tycoon")
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=None)
    parser.add_argument("--root", dest="root",
                        help="Name of ancestral node (which"
                        " must appear in NEWICK tree in <seqfile>) to use as a "
                        "root for the alignment.  Any genomes not below this node "
                        "in the tree may be used as outgroups but will never appear"
                        " in the output.  If no root is specified then the root"
                        " of the tree is used.",
                        default=None)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest, locally-built docker container "
                        "rather than pulling from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()
    options.cactusDir = getTempDirectory()

    setupBinaries(options)
    setLoggingFromOptions(options)

    # Mess with some toil options to create useful defaults.

    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    # The default deadlockWait is currently 60 seconds. This can cause
    # issues if the database processes take a while to actually begin
    # after they're issued. Change it to at least an hour so that we
    # don't preemptively declare a deadlock.
    if options.deadlockWait is None or options.deadlockWait < 3600:
        options.deadlockWait = 3600
    if options.retryCount is None:
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5

    # Create the progressive cactus project
    projWrapper = ProjectWrapper(options)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    with Toil(options) as toil:
        importSingularityImage()

        # Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            project.readXML(pjPath)

            # import the sequences
            seqIDs = []
            for seq in project.getInputSequencePaths():
                if os.path.isdir(seq):
                    tmpSeq = getTempFile()
                    catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                seqIDs.append(toil.importFile(seq))
            project.setInputSequenceIDs(seqIDs)

            # import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            logger.info("Setting config id to: %s" % cactusConfigID)
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            project.writeXML(pjPath)
            halID = toil.start(RunCactusPreprocessorThenProgressiveDown(
                options, project, memory=configWrapper.getDefaultMemory()))

        toil.exportFile(halID, makeURL(options.outputHal))
def runCactusGraphMap(options):
    with Toil(options) as toil:
        importSingularityImage(options)

        # Run the workflow
        if options.restart:
            paf_id, gfa_fa_id, gaf_id_map = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides.  this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile).  one way to fix would be to add support
            # for s3 paths and force wdl to use it.  a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames, options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir, 'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            # load cactus config
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            config.substituteAllPredefinedConstantsWithLiterals()

            # apply the maskfilter override
            if options.maskFilter is not None:
                findRequiredNode(configNode, "graphmap").attrib["maskFilter"] = str(options.maskFilter)

            # get the minigraph "virtual" assembly name
            graph_event = getOptionalAttrib(findRequiredNode(configNode, "graphmap"), "assemblyName", default="_MINIGRAPH_")

            # load the seqfile
            seqFile = SeqFile(options.seqFile)

            if options.refFromGFA:
                if options.refFromGFA not in seqFile.pathMap:
                    raise RuntimeError("{}, specified with --refFromGFA, was not found in the seqfile".format(options.refFromGFA))
                # we're not going to need the fasta for anything, so forget about it now
                del seqFile.pathMap[options.refFromGFA]

            if not options.outputFasta and graph_event not in seqFile.pathMap:
                raise RuntimeError("{} assembly not found in seqfile so it must be specified with --outputFasta".format(graph_event))

            # import the graph
            gfa_id = toil.importFile(makeURL(options.minigraphGFA))

            # import the sequences (that we need to align for the given event, ie leaves and outgroups)
            seqIDMap = {}
            leaves = set([seqFile.tree.getName(node) for node in seqFile.tree.getLeaves()])
            for genome, seq in seqFile.pathMap.items():
                if genome != graph_event and genome in leaves and genome != options.refFromGFA:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    logger.info("Importing {}".format(seq))
                    seqIDMap[genome] = (seq, toil.importFile(seq))

            # run the workflow
            paf_id, gfa_fa_id, gaf_id_map = toil.start(Job.wrapJobFn(minigraph_workflow, options, config,
                                                                     seqIDMap, gfa_id, graph_event))

        # export the paf
        toil.exportFile(paf_id, makeURL(options.outputPAF))
        if gfa_fa_id:
            toil.exportFile(gfa_fa_id, makeURL(options.outputFasta))

        # export the gafs
        if options.outputGAFDir:
            for event, gaf_id in gaf_id_map.items():
                gaf_path = os.path.join(options.outputGAFDir, '{}.gaf.gz'.format(event))
                toil.exportFile(gaf_id, makeURL(gaf_path))

        # update the input seqfile (in place!)
        if options.outputFasta:
            add_genome_to_seqfile(options.seqFile, makeURL(options.outputFasta), graph_event)
def split_gfa(job, config, gfa_id, paf_ids, ref_contigs, other_contig, reference_event, mask_bed_id):
    """ Use rgfa-split to divide a GFA and PAF into chromosomes.  The GFA must be minigraph
    rGFA output using the desired reference. """
    if not paf_ids:
        # we can bypass when, ex, doing second pass on ambiguous sequences but none are present
        return [None, None]

    if not gfa_id and not getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "remap",
                                            typeFn=bool, default=False):
        # also bypass if remapping is off in the config (we know it's the second pass because gfa_id is None)
        return [None, None]

    work_dir = job.fileStore.getLocalTempDir()
    gfa_path = os.path.join(work_dir, "mg.gfa")
    paf_path = os.path.join(work_dir, "mg.paf")
    out_prefix = os.path.join(work_dir, "split_")
    bed_path = os.path.join(work_dir, "mask.bed")
    log_path = os.path.join(work_dir, "split.log")
    if mask_bed_id:
        job.fileStore.readGlobalFile(mask_bed_id, bed_path)

    if gfa_id:
        job.fileStore.readGlobalFile(gfa_id, gfa_path)

    paf_paths = []
    for i, paf_id in enumerate(paf_ids):
        paf_paths.append('{}.{}'.format(paf_path, i) if len(paf_ids) > 1 else paf_path)
        job.fileStore.readGlobalFile(paf_id, paf_paths[-1])
    if len(paf_paths) > 1:
        catFiles(paf_paths, paf_path)

    # get the minigraph "virtual" assembly name
    graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "assemblyName", default="_MINIGRAPH_")
    # and look up its unique id prefix.  this will be needed to pick its contigs out of the list
    mg_id = graph_event

    # get the specificity filters
    query_coverage = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQueryCoverage", default="0")
    small_query_coverage = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQuerySmallCoverage", default="0")
    small_coverage_threshold = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQuerySmallThreshold", default="0")
    query_uniqueness = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQueryUniqueness", default="0")
    max_gap = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "maxGap", default="0")
    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "ambiguousName", default="_AMBIGUOUS_")

    cmd = ['rgfa-split',
           '-p', paf_path,
           '-b', out_prefix,
           '-n', query_coverage,
           '-N', small_query_coverage,
           '-T', small_coverage_threshold,
           '-Q', query_uniqueness,
           '-P', max_gap,
           '-a', amb_name,
           '-L', log_path]
    if gfa_id:
        cmd += ['-g', gfa_path, '-G']
    if other_contig:
        cmd += ['-o', other_contig]
    if reference_event:
        cmd += ['-r', 'id={}|'.format(reference_event)]
    if mask_bed_id:
        cmd += ['-B', bed_path]
    min_mapq = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minMAPQ")
    if min_mapq:
        cmd += ['-A', min_mapq]
    # optional stuff added to second pass:
    if not gfa_id:
        remap_opts = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "remapSplitOptions", default=None)
        if remap_opts:
            cmd += remap_opts.split(' ')

    for contig in ref_contigs:
        cmd += ['-c', contig]

    cactus_call(parameters=cmd, work_dir=work_dir)

    output_id_map = {}
    for out_name in os.listdir(work_dir):
        file_name, ext = os.path.splitext(out_name)
        if file_name.startswith(os.path.basename(out_prefix)) and ext in [".gfa", ".paf", ".fa_contigs"] and \
           os.path.isfile(os.path.join(work_dir, file_name + ".fa_contigs")):
            name = file_name[len(os.path.basename(out_prefix)):]
            if name not in output_id_map:
                output_id_map[name] = {}
            if ext == '.paf':
                # apply the hacky naming correction so that subpaths have no special characters
                # in the hal (to make hubs happy).  this gets undone by hal2vg
                cactus_call(parameters=['sed', '-i', '-e',
                                        's/\([^:]*\):\([0-9]*\)-\([0-9]*\)/echo "\\1_sub_$((\\2-1))_\\3"/e',
                                        '-e', 's/ /\t/g',
                                        os.path.join(work_dir, out_name)])
            output_id_map[name][ext[1:]] = job.fileStore.writeGlobalFile(os.path.join(work_dir, out_name))

    return output_id_map, job.fileStore.writeGlobalFile(log_path)
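# An illustrative pure-Python equivalent (not used by cactus) of the sed rename above:
# a subpath name like "chr1:100-200" becomes "chr1_sub_99_200" (the start is shifted to
# 0-based, matching the $((\2-1)) shell arithmetic in the sed expression).
import re

def rename_subpath(name):
    # hypothetical helper, for illustration only
    m = re.fullmatch(r'([^:]*):([0-9]+)-([0-9]+)', name)
    if not m:
        return name
    return '{}_sub_{}_{}'.format(m.group(1), int(m.group(2)) - 1, m.group(3))

assert rename_subpath('chr1:100-200') == 'chr1_sub_99_200'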
def runCactusBlastOnly(options):
    with Toil(options) as toil:
        importSingularityImage(options)

        # Run the workflow
        if options.restart:
            outWorkFlowArgs = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides.  this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile).  one way to fix would be to add support
            # for s3 paths and force wdl to use it.  a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames, options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir, 'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            # to be consistent with all-in-one cactus, we make sure the project
            # isn't limiting itself to the subtree (todo: parameterize so root can
            # be passed through from prepare to blast/align)
            proj_options = copy.deepcopy(options)
            proj_options.root = None

            # Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(proj_options, proj_options.configFile, ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            logger.info("Experiment {}".format(ET.tostring(expXml)))
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()

            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
            leaves = tree.getChildNames(tree.getRootName())
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)
            logger.info("Genomes in blastonly, {}: {}".format(options.root, list(genome_set)))
            logger.info(str(project.inputSequenceMap))

            # import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in genome_set:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    project.inputSequenceIDMap[genome] = toil.importFile(seq)
                else:
                    # out-of-scope sequences will only cause trouble later on
                    del project.inputSequenceMap[genome]

            # import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            workFlowArgs = CactusWorkflowArguments(options, experimentFile=experimentFile,
                                                   configNode=configNode, seqIDMap=project.inputSequenceIDMap)

            outWorkFlowArgs = toil.start(CactusTrimmingBlastPhase(standAlone=True,
                                                                  cactusWorkflowArguments=workFlowArgs,
                                                                  phaseName="trimBlast"))

        # export the alignments
        toil.exportFile(outWorkFlowArgs.alignmentsID, makeURL(options.outputFile))
        # optional secondary alignments
        if outWorkFlowArgs.secondaryAlignmentsID:
            toil.exportFile(outWorkFlowArgs.secondaryAlignmentsID, makeURL(options.outputFile) + '.secondary')
        # outgroup fragments and coverage are necessary for cactus-align, as the
        # sequence names got changed in the above alignments
        for i, outgroupFragmentID in enumerate(outWorkFlowArgs.outgroupFragmentIDs):
            toil.exportFile(outgroupFragmentID, makeURL(options.outputFile) + '.og_fragment_{}'.format(i))
        # cactus-align can recompute coverage on the fly, but we save them because we have them
        for i, ingroupCoverageID in enumerate(outWorkFlowArgs.ingroupCoverageIDs):
            toil.exportFile(ingroupCoverageID, makeURL(options.outputFile) + '.ig_coverage_{}'.format(i))
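# Note the export suffix scheme above ('.secondary', '.og_fragment_<i>', '.ig_coverage_<i>'):
# it is what get_input_path() in make_align_job() relies on to find these files again.
# For example, with options.outputFile == 'steps/anc0.cigar' (an illustrative path, not a
# fixed cactus convention), the blast phase would leave behind:
#   steps/anc0.cigar                  primary alignments
#   steps/anc0.cigar.secondary        secondary alignments (if produced)
#   steps/anc0.cigar.og_fragment_0    outgroup fragment for outgroup 0
#   steps/anc0.cigar.ig_coverage_0    ingroup coverage for leaf 0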
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outputHal", type=str, help="Output HAL file")

    # Progressive Cactus Options
    parser.add_argument("--database", dest="database",
                        help="Database type: tokyo_cabinet or kyoto_tycoon"
                        " [default: %(default)s]",
                        default="kyoto_tycoon")
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=None)
    parser.add_argument("--root", dest="root",
                        help="Name of ancestral node (which"
                        " must appear in NEWICK tree in <seqfile>) to use as a "
                        "root for the alignment.  Any genomes not below this node "
                        "in the tree may be used as outgroups but will never appear"
                        " in the output.  If no root is specified then the root"
                        " of the tree is used.",
                        default=None)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)

    # Mess with some toil options to create useful defaults.

    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    # The default deadlockWait is currently 60 seconds. This can cause
    # issues if the database processes take a while to actually begin
    # after they're issued. Change it to at least an hour so that we
    # don't preemptively declare a deadlock.
    if options.deadlockWait is None or options.deadlockWait < 3600:
        options.deadlockWait = 3600
    if options.retryCount is None:
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5

    with Toil(options) as toil:
        importSingularityImage()

        # Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # Create the progressive cactus project
            projWrapper = ProjectWrapper(options)
            projWrapper.writeXml()

            pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # import the sequences
            seqIDs = []
            print("Importing %s sequences" % (len(project.getInputSequencePaths())))
            for seq in project.getInputSequencePaths():
                if os.path.isdir(seq):
                    tmpSeq = getTempFile()
                    catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                seqIDs.append(toil.importFile(seq))
            project.setInputSequenceIDs(seqIDs)

            # import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            project.writeXML(pjPath)
            halID = toil.start(RunCactusPreprocessorThenProgressiveDown(
                options, project, memory=configWrapper.getDefaultMemory()))

        toil.exportFile(halID, makeURL(options.outputHal))