Example #1
    def run(self, fileStore):
        self.configNode = ET.parse(fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        logger.info("Progressive Up: " + self.event)

        # open up the experiment
        # note that we copy the path into the options here
        experimentFile = fileStore.readGlobalFile(self.project.expIDMap[self.event])
        expXml = ET.parse(experimentFile).getroot()
        experiment = ExperimentWrapper(expXml)
        configPath = fileStore.readGlobalFile(experiment.getConfigID())
        configXml = ET.parse(configPath).getroot()

        seqIDMap = dict()
        tree = experiment.getTree()
        seqNames = []
        for node in tree.postOrderTraversal():
            if tree.isLeaf(node):
                name = tree.getName(node)
                seqIDMap[name] = self.project.outputSequenceIDMap[name]
                seqNames.append(name)
        logger.info("Sequences in progressive, %s: %s" % (self.event, seqNames))
            
        experimentFile = fileStore.getLocalTempFile()
        experiment.writeXML(experimentFile)
        self.options.experimentFileID = fileStore.writeGlobalFile(experimentFile)

        # take union of command line options and config options for hal and reference
        if self.options.buildReference == False:
            refNode = findRequiredNode(configXml, "reference")
            self.options.buildReference = getOptionalAttrib(refNode, "buildReference", bool, False)
        halNode = findRequiredNode(configXml, "hal")
        if self.options.buildHal == False:
            self.options.buildHal = getOptionalAttrib(halNode, "buildHal", bool, False)
        if self.options.buildFasta == False:
            self.options.buildFasta = getOptionalAttrib(halNode, "buildFasta", bool, False)

        # get parameters that cactus_workflow stuff wants
        configFile = fileStore.readGlobalFile(experiment.getConfigID())
        configNode = ET.parse(configFile).getroot()
        workFlowArgs = CactusWorkflowArguments(self.options, experimentFile=experimentFile, configNode=configNode, seqIDMap=seqIDMap)

        # copy over the options so we don't trail them around
        workFlowArgs.buildReference = self.options.buildReference
        workFlowArgs.buildHal = self.options.buildHal
        workFlowArgs.buildFasta = self.options.buildFasta
        workFlowArgs.globalLeafEventSet = self.options.globalLeafEventSet
        if self.options.intermediateResultsUrl is not None:
            # Give the URL prefix a special name for this particular
            # subproblem (by suffixing it with the name of the
            # internal node in the guide tree)
            workFlowArgs.intermediateResultsUrl = self.options.intermediateResultsUrl + '-' + self.event

        # Use the trimming strategy to blast ingroups vs outgroups.
        finalExpWrapper = self.addChild(CactusTrimmingBlastPhase(cactusWorkflowArguments=workFlowArgs, phaseName="trimBlast")).rv()
        logger.info("Going to create alignments and define the cactus tree")

        return finalExpWrapper
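The union-of-options logic above (command line first, config attribute as fallback) recurs throughout these examples. A minimal self-contained sketch of the same pattern, using a simplified stand-in for cactus' getOptionalAttrib (the real helper lives in cactus and handles more types):

import xml.etree.ElementTree as ET

def get_optional_attrib(node, name, type_fn=str, default=None):
    # simplified stand-in: missing attributes fall back to the default
    if name in node.attrib:
        value = node.attrib[name]
        if type_fn is bool:
            return value.lower() in ("1", "true")
        return type_fn(value)
    return default

config = ET.fromstring('<config><hal buildHal="1"/></config>')
hal_node = config.find("hal")
build_hal = False  # e.g. not requested on the command line
if not build_hal:
    # take the union: fall back to the config attribute, defaulting to False
    build_hal = get_optional_attrib(hal_node, "buildHal", bool, False)
assert build_hal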
Example #2
def clip_vg(job, options, config, vg_path, vg_id):
    """ run clip-vg 
    """
    work_dir = job.fileStore.getLocalTempDir()
    is_decoy = vg_path == options.decoyGraph
    vg_path = os.path.join(work_dir, os.path.basename(vg_path))
    job.fileStore.readGlobalFile(vg_id, vg_path)
    out_path = vg_path + '.clip'

    cmd = ['clip-vg', vg_path, '-f']
    if options.clipLength is not None and not is_decoy:
        cmd += ['-u', str(options.clipLength)]
    for rs in options.rename:
        cmd += ['-r', rs]
    if options.reference:
        cmd += ['-e', options.reference]
    
    if getOptionalAttrib(findRequiredNode(config.xmlRoot, "hal2vg"), "includeMinigraph", typeFn=bool, default=False):
        # our vg file has minigraph sequences -- we'll filter them out, along with any nodes
        # that don't appear in a non-minigraph path
        graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "assemblyName", default="_MINIGRAPH_")
        cmd += ['-d', graph_event]
        
    # sort while we're at it
    cmd = [cmd, ['vg', 'ids', '-s', '-']]
        
    cactus_call(parameters=cmd, outfile=out_path)

    # validate the output graph; the extra check is worth the time
    cactus_call(parameters=['vg', 'validate', out_path])

    return job.fileStore.writeGlobalFile(out_path)
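Note the pipeline convention in clip_vg: cactus_call treats a list of argument lists as a shell pipeline ("clip-vg ... | vg ids -s -"). A rough subprocess-based sketch of that convention, using a hypothetical call_pipeline helper rather than the real cactus_call:

import subprocess

def call_pipeline(commands, outfile):
    # chain each command's stdout into the next one's stdin, like `a | b > outfile`
    with open(outfile, "wb") as out:
        procs, prev_stdout = [], None
        for i, cmd in enumerate(commands):
            stdout = out if i == len(commands) - 1 else subprocess.PIPE
            procs.append(subprocess.Popen(cmd, stdin=prev_stdout, stdout=stdout))
            prev_stdout = procs[-1].stdout
        for proc in procs:
            proc.wait()

# usage mirroring clip_vg's final command:
# call_pipeline([['clip-vg', 'in.vg', '-f'], ['vg', 'ids', '-s', '-']], 'in.vg.clip')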
Example #4
def minigraph_map_all(job, config, gfa_id, fa_id_map, graph_event, keep_gaf):
    """ top-level job to run the minigraph mapping in parallel, returns paf """

    # hang everything on this job, to self-contain workflow
    top_job = Job()
    job.addChild(top_job)

    mg_cores = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"),
                                 "cpu",
                                 typeFn=int,
                                 default=1)
    mg_cores = min(mg_cores, cpu_count())

    # doing the paf conversion is more efficient when done separately for each genome.  we can get away
    # with doing this if the universal filter (which needs to process everything at once) is disabled
    xml_node = findRequiredNode(config.xmlRoot, "graphmap")
    paf_per_genome = not getOptionalAttrib(xml_node, "universalMZFilter",
                                           float)

    # do the mapping
    gaf_id_map = {}
    paf_id_map = {}

    for event, (fa_path, fa_id) in fa_id_map.items():
        minigraph_map_job = top_job.addChildJobFn(
            minigraph_map_one,
            config,
            event,
            fa_path,
            fa_id,
            gfa_id,
            keep_gaf or not paf_per_genome,
            paf_per_genome,
            # todo: estimate RAM
            cores=mg_cores,
            disk=5 * (fa_id.size + gfa_id.size))
        gaf_id_map[event] = minigraph_map_job.rv(0)
        paf_id_map[event] = minigraph_map_job.rv(1)

    # convert to paf
    if paf_per_genome:
        paf_job = top_job.addFollowOnJobFn(merge_pafs, paf_id_map)
    else:
        paf_job = top_job.addFollowOnJobFn(merge_gafs_into_paf, config,
                                           gaf_id_map)

    if not keep_gaf:
        gaf_id_map = None
    else:
        gaf_id_map = paf_job.addFollowOnJobFn(compress_gafs, gaf_id_map).rv()

    return paf_job.rv(), gaf_id_map
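The .rv(0)/.rv(1) calls above are Toil promises: minigraph_map_one returns a (gaf_id, paf_id) pair, and each promise indexes into that pair once the child job has run. A minimal sketch of the pattern, assuming a Toil environment:

from toil.job import Job

def child(job):
    # return a tuple so callers can take promises on individual elements
    return "gaf-id", "paf-id"

def parent(job):
    c = job.addChildJobFn(child)
    # rv(0) and rv(1) are unresolved here; Toil substitutes the real values
    # before any job that consumes them runs
    return c.rv(0), c.rv(1)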
Example #5
def merge_gafs_into_paf(job, config, gaf_file_id_map, gaf_paths=None):
    """ Merge GAF alignments into a single PAF, applying some filters """

    work_dir = job.fileStore.getLocalTempDir()
    paf_path = os.path.join(work_dir, "mz_alignments.paf")
    if not gaf_paths:
        # build the list here rather than using a mutable default argument,
        # which would accumulate paths across calls
        gaf_paths = []
        for event, gaf_id in gaf_file_id_map.items():
            gaf_paths.append("{}.gaf".format(event))
            job.fileStore.readGlobalFile(gaf_id,
                                         os.path.join(work_dir, gaf_paths[-1]))

    xml_node = findRequiredNode(config.xmlRoot, "graphmap")
    mzgaf2paf_opts = []
    graph_event = getOptionalAttrib(xml_node, "assemblyName",
                                    default="_MINIGRAPH_")
    # this must be consistent with prependUniqueIDs() in cactus_workflow.py
    mzgaf2paf_opts += ['-p', 'id={}|'.format(graph_event)]
    mz_filter = getOptionalAttrib(xml_node, "universalMZFilter", float)
    if mz_filter:
        mzgaf2paf_opts += ['-u', str(mz_filter)]
    if getOptionalAttrib(xml_node,
                         "nodeBasedUniversal",
                         typeFn=bool,
                         default=False):
        mzgaf2paf_opts += ['-n']
    if getOptionalAttrib(xml_node,
                         "strictUniversal",
                         typeFn=bool,
                         default=False):
        mzgaf2paf_opts += ['-i']
    min_mz = getOptionalAttrib(xml_node, "minMZBlockLength", int)
    if min_mz:
        mzgaf2paf_opts += ['-m', str(min_mz)]
    mapq = getOptionalAttrib(xml_node, "minMAPQ", int)
    if mapq:
        mzgaf2paf_opts += ['-q', str(mapq)]
    gaf_block = getOptionalAttrib(xml_node, "minGAFBlockLength", int)
    if gaf_block:
        mzgaf2paf_opts += ['-b', str(gaf_block)]
    gaf_node = getOptionalAttrib(xml_node, "minGAFNodeLength", int)
    if gaf_node:
        mzgaf2paf_opts += ['-s', str(gaf_node)]
    overlap_filter_len = getOptionalAttrib(xml_node,
                                           "minGAFQueryOverlapFilter", int)
    if overlap_filter_len:
        mzgaf2paf_opts += ['-o', str(overlap_filter_len)]

    cactus_call(outfile=paf_path,
                parameters=["mzgaf2paf"] + gaf_paths + mzgaf2paf_opts)

    return job.fileStore.writeGlobalFile(paf_path)
Example #6
def export_vg(job, hal_id, configWrapper, doVG, doGFA, checkpointInfo=None, resource_spec=False):
    """ use hal2vg to convert the HAL to vg format """

    if not resource_spec:
        # the caller couldn't figure out the resources from the hal_id promise.
        # now that it has resolved, reschedule with real requirements and try again
        return job.addChildJobFn(export_vg, hal_id, configWrapper, doVG, doGFA, checkpointInfo,
                                 resource_spec=True,
                                 disk=hal_id.size * 3,
                                 memory=hal_id.size * 10).rv()
        
    work_dir = job.fileStore.getLocalTempDir()
    hal_path = os.path.join(work_dir, "out.hal")
    job.fileStore.readGlobalFile(hal_id, hal_path)
    
    graph_event = getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot, "graphmap"), "assemblyName", default="_MINIGRAPH_")
    hal2vg_opts = getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot, "hal2vg"), "hal2vgOptions", default="")
    if hal2vg_opts:
        hal2vg_opts = hal2vg_opts.split(' ')
    else:
        hal2vg_opts = []
    ignore_events = []
    if not getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot, "hal2vg"), "includeMinigraph", typeFn=bool, default=False):
        ignore_events.append(graph_event)
    if not getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot, "hal2vg"), "includeAncestor", typeFn=bool, default=False):
        ignore_events.append(configWrapper.getDefaultInternalNodePrefix() + '0')
    if ignore_events:
        hal2vg_opts += ['--ignoreGenomes', ','.join(ignore_events)]
    if not getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot, "hal2vg"), "prependGenomeNames", typeFn=bool, default=True):
        hal2vg_opts += ['--onlySequenceNames']

    vg_path = os.path.join(work_dir, "out.vg")
    cmd = ['hal2vg', hal_path] + hal2vg_opts

    cactus_call(parameters=cmd, outfile=vg_path)

    if checkpointInfo:
        write_s3(vg_path, os.path.splitext(checkpointInfo[1])[0] + '.vg', region=checkpointInfo[0])

    gfa_path = os.path.join(work_dir, "out.gfa.gz")
    if doGFA:
        gfa_cmd = [ ['vg', 'view', '-g', vg_path], ['gzip'] ]
        cactus_call(parameters=gfa_cmd, outfile=gfa_path)

        if checkpointInfo:
            write_s3(gfa_path, os.path.splitext(checkpointInfo[1])[0] + '.gfa.gz', region=checkpointInfo[0])

    vg_id = job.fileStore.writeGlobalFile(vg_path) if doVG else None
    gfa_id = job.fileStore.writeGlobalFile(gfa_path) if doGFA else None

    return vg_id, gfa_id
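The resource_spec dance at the top of export_vg is a Toil idiom worth calling out: when export_vg is first scheduled, hal_id may still be a promise with no usable size, so the first invocation runs with default resources and immediately re-adds itself as a child, this time with disk and memory derived from the now-concrete hal_id.size. A stripped-down sketch of the pattern (do_work is a hypothetical placeholder):

def sized_job(job, file_id, resource_spec=False):
    if not resource_spec:
        # first pass: file_id has resolved, so reschedule with real requirements
        return job.addChildJobFn(sized_job, file_id, resource_spec=True,
                                 disk=file_id.size * 3,
                                 memory=file_id.size * 10).rv()
    # second pass: runs with the requested resources
    local_path = job.fileStore.readGlobalFile(file_id)
    return do_work(local_path)  # hypothetical worker function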
Example #7
def merge_gafs_into_paf(job, config, gaf_file_ids):
    """ Merge GAF alignments into a single PAF, applying some filters """

    work_dir = job.fileStore.getLocalTempDir()
    paf_path = os.path.join(work_dir, "mz_alignments.paf")
    gaf_paths = []
    for i, gaf_id in enumerate(gaf_file_ids):
        gaf_paths.append("mz_alignment_{}.gaf".format(i))
        job.fileStore.readGlobalFile(gaf_id,
                                     os.path.join(work_dir, gaf_paths[-1]))

    xml_node = findRequiredNode(config.xmlRoot, "refgraph")
    mzgaf2paf_opts = []
    mz_filter = getOptionalAttrib(xml_node, "universalMZFilter", float)
    if mz_filter:
        mzgaf2paf_opts += ['-u', str(mz_filter)]
    min_mz = getOptionalAttrib(xml_node, "minMZBlockLength", int)
    if min_mz:
        mzgaf2paf_opts += ['-m', str(min_mz)]
    mapq = getOptionalAttrib(xml_node, "minMAPQ", int)
    if mapq:
        mzgaf2paf_opts += ['-q', str(mapq)]
    gaf_block = getOptionalAttrib(xml_node, "minGAFBlockLength", int)
    if gaf_block:
        mzgaf2paf_opts += ['-b', str(gaf_block)]

    cactus_call(work_dir=work_dir,
                outfile=paf_path,
                parameters=["mzgaf2paf"] + gaf_paths + mzgaf2paf_opts)

    # these are big, get rid of them as soon as we can (which is now)
    for gaf_id in gaf_file_ids:
        job.fileStore.deleteGlobalFile(gaf_id)

    return job.fileStore.writeGlobalFile(paf_path)
Example #8
def split_minimap_fallback(job, options, config, seqIDMap, output_id_map):
    """ take the output table from gather_fas, pull out the ambiguous sequences, remap them to the reference, and 
    add them to the events where possible"""

    # can't do anything without a reference
    if not options.reference:
        logger.info("Skipping minimap2 fallback as --reference was not specified")
        return None, None
    # todo: also skip if no ambiguous sequences
    
    ref_path, ref_id = seqIDMap[options.reference]
    mm_mem = ref_id.size * 5
    if seqIDMap[options.reference][0].endswith('.gz'):
        mm_mem *= 4
    mm_index_job = job.addChildJobFn(minimap_index, ref_path, ref_id, disk=ref_id.size * 5, memory=mm_mem)
    mm_map_root_job = Job()
    mm_index_job.addFollowOn(mm_map_root_job)
    
    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "ambiguousName", default="_AMBIGUOUS_")

    if amb_name not in output_id_map:
        logger.info("Skipping minmap2 fallback as no ambigious sequences found")
        return None, None

    # map every ambiguous sequence against the reference in parallel
    paf_ids = []
    ambiguous_seq_id_map = {}
    for event, fa_id in output_id_map[amb_name]['fa'].items():
        paf_job = mm_map_root_job.addChildJobFn(minimap_map, mm_index_job.rv(), event, fa_id, seqIDMap[event][0],
                                                disk=ref_id.size * 3, memory=mm_mem)
        paf_ids.append(paf_job.rv())
        ambiguous_seq_id_map[event] = (seqIDMap[event][0], fa_id)

    return paf_ids, ambiguous_seq_id_map
Example #9
 def substituteAllDivergenceContolledParametersWithLiterals(self, maxDivergence):
     constants = findRequiredNode(self.xmlRoot, "constants")
     divergences = constants.find("divergences")
     messages = []
     if divergences is not None:
         useDefaultDivergences = getOptionalAttrib(divergences, attribName="useDefault", typeFn=bool, default=False)
         def replaceAllDivergenceParameters(node):
             for child in node:
                 if child.tag == "divergence":
                     attribName = child.attrib["argName"]
                     arg = child.attrib["default"]
                     divergence = sys.maxsize
                     if not useDefaultDivergences:
                         for i in child.attrib.keys():
                             if i in divergences.attrib.keys():
                                 j = float(divergences.attrib[i])
                                 if j < divergence and j >= maxDivergence:
                                     arg = child.attrib[i]
                                     divergence = j
                     messages.append("Made argument %s=%s in tag %s with divergence threshold of %s for longest path of %s (useDefaultDivergences=%s)" % (attribName, arg, node.tag, divergence, maxDivergence, useDefaultDivergences))
                     node.attrib[attribName] = arg
                 else:
                     replaceAllDivergenceParameters(child)
         replaceAllDivergenceParameters(self.xmlRoot)
     return messages
Example #10
    def substituteAllDivergenceContolledParametersWithLiterals(
            self, maxDivergence):
        constants = findRequiredNode(self.xmlRoot, "constants")
        divergences = constants.find("divergences")
        messages = []
        if divergences is not None:
            useDefaultDivergences = getOptionalAttrib(divergences,
                                                      attribName="useDefault",
                                                      typeFn=bool,
                                                      default=False)

            def replaceAllDivergenceParameters(node):
                for child in node:
                    if child.tag == "divergence":
                        attribName = child.attrib["argName"]
                        arg = child.attrib["default"]
                        divergence = sys.maxsize
                        if not useDefaultDivergences:
                            for i in list(child.attrib.keys()):
                                if i in list(divergences.attrib.keys()):
                                    j = float(divergences.attrib[i])
                                    if j < divergence and j >= maxDivergence:
                                        arg = child.attrib[i]
                                        divergence = j
                        messages.append(
                            "Made argument %s=%s in tag %s with divergence threshold of %s for longest path of %s (useDefaultDivergences=%s)"
                            % (attribName, arg, node.tag, divergence,
                               maxDivergence, useDefaultDivergences))
                        node.attrib[attribName] = arg
                    else:
                        replaceAllDivergenceParameters(child)

            replaceAllDivergenceParameters(self.xmlRoot)
        return messages
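To see what the divergence substitution actually selects, it helps to trace the inner loop on a tiny config. The fragment below is invented for illustration (the attribute names and values are hypothetical, not taken from a real cactus config): the <divergence> tag offers one value per divergence level, and the loop picks the value whose level is the smallest one still >= maxDivergence.

import xml.etree.ElementTree as ET

xml = '''
<cactusConfig>
  <constants>
    <divergences one="0.1" two="0.25" three="0.5"/>
  </constants>
  <caf>
    <divergence argName="lastzArguments" default="--step=1"
                one="--step=2" two="--step=4" three="--step=8"/>
  </caf>
</cactusConfig>
'''
root = ET.fromstring(xml)
divergences = root.find("constants").find("divergences")
node = root.find("caf").find("divergence")

max_divergence = 0.2
arg, best = node.attrib["default"], float("inf")
for key in node.attrib:
    if key in divergences.attrib:
        level = float(divergences.attrib[key])
        # same test as above: the smallest level that is >= maxDivergence wins
        if max_divergence <= level < best:
            arg, best = node.attrib[key], level
print(arg)  # --step=4: the "two" level (0.25) is the smallest >= 0.2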
Example #11
def combine_splits(job, config, seq_id_map, original_id_map, remap_id_map):
    """ combine the output of two runs of gather_fas.  the first is the contigs determined by minigraph,
    the second from remapping the ambiguous contigs with minimap2 """

    root_job = Job()
    job.addChild(root_job)

    # no ambiguous remappings, nothing to do
    if not remap_id_map:
        return original_id_map

    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot,
                                                  "graphmap_split"),
                                 "ambiguousName",
                                 default="_AMBIGUOUS_")
    graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot,
                                                     "graphmap"),
                                    "assemblyName",
                                    default="_MINIGRAPH_")

    # note: we're not handling the case where 100% of a given reference contig is ambiguous
    for ref_contig in original_id_map:
        if ref_contig == amb_name:
            # for ambiguous sequence, we overwrite and don't combine
            if ref_contig in remap_id_map:
                original_id_map[ref_contig] = remap_id_map[ref_contig]
            else:
                original_id_map[ref_contig] = None
        elif ref_contig in remap_id_map:
            total_size = 0
            for event in original_id_map[ref_contig]['fa']:
                total_size += original_id_map[ref_contig]['fa'][event].size
                if event in remap_id_map[ref_contig]['fa']:
                    total_size += remap_id_map[ref_contig]['fa'][event].size
            original_id_map[ref_contig] = root_job.addChildJobFn(
                combine_ref_contig_splits,
                original_id_map[ref_contig],
                remap_id_map[ref_contig],
                disk=total_size * 4).rv()

    return root_job.addFollowOnJobFn(combine_paf_splits, seq_id_map,
                                     original_id_map, remap_id_map, amb_name,
                                     graph_event).rv()
Example #12
 def substituteAllPredefinedConstantsWithLiterals(self):
     constants = findRequiredNode(self.xmlRoot, "constants")
     defines = constants.find("defines")
     def replaceAllConstants(node, defines):
         for attrib in node.attrib:
             if node.attrib[attrib] in defines.attrib:
                 node.attrib[attrib] = defines.attrib[node.attrib[attrib]]
         for child in node:
             replaceAllConstants(child, defines)
     if defines is not None:
         replaceAllConstants(self.xmlRoot, defines)
         constants.remove(defines)
Example #13
def minigraph_map_one(job, config, event_name, fa_path, fa_file_id, gfa_file_id, gaf_output, paf_output):
    """ Run minigraph to map a Fasta file to a GFA graph, producing a GAF output """

    work_dir = job.fileStore.getLocalTempDir()
    gfa_path = os.path.join(work_dir, "mg.gfa")
    fa_dir = job.fileStore.getLocalTempDir()
    fa_path = os.path.join(fa_dir, os.path.basename(fa_path))
    gaf_path = os.path.join(work_dir, "{}.gaf".format(event_name))
    
    job.fileStore.readGlobalFile(gfa_file_id, gfa_path)
    job.fileStore.readGlobalFile(fa_file_id, fa_path)

    if fa_path.endswith('.gz'):
        fa_path = fa_path[:-3]
        cactus_call(parameters=['gzip', '-d', '-c', fa_path + '.gz'], outfile=fa_path)

    # prepend the unique id before mapping so the GAF has cactus-compatible event names
    fa_path = prependUniqueIDs({event_name : fa_path}, work_dir, eventNameAsID=True)[event_name]

    # parse options from the config
    xml_node = findRequiredNode(config.xmlRoot, "graphmap")
    minigraph_opts = getOptionalAttrib(xml_node, "minigraphMapOptions", str, default="")     
    opts_list = minigraph_opts.split()
    # add required options if not present
    if "-S" not in opts_list:
        opts_list += ["-S"]
    if "--write-mz" not in opts_list:
        opts_list += ["--write-mz"]
    if "-t" not in opts_list:
        opts_list += ["-t", str(int(job.cores))]

    cmd = ["minigraph",
           os.path.basename(gfa_path),
           os.path.basename(fa_path),
           "-o", os.path.basename(gaf_path)] + opts_list

    mask_filter = getOptionalAttrib(xml_node, "maskFilter", int, default=-1)
    if mask_filter >= 0:
        cmd[2] = '-'
        cmd = [['cactus_softmask2hardmask', os.path.basename(fa_path), '-m', str(mask_filter)], cmd]
    
    cactus_call(work_dir=work_dir, parameters=cmd)

    paf_id, gaf_id = None, None
    if paf_output:
        # optional gaf->paf step.  we don't pipe directly out of minigraph because mzgaf2paf's overlap
        # filter (which is usually on) requires two passes, so it can't read from stdin when enabled
        paf_id = merge_gafs_into_paf(job, config, None, [gaf_path])
    if gaf_output:
        gaf_id = job.fileStore.writeGlobalFile(gaf_path)

    return gaf_id, paf_id
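The 'id={event}|' prefix that prependUniqueIDs adds here is the naming contract the other examples rely on: merge_gafs_into_paf passes the same prefix to mzgaf2paf via -p, and combine_paf_splits reconstructs it when matching contigs. A simplified stand-in for the renaming (the real prependUniqueIDs works on files, not strings):

def prepend_unique_ids(event_to_fasta_text):
    # rewrite '>contig' headers as '>id=<event>|contig' so downstream tools
    # can recover the event name from any sequence header
    out = {}
    for event, text in event_to_fasta_text.items():
        out[event] = "\n".join(
            ">id={}|{}".format(event, line[1:]) if line.startswith(">") else line
            for line in text.splitlines())
    return out

print(prepend_unique_ids({"hg38": ">chr1\nACGT"})["hg38"])
# >id=hg38|chr1
# ACGT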
Example #14
    def substituteAllPredefinedConstantsWithLiterals(self):
        constants = findRequiredNode(self.xmlRoot, "constants")
        defines = constants.find("defines")

        def replaceAllConstants(node, defines):
            for attrib in node.attrib:
                if node.attrib[attrib] in defines.attrib:
                    node.attrib[attrib] = defines.attrib[node.attrib[attrib]]
            for child in node:
                replaceAllConstants(child, defines)

        if defines is not None:
            replaceAllConstants(self.xmlRoot, defines)
            constants.remove(defines)
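A worked example makes the defines substitution above concrete. The config fragment below is hypothetical (attribute names invented for illustration): any attribute value that names a define is replaced by the define's literal, then the defines block is removed.

import xml.etree.ElementTree as ET

xml = '''
<cactusConfig>
  <constants>
    <defines bigChunk="100000"/>
  </constants>
  <blast chunkSize="bigChunk"/>
</cactusConfig>
'''
root = ET.fromstring(xml)
constants = root.find("constants")
defines = constants.find("defines")

def replace_all_constants(node, defines):
    # same traversal as above: swap in the literal for any matching value
    for attrib in node.attrib:
        if node.attrib[attrib] in defines.attrib:
            node.attrib[attrib] = defines.attrib[node.attrib[attrib]]
    for child in node:
        replace_all_constants(child, defines)

replace_all_constants(root, defines)
constants.remove(defines)
print(root.find("blast").attrib["chunkSize"])  # 100000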
Example #15
def preprocess_input_sequences(job,
                               configWrapper,
                               project,
                               cactusWorkflowArguments,
                               pafMaskFilter=None,
                               referenceEvent=None):
    """ update the workflow arguments in place with unzipped version of any input fastas whose paths 
    end in .gz, 
    if there's a pafMaskFilter, softmasked regions are extracted from each sequence into a bed.
    Note that the beds will need unique ids prepended just like the fastas...
    """
    head_job = Job()
    job.addChild(head_job)
    graph_event = getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot,
                                                     "graphmap"),
                                    "assemblyName",
                                    default="_MINIGRAPH_")
    exp = cactusWorkflowArguments.experimentWrapper
    ingroupsAndOriginalIDs = [(g, exp.getSequenceID(g))
                              for g in exp.getGenomesWithSequence()
                              if g not in exp.getOutgroupGenomes()]
    mask_bed_ids = {}
    events = []
    updated_seq_ids = []
    for g, seqID in ingroupsAndOriginalIDs:
        zipped = project.inputSequenceMap[g].endswith('.gz')
        do_filter = pafMaskFilter and g not in [graph_event, referenceEvent]
        if zipped or do_filter:
            prepend_id_job = head_job.addChildJobFn(
                preprocess_input_sequence, g, seqID,
                project.inputSequenceMap[g], pafMaskFilter)
            updated_seq_id, mask_bed_id = prepend_id_job.rv(
                0), prepend_id_job.rv(1)
            if zipped:
                events.append(g)
                updated_seq_ids.append(updated_seq_id)
            if do_filter:
                mask_bed_ids[g] = mask_bed_id

    return head_job.addFollowOnJobFn(
        resolve_id_promises, events, updated_seq_ids,
        cactusWorkflowArguments).rv(), mask_bed_ids
Example #16
def minigraph_map_one(job, config, event_name, fa_file_id, gfa_file_id,
                      ignore_softmasked):
    """ Run minigraph to map a Fasta file to a GFA graph, producing a GAF output """

    work_dir = job.fileStore.getLocalTempDir()
    gfa_path = os.path.join(work_dir, "minigraph.gfa")
    fa_path = os.path.join(work_dir, "{}.fa".format(event_name))
    gaf_path = os.path.join(work_dir, "{}.gaf".format(event_name))

    job.fileStore.readGlobalFile(gfa_file_id, gfa_path)
    job.fileStore.readGlobalFile(fa_file_id, fa_path)

    # parse options from the config
    xml_node = findRequiredNode(config.xmlRoot, "graphmap")
    minigraph_opts = getOptionalAttrib(xml_node,
                                       "minigraphMapOptions",
                                       str,
                                       default="")
    opts_list = minigraph_opts.split()
    # add required options if not present
    if "-S" not in opts_list:
        opts_list += ["-S"]
    if "--write-mz" not in opts_list:
        opts_list += ["--write-mz"]
    if "-t" not in opts_list:
        opts_list += ["-t", str(int(job.cores))]

    cmd = [
        "minigraph",
        os.path.basename(gfa_path),
        os.path.basename(fa_path), "-o",
        os.path.basename(gaf_path)
    ] + opts_list

    if ignore_softmasked:
        cmd[2] = '-'
        cmd = [['cactus_softmask2hardmask', os.path.basename(fa_path)], cmd]

    # todo: pipe into gzip directly as these files can be huge (requires gzip support to be added to mzgaf2paf)
    cactus_call(work_dir=work_dir, parameters=cmd)

    return job.fileStore.writeGlobalFile(gaf_path)
Example #17
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("inSeqFile",
                        type=str,
                        nargs='?',
                        default=None,
                        help="Input Seq file")
    parser.add_argument(
        "outSeqFile",
        type=str,
        nargs='?',
        default=None,
        help="Output Seq file (ex generated with cactus-prepare)")
    parser.add_argument("--configFile",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))
    parser.add_argument(
        "--inputNames",
        nargs='*',
        help=
        'input genome names (not paths) to preprocess (all leaves from Input Seq file if none specified)'
    )
    parser.add_argument(
        "--inPaths",
        nargs='*',
        help=
        'Space-separated list of input fasta paths (to be used in place of --inSeqFile)'
    )
    parser.add_argument(
        "--outPaths",
        nargs='*',
        help=
        'Space-separated list of output fasta paths (one for each inPath, used in place of --outSeqFile)'
    )
    parser.add_argument("--maskAlpha",
                        action='store_true',
                        help='Use dna-brnn instead of lastz for repeatmasking')
    parser.add_argument(
        "--clipAlpha",
        action='store_true',
        help=
        'use dna-brnn instead of lastz for repeatmasking.  Also, clip sequence using given minimum length instead of softmasking'
    )
    parser.add_argument(
        "--ignore",
        nargs='*',
        help='Space-separated list of genomes from inSeqFile to ignore',
        default=[])
    parser.add_argument(
        "--maskPAF",
        type=str,
        help=
        'Incorporate coverage gaps from given PAF when masking.  Only implemented for dna-brnn masking'
    )
    parser.add_argument(
        "--brnnCores",
        type=int,
        help=
        'Specify number of cores for each dna-brnn job (overriding default value from the config)'
    )
    parser.add_argument(
        "--latest",
        dest="latest",
        action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument(
        "--containerImage",
        dest="containerImage",
        default=None,
        help="Use the the specified pre-built containter image "
        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # we have two modes: operate directly on paths or rely on the seqfiles.  they cannot be mixed
    if options.inSeqFile or options.outSeqFile:
        if not options.inSeqFile or not options.outSeqFile or options.inPaths or options.outPaths:
            raise RuntimeError(
                '--inSeqFile must be used in conjunction with --outSeqFile and not with --inPaths nor --outPaths'
            )
    elif options.inPaths or options.outPaths:
        if not options.inPaths or not options.outPaths or options.inSeqFile or options.outSeqFile:
            raise RuntimeError(
                '--inPaths must be used in conjunction with --outPaths and not with --inSeqFile nor --outSeqFile'
            )
        if len(options.inPaths) != len(options.outPaths):
            raise RuntimeError(
                '--inPaths and --outPaths must have the same number of arguments'
            )
    else:
        raise RuntimeError(
            '--inSeqFile/--outSeqFile/--inputNames or --inPaths/--outPaths required to specify input'
        )
    if options.maskAlpha and options.clipAlpha:
        raise RuntimeError(
            '--maskAlpha and --clipAlpha cannot be used together')
    if options.clipAlpha:
        options.maskAlpha = True
    if options.maskPAF and not options.inputNames and not options.inSeqFile:
        raise RuntimeError(
            '--maskPAF requires event names specified either with an input seqfile or with --inputNames'
        )
    if options.ignore and not options.clipAlpha:
        raise RuntimeError('--ignore can only be used with --clipAlpha')

    inSeqPaths = []
    outSeqPaths = []
    inNames = options.inputNames
    eventNames = []

    #load cactus config
    configNode = ET.parse(options.configFile).getroot()
    #we never want to preprocess minigraph sequences
    graph_event = getOptionalAttrib(findRequiredNode(configNode, "graphmap"),
                                    "assemblyName",
                                    default="_MINIGRAPH_")
    options.ignore.append(graph_event)

    # mine the paths out of the seqfiles
    if options.inSeqFile:
        inSeqFile = SeqFile(options.inSeqFile)
        outSeqFile = SeqFile(options.outSeqFile)

        if not inNames:
            inNames = [
                inSeqFile.tree.getName(node)
                for node in inSeqFile.tree.getLeaves()
            ]

        for inName in inNames:
            if inName in options.ignore:
                # "convenience" functionality: we let the --ignore option update the output seqfile
                # to reflect the fact that we're not touching the original input
                outSeqFile.pathMap[inName] = inSeqFile.pathMap[inName]
                continue
            if inName not in inSeqFile.pathMap or inName not in outSeqFile.pathMap:
                raise RuntimeError(
                    '{} not present in input and output Seq files'.format(
                        inName))
            inPath = inSeqFile.pathMap[inName]
            outPath = outSeqFile.pathMap[inName]
            if os.path.isdir(inPath):
                try:
                    os.makedirs(outPath)
                except OSError:
                    pass
                assert os.path.isdir(inPath) == os.path.isdir(outPath)
                inSeqPaths += [
                    os.path.join(inPath, seqPath)
                    for seqPath in os.listdir(inPath)
                ]
                outSeqPaths += [
                    os.path.join(outPath, seqPath)
                    for seqPath in os.listdir(inPath)
                ]
            else:
                inSeqPaths += [inPath]
                outSeqPaths += [outPath]
            eventNames.append(inName)

        if options.ignore:
            # see comment above
            with open(options.outSeqFile, 'w') as outSF:
                outSF.write(str(outSeqFile))

    # we got path names directly from the command line
    else:
        inSeqPaths = options.inPaths
        outSeqPaths = options.outPaths

    with Toil(options) as toil:
        stageWorkflow(outputSequenceDir=None,
                      configFile=options.configFile,
                      inputSequences=inSeqPaths,
                      toil=toil,
                      restart=options.restart,
                      outputSequences=outSeqPaths,
                      maskAlpha=options.maskAlpha,
                      clipAlpha=options.clipAlpha,
                      maskPAF=options.maskPAF,
                      inputEventNames=eventNames,
                      brnnCores=options.brnnCores)
Example #18
 def turnAllModesOn(self):
     """Switches on check, normalisation etc. to use when debugging/testing
     """
     findRequiredNode(self.xmlRoot, "check").attrib["runCheck"] = "1"
     findRequiredNode(self.xmlRoot, "normal").attrib["iterations"] = "2"
Example #19
def combine_paf_splits(job, options, config, seq_id_map, original_id_map, orig_amb_entry,
                       remap_id_map, amb_name, graph_event):
    """ pull out PAF entries for contigs that were ambiguous in the first round but assigned by minimap2
    then add them to the chromosome PAFs     
    """

    if amb_name not in original_id_map:
        return original_id_map

    work_dir = job.fileStore.getLocalTempDir()
    amb_paf_path = os.path.join(work_dir, 'amb.paf')
    job.fileStore.readGlobalFile(orig_amb_entry['paf'], amb_paf_path, mutable=True)

    # use_minimap_paf = True: return the minimap2 mappings for ambiguous contigs in the final output
    # use_minimap_paf = False: ambiguous contigs are assigned to chromosomes based on minimap2, but their
    #                          minigraph alignments are returned in the final paf
    use_minimap_paf = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "useMinimapPAF",
                                        typeFn=bool, default=False)

    # it's simpler not to support both codepaths right now.  the main issue is that -u can cause contigs
    # to be split, in which case they get renamed, so pulling them in from the existing PAF would require
    # a pass to resolve all the offsets
    if not use_minimap_paf and '-u' in getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "remapSplitOptions",
                                                         default=""):
        raise RuntimeError("useMinimapPAF must be set when -u present in remapSplitOptions")

    for ref_contig in remap_id_map.keys():
        if ref_contig != amb_name and ref_contig in original_id_map:

            # make a set of all minigraph nodes in this contig
            mg_fa_path = os.path.join(work_dir, '{}.{}.fa'.format(graph_event, ref_contig))
            if seq_id_map[graph_event][0].endswith('.gz'):
                mg_fa_path += '.gz'
            mg_contigs_path = os.path.join(work_dir, '{}.contigs'.format(graph_event))
            job.fileStore.readGlobalFile(original_id_map[ref_contig]['fa'][graph_event], mg_fa_path, mutable=True)
            cactus_call(parameters=[['zcat' if mg_fa_path.endswith('.gz') else 'cat', mg_fa_path],
                                    ['grep', '>'], ['cut', '-c', '2-']], outfile=mg_contigs_path)
            mg_contig_set = set()
            with open(mg_contigs_path, 'r') as mg_contigs_file:
                for line in mg_contigs_file:
                    mg_contig_set.add('id={}|{}'.format(graph_event, line.strip()))
            os.remove(mg_fa_path)
            os.remove(mg_contigs_path)

            # make a set of all the query contigs that we want to remove from ambiguous and add to this contig
            query_contig_set = set()

            for event in remap_id_map[ref_contig]['fa']:
                if event != graph_event and remap_id_map[ref_contig]['fa'][event].size > 0:
                    # read the contigs assigned to this sample for this chromosome by scanning fasta headers
                    tmp_fa_path = os.path.join(work_dir, 'tmp.fa')
                    if seq_id_map[event][0].endswith('.gz'):
                        tmp_fa_path += '.gz'
                    if os.path.isfile(tmp_fa_path):
                        os.remove(tmp_fa_path)
                    job.fileStore.readGlobalFile(remap_id_map[ref_contig]['fa'][event], tmp_fa_path, mutable=True)
                    contigs_path = os.path.join(work_dir, '{}.contigs'.format(event))
                    cactus_call(parameters=[['zcat' if tmp_fa_path.endswith('.gz') else 'cat', tmp_fa_path],
                                            ['grep', '>'], ['cut', '-c', '2-']], outfile=contigs_path)
                    # add them to the grep
                    with open(contigs_path, 'r') as contigs_file:
                        for line in contigs_file:
                            query_contig_set.add('id={}|{}'.format(event, line.strip()))

            if query_contig_set:
                # pull out remapped contigs into this path
                new_contig_path = os.path.join(work_dir, '{}.remap.paf'.format(ref_contig))
                do_append = False
                if ref_contig in original_id_map and 'paf' in original_id_map[ref_contig]:
                    job.fileStore.readGlobalFile(original_id_map[ref_contig]['paf'], new_contig_path, mutable=True)
                    do_append = True
                    
                # make an updated ambiguous paf with the contigs removed in this path                
                temp_contig_path = os.path.join(work_dir, amb_paf_path + '.temp.remove')                    
                with open(new_contig_path, 'a' if do_append else 'w') as new_contig_file, \
                     open(amb_paf_path, 'r') as amb_paf_file, \
                     open(temp_contig_path, 'w') as temp_contig_file:
                    # scan the ambiguous paf from minigraph
                    for line in amb_paf_file:
                        toks = line.split('\t')
                        if len(toks) > 5 and toks[0] in query_contig_set:
                            if toks[5] in mg_contig_set and not use_minimap_paf:
                                # move the contig if both the query and target belong to reference contig
                                new_contig_file.write(line)
                        else:
                            # leave the contig in ambiguous
                            temp_contig_file.write(line)
                    if use_minimap_paf:
                        # if we're taking the contigs from minigraph, append them here (as they weren't added in
                        # the loop above)
                        minimap_paf_path = os.path.join(work_dir, '{}.minimap.paf'.format(ref_contig))
                        job.fileStore.readGlobalFile(remap_id_map[ref_contig]['paf'], minimap_paf_path)
                        with open(minimap_paf_path, 'r') as minimap_paf_file:
                            for line in minimap_paf_file:
                                toks = line.split('\t')
                                if len(toks) > 5:
                                    toks[5] = 'id={}|{}'.format(options.reference, toks[5])
                                new_contig_file.write('\t'.join(toks))
                        
                # update the map
                original_id_map[ref_contig]['paf'] = job.fileStore.writeGlobalFile(new_contig_path)
                # update the ambiguous paf
                cactus_call(parameters=['mv', temp_contig_path, amb_paf_path])

    # update the ambiguous paf
    if amb_name in original_id_map and original_id_map[amb_name]:
        original_id_map[amb_name]['paf'] = job.fileStore.writeGlobalFile(amb_paf_path)
    else:
        assert os.path.getsize(amb_paf_path) == 0
    
    return original_id_map
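The inner loop of combine_paf_splits is a stream partition with one subtlety: a line whose query was remapped always leaves the ambiguous PAF, but it is carried over to the chromosome PAF only when its target is also a minigraph contig of that chromosome; otherwise the minigraph alignment is dropped (the minimap2 one may replace it). The core logic, isolated from the file plumbing and the use_minimap_paf switch:

def partition_paf(lines, remapped_queries, chrom_targets):
    moved, still_ambiguous = [], []
    for line in lines:
        toks = line.split('\t')
        if len(toks) > 5 and toks[0] in remapped_queries:
            if toks[5] in chrom_targets:
                moved.append(line)
            # else: drop the minigraph alignment entirely
        else:
            still_ambiguous.append(line)
    return moved, still_ambiguous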
Example #20
def split_gfa(job, config, gfa_id, paf_id, ref_contigs, other_contig,
              reference_event):
    """ Use rgfa-split to divide a GFA and PAF into chromosomes.  The GFA must be in minigraph RGFA output using
    the desired reference. """

    work_dir = job.fileStore.getLocalTempDir()
    gfa_path = os.path.join(work_dir, "mg.gfa")
    paf_path = os.path.join(work_dir, "mg.paf")
    out_prefix = os.path.join(work_dir, "split_")

    job.fileStore.readGlobalFile(gfa_id, gfa_path)
    job.fileStore.readGlobalFile(paf_id, paf_path)

    # get the minigraph "virutal" assembly name
    graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot,
                                                     "graphmap"),
                                    "assemblyName",
                                    default="_MINIGRAPH_")
    # and look up its unique id prefix.  this will be needed to pick its contigs out of the list
    mg_id = graph_event

    # get the specificity filters
    query_coverage = getOptionalAttrib(findRequiredNode(
        config.xmlRoot, "graphmap_split"),
                                       "minQueryCoverage",
                                       default="0")
    query_uniqueness = getOptionalAttrib(findRequiredNode(
        config.xmlRoot, "graphmap_split"),
                                         "minQueryUniqueness",
                                         default="0")
    amb_event = getOptionalAttrib(findRequiredNode(config.xmlRoot,
                                                   "graphmap_split"),
                                  "ambiguousName",
                                  default="_AMBIGUOUS_")

    cmd = [
        'rgfa-split', '-i', 'id={}|'.format(mg_id), '-G', '-g', gfa_path, '-p',
        paf_path, '-b', out_prefix, '-n', query_coverage, '-Q',
        query_uniqueness, '-a', amb_event
    ]
    if other_contig:
        cmd += ['-o', other_contig]
    if reference_event:
        cmd += ['-r', 'id={}|'.format(reference_event)]

    for contig in ref_contigs:
        cmd += ['-c', contig]

    cactus_call(parameters=cmd, work_dir=work_dir)

    output_id_map = {}
    for out_name in os.listdir(work_dir):
        file_name, ext = os.path.splitext(out_name)
        if file_name.startswith(os.path.basename(out_prefix)) and ext in [
                ".gfa", ".paf", ".fa_contigs"
        ]:
            name = file_name[len(os.path.basename(out_prefix)):]
            if name not in output_id_map:
                output_id_map[name] = {}
            output_id_map[name][ext[1:]] = job.fileStore.writeGlobalFile(
                os.path.join(work_dir, out_name))

    return output_id_map
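For reference, the output_id_map that split_gfa returns groups rgfa-split's per-chromosome outputs by contig name and file extension. A hypothetical shape for a split that produced chr1 files plus an ambiguous PAF (FileID values are Toil handles, shown as placeholders):

output_id_map = {
    "chr1": {
        "gfa": "FileID(...)",         # graph restricted to chr1
        "paf": "FileID(...)",         # alignments assigned to chr1
        "fa_contigs": "FileID(...)",  # query contig names assigned to chr1
    },
    "_AMBIGUOUS_": {
        "paf": "FileID(...)",         # alignments failing the coverage/uniqueness filters
    },
}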
Example #21
 def getDoTrimStrategy(self):
     trimBlastNode = findRequiredNode(self.xmlRoot, "trimBlast")
     if "doTrimStrategy" in trimBlastNode.attrib:
         return trimBlastNode.attrib["doTrimStrategy"] == "1"
     return False
Example #22
def cactusPrepare(options, project):
    """ annotate a SeqFile with ancestral names as well as paths for output sequences."""

    # read the input
    seqFile = SeqFile(options.seqFile)
    configNode = ET.parse(options.configFile).getroot()
    config = ConfigWrapper(configNode)

    if not options.wdl:
        # prepare output sequence directory
        # todo: support remote (ie s3) output directory
        try:
            os.makedirs(options.outDir)
        except OSError:
            pass
        if not os.path.isdir(options.outDir):
            raise RuntimeError('Unable to create output sequence directory \'{}\''.format(options.outDir))
        if not os.access(options.outDir, os.W_OK):
            logger.warning('Output sequence directory is not writeable: \'{}\''.format(options.outDir))

    if options.preprocessOnly or options.gpu:
        if options.preprocessOnly:
            # hack the configfile to skip preprocessing and write it to the output dir
            config.removePreprocessors()
        if options.gpu:
            # hack the configfile to toggle on gpu lastz
            cafNode = findRequiredNode(config.xmlRoot, "caf")
            cafNode.attrib["gpuLastz"] = "true"
            # realigning doesn't mix well with lastz so we make sure it's off
            # https://github.com/ComparativeGenomicsToolkit/cactus/issues/271
            cafNode.attrib["realign"] = "0"
        options.configFile = os.path.join(options.outDir, 'config-prepared.xml')
        sys.stderr.write("configuration saved in {}\n".format(options.configFile))
        config.writeXML(options.configFile)
        
    # pass the config file through to the options
    # todo: avoid this second hard-coded check of the .xml path
    if options.configFile != os.path.join(cactusRootPath(), "cactus_progressive_config.xml") and not options.wdl:
        options.cactusOptions += ' --configFile {}'.format(options.configFile)

    # get the ancestor names
    tree = MultiCactusTree(seqFile.tree)
    tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())

    # make the output
    outSeqFile = SeqFile()
    outSeqFile.tree = tree
    outSeqFile.pathMap = copy.deepcopy(seqFile.pathMap)
    outSeqFile.outgroups = copy.deepcopy(seqFile.outgroups)

    # update paths for preprocessed leaves or inferred ancestors
    for node in outSeqFile.tree.breadthFirstTraversal():
        name = outSeqFile.tree.getName(node)
        leaf = outSeqFile.tree.isLeaf(node)
        if leaf or (not leaf and name not in seqFile.pathMap and not options.preprocessOnly):
            out_basename = seqFile.pathMap[name] if name in seqFile.pathMap else '{}.fa'.format(name)
            outSeqFile.pathMap[name] = os.path.join(options.outDir, os.path.basename(out_basename))
            if options.wdl:
                # uniquify name in wdl to prevent collisions
                outSeqFile.pathMap[name] += '.pp'

    # write the output
    if options.outSeqFile:
        with open(options.outSeqFile, 'w') as out_sf:
            out_sf.write(str(outSeqFile))

    # write the instructions
    print(get_plan(options, project, seqFile, outSeqFile))
Example #23
def runCactusGraphMap(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            paf_id, gfa_fa_id = toil.restart()  # the workflow returns (paf_id, gfa_fa_id)
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides.  this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile).  one way to fix would be to add support
            # for s3 paths and force wdl to use it.  a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(
                    prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames,
                                          options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir,
                                            'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #load cactus config
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            config.substituteAllPredefinedConstantsWithLiterals()

            # get the minigraph "virutal" assembly name
            graph_event = getOptionalAttrib(findRequiredNode(
                configNode, "refgraph"),
                                            "assemblyName",
                                            default="__MINIGRAPH_SEQUENCES__")

            # load the seqfile
            seqFile = SeqFile(options.seqFile)

            logger.info("Genomes for graphmap, {}".format(seqFile.pathMap))

            if not options.outputFasta and graph_event not in seqFile.pathMap:
                raise RuntimeError(
                    "{} assembly not found in seqfile so it must be specified with --outputFasta"
                    .format(graph_event))

            #import the graph
            gfa_id = toil.importFile(makeURL(options.minigraphGFA))

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            seqIDMap = {}
            for genome, seq in seqFile.pathMap.items():
                if genome != graph_event:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    seqIDMap[genome] = toil.importFile(seq)

            # run the workflow
            paf_id, gfa_fa_id = toil.start(
                Job.wrapJobFn(minigraph_workflow, options, config, seqIDMap,
                              gfa_id, graph_event))

        #export the paf
        toil.exportFile(paf_id, makeURL(options.outputPAF))
        if gfa_fa_id:
            toil.exportFile(gfa_fa_id, makeURL(options.outputFasta))

        # update the input seqfile (in place!)
        add_genome_to_seqfile(options.seqFile, makeURL(options.outputFasta),
                              graph_event)
Example #25
def make_align_job(options, toil):
    options.cactusDir = getTempDirectory()

    # apply path overrides.  this was necessary for wdl which doesn't take kindly to
    # text files of local paths (ie seqfile).  one way to fix would be to add support
    # for s3 paths and force wdl to use it.  a better way would be a more fundamental
    # interface shift away from files of paths throughout all of cactus
    if options.pathOverrides:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        tree = MultiCactusTree(seqFile.tree)
        tree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        for name, override in zip(options.pathOverrideNames,
                                  options.pathOverrides):
            seqFile.pathMap[name] = override
        override_seq = os.path.join(options.cactusDir, 'seqFile.override')
        with open(override_seq, 'w') as out_sf:
            out_sf.write(str(seqFile))
        options.seqFile = override_seq

    if not options.root:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        mcTree = MultiCactusTree(seqFile.tree)
        mcTree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        options.root = mcTree.getRootName()

    if options.acyclic:
        seqFile = SeqFile(options.seqFile)
        tree = MultiCactusTree(seqFile.tree)
        leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
        if options.acyclic not in leaves:
            raise RuntimeError(
                "Genome specified with --acyclic, {}, not found in tree leaves"
                .format(options.acyclic))

    #to be consistent with all-in-one cactus, we make sure the project
    #isn't limiting itself to the subtree (todo: parameterize so root can
    #be passed through from prepare to blast/align)
    proj_options = copy.deepcopy(options)
    proj_options.root = None
    #Create the progressive cactus project (as we do in runCactusProgressive)
    projWrapper = ProjectWrapper(proj_options,
                                 proj_options.configFile,
                                 ignoreSeqPaths=options.root)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    # open up the experiment (as we do in ProgressiveUp.run)
    # note that we copy the path into the options here
    experimentFile = project.expMap[options.root]
    expXml = ET.parse(experimentFile).getroot()
    experiment = ExperimentWrapper(expXml)
    configPath = experiment.getConfigPath()
    configXml = ET.parse(configPath).getroot()

    seqIDMap = dict()
    tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
    leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
    outgroups = experiment.getOutgroupGenomes()
    genome_set = set(leaves + outgroups)

    # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
    def get_input_path(suffix=''):
        # prefer a cigars file carrying the exact suffix, if one was passed
        base_path = options.cigarsFile[0]
        for input_path in options.cigarsFile:
            if suffix and input_path.endswith(suffix):
                return input_path
            # otherwise track the path whose basename is a prefix of the current
            # base (the common stem), and synthesize the name from it below
            if os.path.basename(base_path).startswith(
                    os.path.basename(input_path)):
                base_path = input_path
        return base_path + suffix

    # import the outgroups
    outgroupIDs = []
    outgroup_fragment_found = False
    for i, outgroup in enumerate(outgroups):
        try:
            outgroupID = toil.importFile(
                makeURL(get_input_path('.og_fragment_{}'.format(i))))
            outgroupIDs.append(outgroupID)
            experiment.setSequenceID(outgroup, outgroupID)
            outgroup_fragment_found = True
            assert not options.pangenome
        except Exception:
            # we assume that input is not coming from cactus blast, so we'll treat output
            # sequences normally and not go looking for fragments
            outgroupIDs = []
            break

    # import the sequences (that we need to align for the given event, i.e. leaves and outgroups)
    for genome, seq in list(project.inputSequenceMap.items()):
        if genome in leaves or (not outgroup_fragment_found
                                and genome in outgroups):
            if os.path.isdir(seq):
                tmpSeq = getTempFile()
                catFiles(
                    [os.path.join(seq, subSeq) for subSeq in os.listdir(seq)],
                    tmpSeq)
                seq = tmpSeq
            seq = makeURL(seq)

            logger.info("Importing {}".format(seq))
            experiment.setSequenceID(genome, toil.importFile(seq))

    if not outgroup_fragment_found:
        outgroupIDs = [
            experiment.getSequenceID(outgroup) for outgroup in outgroups
        ]

    # write back the experiment, as CactusWorkflowArguments wants a path
    experiment.writeXML(experimentFile)

    #import cactus config
    if options.configFile:
        cactusConfigID = toil.importFile(makeURL(options.configFile))
    else:
        cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
    project.setConfigID(cactusConfigID)

    project.syncToFileStore(toil)
    configNode = ET.parse(project.getConfigPath()).getroot()
    configWrapper = ConfigWrapper(configNode)
    configWrapper.substituteAllPredefinedConstantsWithLiterals()

    if options.singleCopySpecies:
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["alignmentFilter"] = "singleCopyEvent:{}".format(
                options.singleCopySpecies)

    if options.barMaskFilter:
        findRequiredNode(
            configWrapper.xmlRoot,
            "bar").attrib["partialOrderAlignmentMaskFilter"] = str(
                options.barMaskFilter)

    if options.pangenome:
        # turn off the megablock filter as it ruins non-all-to-all alignments
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["minimumBlockHomologySupport"] = "0"
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["minimumBlockDegreeToCheckSupport"] = "9999999999"
        # turn off mapq filtering
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["runMapQFiltering"] = "0"
        # more iterations here helps quite a bit to reduce underalignment
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["maxRecoverableChainsIterations"] = "50"
        # turn down minimum block degree to get a fat ancestor
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["minimumBlockDegree"] = "1"
        # turn on POA
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["partialOrderAlignment"] = "1"
        # save it
        if not options.batch:
            pg_file = options.outHal + ".pg-conf.xml"
            if pg_file.startswith('s3://'):
                pg_temp_file = getTempFile()
            else:
                pg_temp_file = pg_file
            configWrapper.writeXML(pg_temp_file)
            if pg_file.startswith('s3://'):
                write_s3(pg_temp_file,
                         pg_file,
                         region=get_aws_region(options.jobStore))
            logger.info("pangenome configuration overrides saved in {}".format(
                pg_file))

    workFlowArgs = CactusWorkflowArguments(options,
                                           experimentFile=experimentFile,
                                           configNode=configNode,
                                           seqIDMap=project.inputSequenceIDMap)

    #import the files that cactus-blast made
    workFlowArgs.alignmentsID = toil.importFile(makeURL(get_input_path()))
    workFlowArgs.secondaryAlignmentsID = None
    if not options.pafInput:
        try:
            workFlowArgs.secondaryAlignmentsID = toil.importFile(
                makeURL(get_input_path('.secondary')))
        except Exception:
            pass
    workFlowArgs.outgroupFragmentIDs = outgroupIDs
    workFlowArgs.ingroupCoverageIDs = []
    if outgroup_fragment_found and len(outgroups) > 0:
        for i in range(len(leaves)):
            workFlowArgs.ingroupCoverageIDs.append(
                toil.importFile(
                    makeURL(get_input_path('.ig_coverage_{}'.format(i)))))

    align_job = Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              checkpointInfo=options.checkpointInfo,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput,
                              pafSecondaries=options.usePafSecondaries,
                              doVG=options.outVG,
                              doGFA=options.outGFA,
                              delay=options.stagger,
                              eventNameAsID=options.eventNameAsID,
                              acyclicEvent=options.acyclic)
    return align_job
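
A hedged usage sketch: following the Toil entry-point pattern used elsewhere in this listing (e.g. Example #31 below), the returned job would be started inside a Toil context; the result name here is illustrative:

with Toil(options) as toil:
    importSingularityImage(options)
    if options.restart:
        results = toil.restart()
    else:
        align_job = make_align_job(options, toil)
        results = toil.start(align_job)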
Example #26
def disableRecoverableChains(self):
    """Make sure the recoverable-chains filter is off in caf """
    cafNode = findRequiredNode(self.xmlRoot, "caf")
    cafNode.attrib["removeRecoverableChains"] = "0"
Example #27
def disableCafMegablockFilter(self):
    """Make sure the megablock filter is off in caf """
    cafNode = findRequiredNode(self.xmlRoot, "caf")
    cafNode.attrib["minimumBlockHomologySupport"] = "0"
    cafNode.attrib["minimumBlockDegreeToCheckSupport"] = "9999999999"
Example #28
def split_gfa(job, config, gfa_id, paf_ids, ref_contigs, other_contig,
              reference_event, mask_bed_id):
    """ Use rgfa-split to divide a GFA and PAF into chromosomes.  The GFA must be in minigraph RGFA output using
    the desired reference. """

    if not paf_ids:
        # we can bypass when, e.g., doing the second pass on ambiguous sequences but none are present
        return [None, None]

    work_dir = job.fileStore.getLocalTempDir()
    gfa_path = os.path.join(work_dir, "mg.gfa")
    paf_path = os.path.join(work_dir, "mg.paf")
    out_prefix = os.path.join(work_dir, "split_")
    bed_path = os.path.join(work_dir, "mask.bed")
    log_path = os.path.join(work_dir, "split.log")
    if mask_bed_id:
        job.fileStore.readGlobalFile(mask_bed_id, bed_path)

    if gfa_id:
        job.fileStore.readGlobalFile(gfa_id, gfa_path)

    paf_paths = []
    for i, paf_id in enumerate(paf_ids):
        paf_paths.append(
            '{}.{}'.format(paf_path, i) if len(paf_ids) > 1 else paf_path)
        job.fileStore.readGlobalFile(paf_id, paf_paths[-1])
    if len(paf_paths) > 1:
        catFiles(paf_paths, paf_path)

    # get the minigraph "virtual" assembly name
    graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"),
                                    "assemblyName", default="_MINIGRAPH_")
    # and look up its unique id prefix.  this will be needed to pick its contigs out of the list
    mg_id = graph_event

    # get the specificity filters
    query_coverage = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"),
                                       "minQueryCoverage", default="0")
    small_query_coverage = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"),
                                             "minQuerySmallCoverage", default="0")
    small_coverage_threshold = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"),
                                                 "minQuerySmallThreshold", default="0")
    query_uniqueness = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"),
                                         "minQueryUniqueness", default="0")
    max_gap = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"),
                                "maxGap", default="0")
    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"),
                                 "ambiguousName", default="_AMBIGUOUS_")

    cmd = [
        'rgfa-split', '-p', paf_path, '-b', out_prefix, '-n', query_coverage,
        '-N', small_query_coverage, '-T', small_coverage_threshold, '-Q',
        query_uniqueness, '-P', max_gap, '-a', amb_name, '-L', log_path
    ]
    if gfa_id:
        cmd += ['-g', gfa_path, '-G']
    if other_contig:
        cmd += ['-o', other_contig]
    if reference_event:
        cmd += ['-r', 'id={}|'.format(reference_event)]
    if mask_bed_id:
        cmd += ['-B', bed_path]
    min_mapq = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"),
                                 "minMAPQ")
    if min_mapq:
        cmd += ['-A', min_mapq]

    for contig in ref_contigs:
        cmd += ['-c', contig]

    cactus_call(parameters=cmd, work_dir=work_dir)

    output_id_map = {}
    for out_name in os.listdir(work_dir):
        file_name, ext = os.path.splitext(out_name)
        if file_name.startswith(os.path.basename(out_prefix)) and ext in [".gfa", ".paf", ".fa_contigs"]:
            name = file_name[len(os.path.basename(out_prefix)):]
            if name not in output_id_map:
                output_id_map[name] = {}
            output_id_map[name][ext[1:]] = job.fileStore.writeGlobalFile(
                os.path.join(work_dir, out_name))

    return output_id_map, job.fileStore.writeGlobalFile(log_path)
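
An illustrative consumer of the first return value; its shape (contig name mapped to {extension: file ID}) follows from the loop above, and the ambiguous-sequence entry typically has no 'gfa' key:

def iter_split_outputs(output_id_map):
    # yield (contig, kind, file_id) triples from split_gfa's output map
    for contig, files in output_id_map.items():
        for kind in ('gfa', 'paf', 'fa_contigs'):
            if kind in files:
                yield contig, kind, files[kind]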
Example #29
def getDoTrimStrategy(self):
    trimBlastNode = findRequiredNode(self.xmlRoot, "trimBlast")
    if "doTrimStrategy" in trimBlastNode.attrib:
        return trimBlastNode.attrib["doTrimStrategy"] == "1"
    return False
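
A minimal check of the parsing convention above ("1" means enabled), under the same assumed ConfigWrapper import as in the earlier sketch:

import xml.etree.ElementTree as ET
from cactus.shared.configWrapper import ConfigWrapper  # assumed import path

config = ConfigWrapper(ET.fromstring('<cactus><trimBlast doTrimStrategy="1"/></cactus>'))
assert config.getDoTrimStrategy()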
Example #30
def export_split_data(toil, input_seq_id_map, output_id_map, split_log_ids,
                      output_dir, config):
    """ download all the split data locally """

    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot,
                                                  "graphmap_split"),
                                 "ambiguousName",
                                 default="_AMBIGUOUS_")

    chrom_file_map = {}

    for ref_contig in output_id_map.keys():
        ref_contig_path = os.path.join(output_dir, ref_contig)
        if not os.path.isdir(ref_contig_path) and not ref_contig_path.startswith('s3://'):
            os.makedirs(ref_contig_path)

        # GFA: <output_dir>/<contig>/<contig>.gfa
        if 'gfa' in output_id_map[ref_contig]:
            # we do this check because no GFA is made for the ambiguous sequences "contig"
            toil.exportFile(
                output_id_map[ref_contig]['gfa'],
                makeURL(
                    os.path.join(ref_contig_path,
                                 '{}.gfa'.format(ref_contig))))

        # PAF: <output_dir>/<contig>/<contig>.paf
        paf_path = os.path.join(ref_contig_path, '{}.paf'.format(ref_contig))
        toil.exportFile(output_id_map[ref_contig]['paf'], makeURL(paf_path))

        # Fasta: <output_dir>/<contig>/fasta/<event>_<contig>.fa ..
        seq_file_map = {}
        for event, ref_contig_fa_id in output_id_map[ref_contig]['fa'].items():
            fa_base = os.path.join(ref_contig_path, 'fasta')
            if not os.path.isdir(fa_base) and not fa_base.startswith('s3://'):
                os.makedirs(fa_base)
            fa_path = makeURL(
                os.path.join(fa_base, '{}_{}.fa'.format(event, ref_contig)))
            if input_seq_id_map[event][0].endswith('.gz'):
                fa_path += '.gz'
            seq_file_map[event] = fa_path
            toil.exportFile(ref_contig_fa_id, fa_path)

        # Seqfile: <output_dir>/seqfiles/<contig>.seqfile
        seq_file_path = os.path.join(output_dir, 'seqfiles',
                                     '{}.seqfile'.format(ref_contig))
        if seq_file_path.startswith('s3://'):
            seq_file_temp_path = getTempFile()
        else:
            seq_file_temp_path = seq_file_path
            if not os.path.isdir(os.path.dirname(seq_file_path)):
                os.makedirs(os.path.dirname(seq_file_path))
        with open(seq_file_temp_path, 'w') as seq_file:
            for event, fa_path in seq_file_map.items():
                # cactus can't handle empty fastas.  if there are no sequences for a sample for this
                # contig, just don't add it.
                if output_id_map[ref_contig]['fa'][event].size > 0:
                    seq_file.write('{}\t{}\n'.format(event, fa_path))
        if seq_file_path.startswith('s3://'):
            write_s3(seq_file_temp_path, seq_file_path)

        # Top-level seqfile
        chrom_file_map[ref_contig] = seq_file_path, paf_path

    # Chromfile: <output_dir>/chromfile.txt
    chrom_file_path = os.path.join(output_dir, 'chromfile.txt')
    if chrom_file_path.startswith('s3://'):
        chrom_file_temp_path = getTempFile()
    else:
        chrom_file_temp_path = chrom_file_path
    with open(chrom_file_temp_path, 'w') as chromfile:
        for ref_contig, seqfile_paf in chrom_file_map.items():
            if ref_contig != amb_name:
                seqfile, paf = seqfile_paf[0], seqfile_paf[1]
                if seqfile.startswith('s3://'):
                    # an absolute s3 reference is no use here, as cactus-align requires seqfiles passed locally
                    seqfile = 'seqfiles/{}'.format(os.path.basename(seqfile))
                chromfile.write('{}\t{}\t{}\n'.format(ref_contig, seqfile,
                                                      paf))
    if chrom_file_path.startswith('s3://'):
        write_s3(chrom_file_temp_path, chrom_file_path)

    toil.exportFile(split_log_ids[0],
                    makeURL(os.path.join(output_dir, 'minigraph.split.log')))
    if split_log_ids[1]:
        toil.exportFile(
            split_log_ids[1],
            makeURL(os.path.join(output_dir, 'minimap2.ambiguous.split.log')))
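
For orientation, a hypothetical helper that mirrors the directory layout written above (paths follow the comments in the code; the names are illustrative):

import os

def split_output_paths(output_dir, ref_contig, event):
    # layout as written by export_split_data above
    return {
        'gfa': os.path.join(output_dir, ref_contig, '{}.gfa'.format(ref_contig)),
        'paf': os.path.join(output_dir, ref_contig, '{}.paf'.format(ref_contig)),
        'fasta': os.path.join(output_dir, ref_contig, 'fasta', '{}_{}.fa'.format(event, ref_contig)),
        'seqfile': os.path.join(output_dir, 'seqfiles', '{}.seqfile'.format(ref_contig)),
        'chromfile': os.path.join(output_dir, 'chromfile.txt'),
    }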
Example #31
def runCactusGraphMapSplit(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            wf_output = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            #load cactus config
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            config.substituteAllPredefinedConstantsWithLiterals()

            # load up the contigs if any
            ref_contigs = set(options.refContigs)
            # todo: use import?
            if options.refContigsFile:
                with open(options.refContigsFile, 'r') as rc_file:
                    for line in rc_file:
                        if len(line.strip()):
                            ref_contigs.add(line.strip().split()[0])

            if options.otherContig:
                assert options.otherContig not in ref_contigs

            # get the minigraph "virtual" assembly name
            graph_event = getOptionalAttrib(findRequiredNode(configNode, "graphmap"),
                                            "assemblyName", default="_MINIGRAPH_")

            # load the seqfile
            seqFile = SeqFile(options.seqFile)

            #import the graph
            gfa_id = toil.importFile(makeURL(options.minigraphGFA))

            #import the paf
            paf_id = toil.importFile(makeURL(options.graphmapPAF))

            # import the sequences (that we need to align for the given event, i.e. leaves and outgroups)
            seqIDMap = {}
            leaves = set(seqFile.tree.getName(node) for node in seqFile.tree.getLeaves())

            if graph_event not in leaves:
                raise RuntimeError(
                    "Minigraph name {} not found in seqfile".format(
                        graph_event))
            if options.reference and options.reference not in leaves:
                raise RuntimeError(
                    "Name given with --reference {} not found in seqfile".
                    format(options.reference))

            for genome, seq in seqFile.pathMap.items():
                if genome in leaves:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    logger.info("Importing {}".format(seq))
                    seqIDMap[genome] = (seq, toil.importFile(seq))

            # run the workflow
            wf_output = toil.start(
                Job.wrapJobFn(graphmap_split_workflow, options, config,
                              seqIDMap, gfa_id, options.minigraphGFA, paf_id,
                              options.graphmapPAF, ref_contigs,
                              options.otherContig))

        #export the split data
        export_split_data(toil, wf_output[0], wf_output[1], wf_output[2:],
                          options.outDir, config)
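
makeURL is used throughout these examples to hand both local paths and remote URLs to Toil's importFile/exportFile. A minimal sketch of what such a helper might do (an assumption, not the actual Cactus implementation):

import os

def makeURL_sketch(path_or_url):
    # pass real URLs (s3://, http://, file://, ...) through untouched;
    # promote bare local paths to file:// URLs so Toil can import them
    if '://' in path_or_url:
        return path_or_url
    return 'file://' + os.path.abspath(path_or_url)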
Example #32
def split_gfa(job, config, gfa_id, paf_ids, ref_contigs, other_contig, reference_event, mask_bed_id):
    """ Use rgfa-split to divide a GFA and PAF into chromosomes.  The GFA must be in minigraph RGFA output using
    the desired reference. """

    if not paf_ids:
        # we can bypass when, e.g., doing the second pass on ambiguous sequences but none are present
        return [None, None]

    if not gfa_id and not getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "remap", typeFn=bool, default=False):
        # also bypass if remapping is off in the config (we know it's the second pass because gfa_id is None)
        return [None, None]
    
    work_dir = job.fileStore.getLocalTempDir()
    gfa_path = os.path.join(work_dir, "mg.gfa")
    paf_path = os.path.join(work_dir, "mg.paf")
    out_prefix = os.path.join(work_dir, "split_")
    bed_path = os.path.join(work_dir, "mask.bed")
    log_path = os.path.join(work_dir, "split.log")
    if mask_bed_id:
        job.fileStore.readGlobalFile(mask_bed_id, bed_path)

    if gfa_id:
        job.fileStore.readGlobalFile(gfa_id, gfa_path)
        
    paf_paths = []
    for i, paf_id in enumerate(paf_ids):
        paf_paths.append('{}.{}'.format(paf_path, i) if len(paf_ids) > 1 else paf_path)
        job.fileStore.readGlobalFile(paf_id, paf_paths[-1])
    if len(paf_paths) > 1:
        catFiles(paf_paths, paf_path)
    
    # get the minigraph "virtual" assembly name
    graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "assemblyName", default="_MINIGRAPH_")
    # and look up its unique id prefix.  this will be needed to pick its contigs out of the list
    mg_id = graph_event

    # get the specificity filters
    query_coverage = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQueryCoverage", default="0")
    small_query_coverage = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQuerySmallCoverage", default="0")
    small_coverage_threshold = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQuerySmallThreshold", default="0")
    query_uniqueness = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQueryUniqueness", default="0")
    max_gap = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "maxGap", default="0")
    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "ambiguousName", default="_AMBIGUOUS_")

    cmd = ['rgfa-split',
           '-p', paf_path,
           '-b', out_prefix,
           '-n', query_coverage,
           '-N', small_query_coverage,
           '-T', small_coverage_threshold,
           '-Q', query_uniqueness,
           '-P', max_gap,
           '-a', amb_name,
           '-L', log_path]
    if gfa_id:
        cmd += ['-g', gfa_path, '-G']
    if other_contig:
        cmd += ['-o', other_contig]
    if reference_event:
        cmd += ['-r', 'id={}|'.format(reference_event)]
    if mask_bed_id:
        cmd += ['-B', bed_path]
    min_mapq = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minMAPQ")
    if min_mapq:
        cmd += ['-A', min_mapq]
    # optional stuff added to second pass:
    if not gfa_id:
        remap_opts = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "remapSplitOptions", default=None)
        if remap_opts:
            cmd += remap_opts.split(' ')        
    for contig in ref_contigs:
        cmd += ['-c', contig]

    cactus_call(parameters=cmd, work_dir=work_dir)

    output_id_map = {}
    for out_name in os.listdir(work_dir):
        file_name, ext = os.path.splitext(out_name)
        if file_name.startswith(os.path.basename(out_prefix)) and ext in [".gfa", ".paf", ".fa_contigs"] and \
           os.path.isfile(os.path.join(work_dir, file_name + ".fa_contigs")):
            name = file_name[len(os.path.basename(out_prefix)):]
            if name not in output_id_map:
                output_id_map[name] = {}
            if ext == '.paf':
                # apply the hacky naming correction so that subpaths have no special characters in the hal (to make hubs happy)
                # this gets undone by hal2vg
                cactus_call(parameters=['sed', '-i', '-e', 's/\([^:]*\):\([0-9]*\)-\([0-9]*\)/echo "\\1_sub_$((\\2-1))_\\3"/e',
                                        '-e', 's/ /\t/g', os.path.join(work_dir, out_name)]) 
            output_id_map[name][ext[1:]] = job.fileStore.writeGlobalFile(os.path.join(work_dir, out_name))
            
    return output_id_map, job.fileStore.writeGlobalFile(log_path)
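
The sed expression above encodes a naming transform for subranges; a pure-Python rendering of that transform as I read it (name:start-end becomes name_sub_{start-1}_end, i.e. the start is made 0-based):

import re

def rename_subpath(name):
    # chr1:100-200 -> chr1_sub_99_200, mirroring the sed substitution above
    m = re.fullmatch(r'([^:]*):(\d+)-(\d+)', name)
    if m:
        return '{}_sub_{}_{}'.format(m.group(1), int(m.group(2)) - 1, m.group(3))
    return name

assert rename_subpath('chr1:100-200') == 'chr1_sub_99_200'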