Example #1
def computePAFCoverage(job, config_node, paf_id):
    """ compute the gaps in PAF coverage, store them as a bed file, and add the bed file's filestore id
    into the config's dna-brnn xml element """
    paf_file = job.fileStore.readGlobalFile(paf_id)
    bed_file = job.fileStore.getLocalTempFile()

    dnabrnn_node = None
    for node in config_node.findall("preprocessor"):
        if getOptionalAttrib(node, "preprocessJob") == 'dna-brnn':
            dnabrnn_node = node
            break

    assert dnabrnn_node is not None

    min_length = max(
        1, getOptionalAttrib(dnabrnn_node, 'minLength', typeFn=int, default=0))

    cactus_call(
        parameters=['pafcoverage', paf_file, '-g', '-m',
                    str(min_length)],
        outfile=bed_file)

    dnabrnn_node.attrib["inputBedID"] = job.fileStore.writeGlobalFile(bed_file)

    return config_node
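
A job function like this is normally scheduled through Toil rather than called directly. A minimal, hypothetical driver (the run_paf_coverage wrapper, config path and PAF path below are illustrative assumptions, not part of the cactus code above) could look roughly like this:

import os
import xml.etree.ElementTree as ET
from toil.common import Toil
from toil.job import Job

def run_paf_coverage(options, config_path, paf_path):
    # Hypothetical driver: import the PAF into the job store, then schedule
    # computePAFCoverage, which returns the updated config element tree.
    config_node = ET.parse(config_path).getroot()
    with Toil(options) as toil:
        paf_id = toil.importFile("file://" + os.path.abspath(paf_path))
        root_job = Job.wrapJobFn(computePAFCoverage, config_node, paf_id)
        return toil.start(root_job)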
Example #2
def merge_gafs_into_paf(job, config, gaf_file_ids):
    """ Merge GAF alignments into a single PAF, applying some filters """

    work_dir = job.fileStore.getLocalTempDir()
    paf_path = os.path.join(work_dir, "mz_alignments.paf")
    gaf_paths = []
    for i, gaf_id in enumerate(gaf_file_ids):
        gaf_paths.append("mz_alignment_{}.gaf".format(i))
        job.fileStore.readGlobalFile(gaf_id,
                                     os.path.join(work_dir, gaf_paths[-1]))

    xml_node = findRequiredNode(config.xmlRoot, "refgraph")
    mzgaf2paf_opts = []
    mz_filter = getOptionalAttrib(xml_node, "universalMZFilter", float)
    if mz_filter:
        mzgaf2paf_opts += ['-u', str(mz_filter)]
    min_mz = getOptionalAttrib(xml_node, "minMZBlockLength", int)
    if min_mz:
        mzgaf2paf_opts += ['-m', str(min_mz)]
    mapq = getOptionalAttrib(xml_node, "minMAPQ", int)
    if mapq:
        mzgaf2paf_opts += ['-q', str(mapq)]
    gaf_block = getOptionalAttrib(xml_node, "minGAFBlockLength", int)
    if gaf_block:
        mzgaf2paf_opts += ['-b', str(gaf_block)]

    cactus_call(work_dir=work_dir,
                outfile=paf_path,
                parameters=["mzgaf2paf"] + gaf_paths + mzgaf2paf_opts)

    # these are big, get rid of them as soon as we can (which is now)
    for gaf_id in gaf_file_ids:
        job.fileStore.deleteGlobalFile(gaf_id)

    return job.fileStore.writeGlobalFile(paf_path)
Example #3
    def run(self, fileStore):
        # Parse the "preprocessor" config xml element
        assert self.iteration < len(self.prepXmlElems)

        prepNode = self.prepXmlElems[self.iteration]
        prepOptions = PreprocessorOptions(chunkSize = int(prepNode.get("chunkSize", default="-1")),
                                          preprocessJob=prepNode.attrib["preprocessJob"],
                                          memory = int(prepNode.get("memory", default=0)),
                                          cpu = int(prepNode.get("cpu", default=1)),
                                          check = bool(int(prepNode.get("check", default="0"))),
                                          proportionToSample = getOptionalAttrib(prepNode, "proportionToSample", typeFn=float, default=1.0),
                                          unmask = getOptionalAttrib(prepNode, "unmask", typeFn=bool, default=False),
                                          lastzOptions = getOptionalAttrib(prepNode, "lastzOpts", default=""),
                                          minPeriod = getOptionalAttrib(prepNode, "minPeriod", typeFn=int, default=0),
                                          checkAssemblyHub = getOptionalAttrib(prepNode, "checkAssemblyHub", typeFn=bool, default=False))

        lastIteration = self.iteration == len(self.prepXmlElems) - 1

        if prepOptions.unmask:
            inSequence = fileStore.readGlobalFile(self.inSequenceID)
            unmaskedInputFile = fileStore.getLocalTempFile()
            unmaskFasta(inSequence, unmaskedInputFile)
            self.inSequenceID = fileStore.writeGlobalFile(unmaskedInputFile)

        outSeqID = self.addChild(PreprocessSequence(prepOptions, self.inSequenceID)).rv()

        if lastIteration == False:
            return self.addFollowOn(BatchPreprocessor(self.prepXmlElems, outSeqID, self.iteration + 1)).rv()
        else:
            return outSeqID
Example #4
 def getPreprocessorActive(self, preprocessorJob, default_val=True):
     """Get active flag of preprocessor (first one with name match)"""
     for node in self.xmlRoot.findall("preprocessor"):
         if getOptionalAttrib(node, "preprocessJob") == preprocessorJob:
             return getOptionalAttrib(
                 node, "active", default="1" if default_val else "0") == "1"
     return default_val
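
Every example on this page leans on the cactus helper getOptionalAttrib. Its real implementation is not shown here, but from the call sites above a simplified stand-in would behave roughly as follows (the bool handling in particular is an assumption):

def getOptionalAttrib(node, attribName, typeFn=None, default=None):
    """Return the attribute converted through typeFn, or default when absent.
    Simplified stand-in; the actual cactus helper may differ in details."""
    if node is not None and attribName in node.attrib:
        value = node.attrib[attribName]
        if typeFn is bool:
            # XML stores flags as strings; treat "0"/"false" as False (assumption)
            return value.lower() not in ("0", "false")
        if typeFn is not None:
            return typeFn(value)
        return value
    return default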
Example #5
    def run(self, fileStore):
        self.configNode = ET.parse(fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        logger.info("Progressive Up: " + self.event)

        # open up the experiment
        # note that we copy the path into the options here
        experimentFile = fileStore.readGlobalFile(self.project.expIDMap[self.event])
        expXml = ET.parse(experimentFile).getroot()
        experiment = ExperimentWrapper(expXml)
        configPath = fileStore.readGlobalFile(experiment.getConfigID())
        configXml = ET.parse(configPath).getroot()

        seqIDMap = dict()
        tree = experiment.getTree()
        seqNames = []
        for node in tree.postOrderTraversal():
            if tree.isLeaf(node):
                name = tree.getName(node)
                seqIDMap[name] = self.project.outputSequenceIDMap[name]
                seqNames.append(name)
        logger.info("Sequences in progressive, %s: %s" % (self.event, seqNames))
            
        experimentFile = fileStore.getLocalTempFile()
        experiment.writeXML(experimentFile)
        self.options.experimentFileID = fileStore.writeGlobalFile(experimentFile)

        # take union of command line options and config options for hal and reference
        if self.options.buildReference == False:
            refNode = findRequiredNode(configXml, "reference")
            self.options.buildReference = getOptionalAttrib(refNode, "buildReference", bool, False)
        halNode = findRequiredNode(configXml, "hal")
        if self.options.buildHal == False:
            self.options.buildHal = getOptionalAttrib(halNode, "buildHal", bool, False)
        if self.options.buildFasta == False:
            self.options.buildFasta = getOptionalAttrib(halNode, "buildFasta", bool, False)

        # get parameters that cactus_workflow stuff wants
        configFile = fileStore.readGlobalFile(experiment.getConfigID())
        configNode = ET.parse(configFile).getroot()
        workFlowArgs = CactusWorkflowArguments(self.options, experimentFile=experimentFile, configNode=configNode, seqIDMap = seqIDMap)

        # copy over the options so we don't trail them around
        workFlowArgs.buildReference = self.options.buildReference
        workFlowArgs.buildHal = self.options.buildHal
        workFlowArgs.buildFasta = self.options.buildFasta
        workFlowArgs.globalLeafEventSet = self.options.globalLeafEventSet
        if self.options.intermediateResultsUrl is not None:
            # Give the URL prefix a special name for this particular
            # subproblem (by suffixing it with the name of the
            # internal node in the guide tree)
            workFlowArgs.intermediateResultsUrl = self.options.intermediateResultsUrl + '-' + self.event

        # Use the trimming strategy to blast ingroups vs outgroups.
        finalExpWrapper = self.addChild(CactusTrimmingBlastPhase(cactusWorkflowArguments=workFlowArgs, phaseName="trimBlast")).rv()
        logger.info("Going to create alignments and define the cactus tree")

        return finalExpWrapper
Example #6
def clip_vg(job, options, config, vg_path, vg_id):
    """ run clip-vg 
    """
    work_dir = job.fileStore.getLocalTempDir()
    is_decoy = vg_path == options.decoyGraph
    vg_path = os.path.join(work_dir, os.path.basename(vg_path))
    job.fileStore.readGlobalFile(vg_id, vg_path)
    out_path = vg_path + '.clip'

    cmd = ['clip-vg', vg_path, '-f']
    if options.clipLength is not None and not is_decoy:
        cmd += ['-u', str(options.clipLength)]
    for rs in options.rename:
        cmd += ['-r', rs]
    if options.reference:
        cmd += ['-e', options.reference]
    
    if getOptionalAttrib(findRequiredNode(config.xmlRoot, "hal2vg"), "includeMinigraph", typeFn=bool, default=False):
        # our vg file has minigraph sequences -- we'll filter them out, along with any nodes
        # that don't appear in a non-minigraph path
        graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "assemblyName", default="_MINIGRAPH_")
        cmd += ['-d', graph_event]
        
    # sort while we're at it
    cmd = [cmd, ['vg', 'ids', '-s', '-']]
        
    cactus_call(parameters=cmd, outfile=out_path)

    # worth it
    cactus_call(parameters=['vg', 'validate', out_path])

    return job.fileStore.writeGlobalFile(out_path)
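
cactus_call appears to treat a list of lists as a shell pipeline (clip-vg | vg ids -s - here, and vg view | gzip in Example #12 below). A rough, hypothetical stand-in for that convention using only the standard library:

import subprocess

def run_piped(commands, outfile):
    # Each inner list is one command; stdout of each feeds stdin of the next,
    # and the final command's stdout is written to outfile.
    with open(outfile, "wb") as out:
        procs = []
        prev_stdout = None
        for i, cmd in enumerate(commands):
            last = (i == len(commands) - 1)
            proc = subprocess.Popen(cmd, stdin=prev_stdout,
                                    stdout=out if last else subprocess.PIPE)
            procs.append(proc)
            prev_stdout = proc.stdout
        for proc in procs:
            proc.wait()

# e.g. run_piped([['clip-vg', 'in.vg', '-f'], ['vg', 'ids', '-s', '-']], 'out.vg')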
Example #7
    def run(self, fileStore):
        self.configNode = ET.parse(fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        logger.info("Progressive Up: " + self.event)

        # open up the experiment
        # note that we copy the path into the options here
        experimentFile = fileStore.readGlobalFile(self.project.expIDMap[self.event])
        expXml = ET.parse(experimentFile).getroot()
        experiment = ExperimentWrapper(expXml)
        configPath = fileStore.readGlobalFile(experiment.getConfigID())
        configXml = ET.parse(configPath).getroot()

        seqIDMap = dict()
        tree = experiment.getTree()
        seqNames = []
        for node in tree.postOrderTraversal():
            if tree.isLeaf(node):
                name = tree.getName(node)
                seqIDMap[name] = self.project.outputSequenceIDMap[name]
                seqNames.append(name)
        logger.info("Sequences in progressive, %s: %s" % (self.event, seqNames))
            
        experimentFile = fileStore.getLocalTempFile()
        experiment.writeXML(experimentFile)
        self.options.experimentFileID = fileStore.writeGlobalFile(experimentFile)

        # take union of command line options and config options for hal and reference
        if self.options.buildReference == False:
            refNode = findRequiredNode(configXml, "reference")
            self.options.buildReference = getOptionalAttrib(refNode, "buildReference", bool, False)
        halNode = findRequiredNode(configXml, "hal")
        if self.options.buildHal == False:
            self.options.buildHal = getOptionalAttrib(halNode, "buildHal", bool, False)
        if self.options.buildFasta == False:
            self.options.buildFasta = getOptionalAttrib(halNode, "buildFasta", bool, False)

        # get parameters that cactus_workflow stuff wants
        configFile = fileStore.readGlobalFile(experiment.getConfigID())
        configNode = ET.parse(configFile).getroot()
        workFlowArgs = CactusWorkflowArguments(self.options, experimentFile=experimentFile, configNode=configNode, seqIDMap = seqIDMap)

        # copy over the options so we don't trail them around
        workFlowArgs.buildReference = self.options.buildReference
        workFlowArgs.buildHal = self.options.buildHal
        workFlowArgs.buildFasta = self.options.buildFasta
        workFlowArgs.globalLeafEventSet = self.options.globalLeafEventSet
        if self.options.intermediateResultsUrl is not None:
            # Give the URL prefix a special name for this particular
            # subproblem (by suffixing it with the name of the
            # internal node in the guide tree)
            workFlowArgs.intermediateResultsUrl = self.options.intermediateResultsUrl + '-' + self.event

        # Use the trimming strategy to blast ingroups vs outgroups.
        finalExpWrapper = self.addChild(CactusTrimmingBlastPhase(cactusWorkflowArguments=workFlowArgs, phaseName="trimBlast")).rv()
        logger.info("Going to create alignments and define the cactus tree")

        return finalExpWrapper
Example #8
    def run(self, fileStore):
        # Parse the "preprocessor" config xml element     
        assert self.iteration < len(self.prepXmlElems)
        
        prepNode = self.prepXmlElems[self.iteration]
        prepOptions = PreprocessorOptions(chunkSize = int(prepNode.get("chunkSize", default="-1")),
                                          preprocessJob=prepNode.attrib["preprocessJob"],
                                          memory = int(prepNode.get("memory", default=0)),
                                          cpu = int(prepNode.get("cpu", default=1)),
                                          check = bool(int(prepNode.get("check", default="0"))),
                                          proportionToSample = getOptionalAttrib(prepNode, "proportionToSample", typeFn=float, default=1.0),
                                          unmask = getOptionalAttrib(prepNode, "unmask", typeFn=bool, default=False),
                                          lastzOptions = getOptionalAttrib(prepNode, "lastzOpts", default=""),
                                          minPeriod = getOptionalAttrib(prepNode, "minPeriod", typeFn=int, default="0"),
                                          checkAssemblyHub = getOptionalAttrib(prepNode, "checkAssemblyHub", typeFn=bool, default=False))
        
        lastIteration = self.iteration == len(self.prepXmlElems) - 1

        if prepOptions.unmask:
            inSequence = fileStore.readGlobalFile(self.inSequenceID)
            unmaskedInputFile = fileStore.getLocalTempFile()
            unmaskFasta(inSequence, unmaskedInputFile)
            self.inSequenceID = fileStore.writeGlobalFile(unmaskedInputFile)

        outSeqID = self.addChild(PreprocessSequence(prepOptions, self.inSequenceID)).rv()
        
        if lastIteration == False:
            return self.addFollowOn(BatchPreprocessor(self.prepXmlElems, outSeqID, self.iteration + 1)).rv()
        else:
            return outSeqID
Example #9
 def __init__(self, prepXmlElems, inSequence, globalOutSequence, iteration=0):
     Target.__init__(self, time=0.0002)
     self.prepXmlElems = prepXmlElems
     self.inSequence = inSequence
     self.globalOutSequence = globalOutSequence
     prepNode = self.prepXmlElems[iteration]
     self.memory = getOptionalAttrib(prepNode, "memory", typeFn=int, default=sys.maxint)
     self.cpu = getOptionalAttrib(prepNode, "cpu", typeFn=int, default=sys.maxint)
     self.iteration = iteration
Example #10
def minigraph_map_all(job, config, gfa_id, fa_id_map, graph_event, keep_gaf):
    """ top-level job to run the minigraph mapping in parallel, returns paf """

    # hang everything on this job, to self-contain workflow
    top_job = Job()
    job.addChild(top_job)

    mg_cores = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"),
                                 "cpu",
                                 typeFn=int,
                                 default=1)
    mg_cores = min(mg_cores, cpu_count())

    # doing the paf conversion is more efficient when done separately for each genome.  we can get away
    # with doing this if the universal filter (which needs to process everything at once) is disabled
    xml_node = findRequiredNode(config.xmlRoot, "graphmap")
    paf_per_genome = not getOptionalAttrib(xml_node, "universalMZFilter",
                                           float)

    # do the mapping
    gaf_id_map = {}
    paf_id_map = {}

    for event, fa_path_fa_id in fa_id_map.items():
        fa_path = fa_path_fa_id[0]
        fa_id = fa_path_fa_id[1]
        minigraph_map_job = top_job.addChildJobFn(
            minigraph_map_one,
            config,
            event,
            fa_path,
            fa_id,
            gfa_id,
            keep_gaf or not paf_per_genome,
            paf_per_genome,
            # todo: estimate RAM
            cores=mg_cores,
            disk=5 * (fa_id.size + gfa_id.size))
        gaf_id_map[event] = minigraph_map_job.rv(0)
        paf_id_map[event] = minigraph_map_job.rv(1)

    # convert to paf
    if paf_per_genome:
        paf_job = top_job.addFollowOnJobFn(merge_pafs, paf_id_map)
    else:
        paf_job = top_job.addFollowOnJobFn(merge_gafs_into_paf, config,
                                           gaf_id_map)

    if not keep_gaf:
        gaf_id_map = None
    else:
        gaf_id_map = paf_job.addFollowOnJobFn(compress_gafs, gaf_id_map).rv()

    return paf_job.rv(), gaf_id_map
Example #11
def minigraph_map_one(job, config, event_name, fa_path, fa_file_id, gfa_file_id, gaf_output, paf_output):
    """ Run minigraph to map a Fasta file to a GFA graph, producing a GAF output """

    work_dir = job.fileStore.getLocalTempDir()
    gfa_path = os.path.join(work_dir, "mg.gfa")
    fa_dir = job.fileStore.getLocalTempDir()
    fa_path = os.path.join(fa_dir, os.path.basename(fa_path))
    gaf_path = os.path.join(work_dir, "{}.gaf".format(event_name))
    
    job.fileStore.readGlobalFile(gfa_file_id, gfa_path)
    job.fileStore.readGlobalFile(fa_file_id, fa_path)

    if fa_path.endswith('.gz'):
        fa_path = fa_path[:-3]
        cactus_call(parameters = ['gzip', '-d', '-c', fa_path + '.gz'], outfile=fa_path)

    # prepend the unique id before mapping so the GAF has cactus-compatible event names
    fa_path = prependUniqueIDs({event_name : fa_path}, work_dir, eventNameAsID=True)[event_name]

    # parse options from the config
    xml_node = findRequiredNode(config.xmlRoot, "graphmap")
    minigraph_opts = getOptionalAttrib(xml_node, "minigraphMapOptions", str, default="")     
    opts_list = minigraph_opts.split()
    # add required options if not present
    if "-S" not in opts_list:
        opts_list += ["-S"]
    if "--write-mz" not in opts_list:
        opts_list += ["--write-mz"]
    if "-t" not in opts_list:
        opts_list += ["-t", str(int(job.cores))]

    cmd = ["minigraph",
           os.path.basename(gfa_path),
           os.path.basename(fa_path),
           "-o", os.path.basename(gaf_path)] + opts_list

    mask_filter = getOptionalAttrib(xml_node, "maskFilter", int, default=-1)
    if mask_filter >= 0:
        cmd[2] = '-'
        cmd = [['cactus_softmask2hardmask', os.path.basename(fa_path), '-m', str(mask_filter)], cmd]
    
    cactus_call(work_dir=work_dir, parameters=cmd)

    paf_id, gaf_id = None, None
    if paf_output:
        # optional gaf->paf step.  we are not piping directly out of minigraph because mzgaf2paf's overlap filter
        # (which is usually on) requires 2 passes so it won't read stdin when it's enabled
        paf_id =  merge_gafs_into_paf(job, config, None, [gaf_path])
    if gaf_output:
        gaf_id = job.fileStore.writeGlobalFile(gaf_path)

    return gaf_id, paf_id
Example #12
def export_vg(job, hal_id, configWrapper, doVG, doGFA, checkpointInfo=None, resource_spec = False):
    """ use hal2vg to convert the HAL to vg format """

    if not resource_spec:
        # caller couldn't figure out the resources from hal_id promise.  do that
        # now and try again
        return job.addChildJobFn(export_vg, hal_id, configWrapper, doVG, doGFA, checkpointInfo,
                                 resource_spec = True,
                                 disk=hal_id.size * 3,
                                 memory=hal_id.size * 10).rv()
        
    work_dir = job.fileStore.getLocalTempDir()
    hal_path = os.path.join(work_dir, "out.hal")
    job.fileStore.readGlobalFile(hal_id, hal_path)
    
    graph_event = getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot, "graphmap"), "assemblyName", default="_MINIGRAPH_")
    hal2vg_opts = getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot, "hal2vg"), "hal2vgOptions", default="")
    if hal2vg_opts:
        hal2vg_opts = hal2vg_opts.split(' ')
    else:
        hal2vg_opts = []
    ignore_events = []
    if not getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot, "hal2vg"), "includeMinigraph", typeFn=bool, default=False):
        ignore_events.append(graph_event)
    if not getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot, "hal2vg"), "includeAncestor", typeFn=bool, default=False):
        ignore_events.append(configWrapper.getDefaultInternalNodePrefix() + '0')
    if ignore_events:
        hal2vg_opts += ['--ignoreGenomes', ','.join(ignore_events)]
    if not getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot, "hal2vg"), "prependGenomeNames", typeFn=bool, default=True):
        hal2vg_opts += ['--onlySequenceNames']

    vg_path = os.path.join(work_dir, "out.vg")
    cmd = ['hal2vg', hal_path] + hal2vg_opts

    cactus_call(parameters=cmd, outfile=vg_path)

    if checkpointInfo:
        write_s3(vg_path, os.path.splitext(checkpointInfo[1])[0] + '.vg', region=checkpointInfo[0])

    gfa_path = os.path.join(work_dir, "out.gfa.gz")
    if doGFA:
        gfa_cmd = [ ['vg', 'view', '-g', vg_path], ['gzip'] ]
        cactus_call(parameters=gfa_cmd, outfile=gfa_path)

        if checkpointInfo:
            write_s3(gfa_path, os.path.splitext(checkpointInfo[1])[0] + '.gfa.gz', region=checkpointInfo[0])

    vg_id = job.fileStore.writeGlobalFile(vg_path) if doVG else None
    gfa_id = job.fileStore.writeGlobalFile(gfa_path) if doGFA else None

    return vg_id, gfa_id
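
The resource_spec re-dispatch at the top of export_vg is a common Toil pattern: a promised FileID only exposes a usable .size once the promise has resolved, so the job reschedules itself as a child with concrete disk and memory requests. A stripped-down sketch of just that pattern (the function name and sizing factors are illustrative, not cactus code):

def sized_job(job, file_id, sized=False):
    if not sized:
        # First pass: file_id is now concrete, so we know its size and can
        # re-schedule ourselves as a child with real resource requirements.
        return job.addChildJobFn(sized_job, file_id, sized=True,
                                 disk=file_id.size * 3,
                                 memory=file_id.size * 10).rv()
    # Second pass: runs with the requested resources.
    local_path = job.fileStore.readGlobalFile(file_id)
    return job.fileStore.writeGlobalFile(local_path)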
Example #13
def stageWorkflow(outputSequenceDir, configFile, inputSequences, toil, restart=False, outputSequences = [], maskAlpha=False, clipAlpha=None,
                  maskPAF=None, inputEventNames=None, brnnCores=None):
    #Replace any constants
    configNode = ET.parse(configFile).getroot()
    if not outputSequences:
        outputSequences = CactusPreprocessor.getOutputSequenceFiles(inputSequences, outputSequenceDir)
    else:
        assert len(outputSequences) == len(inputSequences)

    # Make sure we have the dna-brnn model in the filestore if we need it
    loadDnaBrnnModel(toil, ET.parse(configFile).getroot(), maskAlpha = maskAlpha)
        
    if configNode.find("constants") != None:
        ConfigWrapper(configNode).substituteAllPredefinedConstantsWithLiterals()
    if maskAlpha or clipAlpha:
        ConfigWrapper(configNode).setPreprocessorActive("lastzRepeatMask", False)
        ConfigWrapper(configNode).setPreprocessorActive("dna-brnn", True)
        for node in configNode.findall("preprocessor"):
            if getOptionalAttrib(node, "preprocessJob") == 'dna-brnn':
                if clipAlpha:
                    node.attrib["action"] = "clip"                    
    if brnnCores is not None:
        for node in configNode.findall("preprocessor"):
            if getOptionalAttrib(node, "preprocessJob") == 'dna-brnn':
                node.attrib["cpu"] = str(brnnCores)
        
    if not restart:
        inputSequenceIDs = []
        for seq in inputSequences:
            logger.info("Importing {}".format(seq))
            inputSequenceIDs.append(toil.importFile(makeURL(seq)))
        if maskPAF:
            inputPAFID = toil.importFile(makeURL(maskPAF))
        else:
            inputPAFID = None
        unzip_job = Job.wrapJobFn(unzip_then_pp, configNode, inputSequences, inputSequenceIDs, inputEventNames, maskPAF, inputPAFID)
        outputSequenceIDs = toil.start(unzip_job)
    else:
        outputSequenceIDs = toil.restart()
    for seqID, path in zip(outputSequenceIDs, outputSequences):
        try:
            iter(seqID)
            # dna-brnn will output a couple of bed files.  we scrape those out here
            toil.exportFile(seqID[0], makeURL(path))
            toil.exportFile(seqID[1], makeURL(path) + '.bed')
            toil.exportFile(seqID[2], makeURL(path) + '.mask.bed')
        except:
            toil.exportFile(seqID, makeURL(path))
Example #14
def merge_gafs_into_paf(job, config, gaf_file_id_map, gaf_paths=None):
    """ Merge GAF alignments into a single PAF, applying some filters """

    work_dir = job.fileStore.getLocalTempDir()
    paf_path = os.path.join(work_dir, "mz_alignments.paf")
    if not gaf_paths:
        gaf_paths = []
        for event, gaf_id in gaf_file_id_map.items():
            gaf_paths.append("{}.gaf".format(event))
            job.fileStore.readGlobalFile(gaf_id,
                                         os.path.join(work_dir, gaf_paths[-1]))

    xml_node = findRequiredNode(config.xmlRoot, "graphmap")
    mzgaf2paf_opts = []
    graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot,
                                                     "graphmap"),
                                    "assemblyName",
                                    default="_MINIGRAPH_")
    # this must be consistent with prependUniqueIDs() in cactus_workflow.py
    mzgaf2paf_opts += ['-p', 'id={}|'.format(graph_event)]
    mz_filter = getOptionalAttrib(xml_node, "universalMZFilter", float)
    if mz_filter:
        mzgaf2paf_opts += ['-u', str(mz_filter)]
    if getOptionalAttrib(xml_node,
                         "nodeBasedUniversal",
                         typeFn=bool,
                         default=False):
        mzgaf2paf_opts += ['-n']
    if getOptionalAttrib(xml_node,
                         "strictUniversal",
                         typeFn=bool,
                         default=False):
        mzgaf2paf_opts += ['-i']
    min_mz = getOptionalAttrib(xml_node, "minMZBlockLength", int)
    if min_mz:
        mzgaf2paf_opts += ['-m', str(min_mz)]
    mapq = getOptionalAttrib(xml_node, "minMAPQ", int)
    if mapq:
        mzgaf2paf_opts += ['-q', str(mapq)]
    gaf_block = getOptionalAttrib(xml_node, "minGAFBlockLength", int)
    if gaf_block:
        mzgaf2paf_opts += ['-b', str(gaf_block)]
    gaf_node = getOptionalAttrib(xml_node, "minGAFNodeLength", int)
    if gaf_node:
        mzgaf2paf_opts += ['-s', str(gaf_node)]
    overlap_filter_len = getOptionalAttrib(xml_node,
                                           "minGAFQueryOverlapFilter", int)
    if overlap_filter_len:
        mzgaf2paf_opts += ['-o', str(overlap_filter_len)]

    cactus_call(outfile=paf_path,
                parameters=["mzgaf2paf"] + gaf_paths + mzgaf2paf_opts)

    return job.fileStore.writeGlobalFile(paf_path)
Example #15
def split_minimap_fallback(job, options, config, seqIDMap, output_id_map):
    """ take the output table from gather_fas, pull out the ambiguous sequences, remap them to the reference, and 
    add them to the events where possible"""

    # can't do anything without a reference
    if not options.reference:
        logger.info("Skipping minimap2 fallback as --reference was not specified")
        return None, None
    # todo: also skip if no ambiguous sequences
    
    ref_path, ref_id = seqIDMap[options.reference]
    mm_mem = ref_id.size * 5
    if seqIDMap[options.reference][0].endswith('.gz'):
        mm_mem *= 4
    mm_index_job = job.addChildJobFn(minimap_index, ref_path, ref_id, disk=ref_id.size * 5, memory=mm_mem)
    mm_map_root_job = Job()
    mm_index_job.addFollowOn(mm_map_root_job)
    
    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "ambiguousName", default="_AMBIGUOUS_")

    if amb_name not in output_id_map:
        logger.info("Skipping minimap2 fallback as no ambiguous sequences found")
        return None, None

    # map every ambiguous sequence against the reference in parallel
    paf_ids = []
    ambiguous_seq_id_map = {}
    for event, fa_id in output_id_map[amb_name]['fa'].items():
        paf_job = mm_map_root_job.addChildJobFn(minimap_map, mm_index_job.rv(), event, fa_id, seqIDMap[event][0],
                                                disk=ref_id.size * 3, memory=mm_mem)
        paf_ids.append(paf_job.rv())
        ambiguous_seq_id_map[event] = (seqIDMap[event][0], fa_id)

    return paf_ids, ambiguous_seq_id_map
Example #16
def loadDnaBrnnModel(toil, configNode, maskAlpha=False):
    """ store the model in a toil file id so it can be used in any workflow """
    for prepXml in configNode.findall("preprocessor"):
        if prepXml.attrib["preprocessJob"] == "dna-brnn":
            if maskAlpha or getOptionalAttrib(
                    prepXml, "active", typeFn=bool, default=False):
                dnabrnnOpts = getOptionalAttrib(prepXml,
                                                "dna-brnnOpts",
                                                default="")
                dnabrnnOptsList = dnabrnnOpts.split()
                if '-i' in dnabrnnOptsList:
                    # the model path is the token following '-i'
                    model_path = dnabrnnOptsList[dnabrnnOptsList.index('-i') + 1]
                else:
                    model_path = os.path.join(cactusRootPath(),
                                              'attcc-alpha.knm')
                os.environ["CACTUS_DNA_BRNN_MODEL_ID"] = toil.importFile(
                    makeURL(model_path))
Example #17
    def substituteAllDivergenceContolledParametersWithLiterals(
            self, maxDivergence):
        constants = findRequiredNode(self.xmlRoot, "constants")
        divergences = constants.find("divergences")
        messages = []
        if divergences != None:
            useDefaultDivergences = getOptionalAttrib(divergences,
                                                      attribName="useDefault",
                                                      typeFn=bool,
                                                      default=False)

            def replaceAllDivergenceParameters(node):
                for child in node:
                    if child.tag == "divergence":
                        attribName = child.attrib["argName"]
                        arg = child.attrib["default"]
                        divergence = sys.maxsize
                        if not useDefaultDivergences:
                            for i in list(child.attrib.keys()):
                                if i in list(divergences.attrib.keys()):
                                    j = float(divergences.attrib[i])
                                    if j < divergence and j >= maxDivergence:
                                        arg = child.attrib[i]
                                        divergence = j
                        messages.append(
                            "Made argument %s=%s in tag %s with divergence threshold of %s for longest path of %s (useDefaultDivergences=%s)"
                            % (attribName, arg, node.tag, divergence,
                               maxDivergence, useDefaultDivergences))
                        node.attrib[attribName] = arg
                    else:
                        replaceAllDivergenceParameters(child)

            replaceAllDivergenceParameters(self.xmlRoot)
        return messages
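
To make the traversal above concrete, the config fragment it expects looks roughly like this: a <divergences> element mapping named thresholds to numeric cut-offs, plus <divergence> children elsewhere in the tree carrying one candidate value per threshold name and a default. The tag and threshold names below are illustrative only, not taken from the real cactus config:

import xml.etree.ElementTree as ET

fragment = ET.fromstring("""
<cactusWorkflowConfig>
  <constants>
    <divergences useDefault="0" one="0.1" five="0.35"/>
  </constants>
  <caf>
    <divergence argName="minimumBlockDegree" default="2" one="3" five="4"/>
  </caf>
</cactusWorkflowConfig>
""")
# With maxDivergence <= 0.1, the logic above picks the smallest threshold that is
# still >= maxDivergence ("one" here) and would set <caf minimumBlockDegree="3">.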
Example #18
 def substituteAllDivergenceContolledParametersWithLiterals(self, maxDivergence):
     constants = findRequiredNode(self.xmlRoot, "constants")
     divergences = constants.find("divergences")
     messages = []
     if divergences != None:
         useDefaultDivergences = getOptionalAttrib(divergences, attribName="useDefault", typeFn=bool, default=False)
         def replaceAllDivergenceParameters(node):
             for child in node:
                 if child.tag == "divergence":
                     attribName = child.attrib["argName"]
                     arg = child.attrib["default"]
                     divergence = sys.maxint
                     if not useDefaultDivergences:
                         for i in child.attrib.keys():
                             if i in divergences.attrib.keys():
                                 j = float(divergences.attrib[i])
                                 if j < divergence and j >= maxDivergence:
                                     arg = child.attrib[i]
                                     divergence = j
                     messages.append("Made argument %s=%s in tag %s with divergence threshold of %s for longest path of %s (useDefaultDivergences=%s)" % (attribName, arg, node.tag, divergence, maxDivergence, useDefaultDivergences))
                     node.attrib[attribName] = arg
                 else:
                     replaceAllDivergenceParameters(child)
         replaceAllDivergenceParameters(self.xmlRoot)
     return messages
Example #19
 def setPreprocessorActive(self, preprocessorJob, state):
     """Set active flag of preprocessor """
     assert state in (True, False)
     set_count = 0
     for node in self.xmlRoot.findall("preprocessor"):
         if getOptionalAttrib(node, "preprocessJob") == preprocessorJob:
             node.attrib["active"] = "1" if state else "0"
             set_count += 1
     return set_count
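
getPreprocessorActive (Example #4) and setPreprocessorActive above read and write the same string-valued "active" attribute. A self-contained illustration of that convention on a bare ElementTree (the root tag is illustrative, and the surrounding ConfigWrapper class is omitted):

import xml.etree.ElementTree as ET

config_xml = ET.fromstring(
    '<cactusWorkflowConfig>'
    '<preprocessor preprocessJob="lastzRepeatMask" active="1"/>'
    '<preprocessor preprocessJob="dna-brnn" active="0"/>'
    '</cactusWorkflowConfig>')

# Toggle the flags the same way stageWorkflow does in Example #13:
# disable lastz repeat masking, enable dna-brnn.
for node in config_xml.findall("preprocessor"):
    if node.attrib.get("preprocessJob") == "lastzRepeatMask":
        node.attrib["active"] = "0"
    elif node.attrib.get("preprocessJob") == "dna-brnn":
        node.attrib["active"] = "1"

assert config_xml.find("preprocessor[@preprocessJob='dna-brnn']").attrib["active"] == "1"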
Example #20
def combine_splits(job, config, seq_id_map, original_id_map, remap_id_map):
    """ combine the output of two runs of gather_fas.  the first is the contigs determined by minigraph,
    the second from remapping the ambiguous contigs with minimap2 """

    root_job = Job()
    job.addChild(root_job)

    # no ambiguous remappings, nothing to do
    if not remap_id_map or len(remap_id_map) == 0:
        return original_id_map

    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot,
                                                  "graphmap_split"),
                                 "ambiguousName",
                                 default="_AMBIGUOUS_")
    graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot,
                                                     "graphmap"),
                                    "assemblyName",
                                    default="_MINIGRAPH_")

    # note: we're not handling the case where 100% of a given reference's contigs are ambiguous
    for ref_contig in original_id_map:
        if ref_contig == amb_name:
            # for ambiguous sequence, we overwrite and don't combine
            if ref_contig in remap_id_map:
                original_id_map[ref_contig] = remap_id_map[ref_contig]
            else:
                original_id_map[ref_contig] = None
        elif ref_contig in remap_id_map:
            total_size = 0
            for event in original_id_map[ref_contig]['fa']:
                total_size += original_id_map[ref_contig]['fa'][event].size
                if event in remap_id_map[ref_contig]['fa']:
                    total_size += remap_id_map[ref_contig]['fa'][event].size
            original_id_map[ref_contig] = root_job.addChildJobFn(
                combine_ref_contig_splits,
                original_id_map[ref_contig],
                remap_id_map[ref_contig],
                disk=total_size * 4).rv()

    return root_job.addFollowOnJobFn(combine_paf_splits, seq_id_map,
                                     original_id_map, remap_id_map, amb_name,
                                     graph_event).rv()
Example #21
    def run(self):
        # Parse the "preprocessor" config xml element
        assert self.iteration < len(self.prepXmlElems)

        prepNode = self.prepXmlElems[self.iteration]
        prepOptions = PreprocessorOptions(
            int(prepNode.get("chunkSize", default="-1")),
            prepNode.attrib["preprocessorString"],
            int(self.memory),
            int(self.cpu),
            bool(int(prepNode.get("check", default="0"))),
            getOptionalAttrib(prepNode, "proportionToSample", typeFn=float, default=1.0),
            getOptionalAttrib(prepNode, "unmask", typeFn=bool, default=False),
        )

        # output to temporary directory unless we are on the last iteration
        lastIteration = self.iteration == len(self.prepXmlElems) - 1
        if lastIteration == False:
            outSeq = os.path.join(self.getGlobalTempDir(), str(self.iteration))
        else:
            outSeq = self.globalOutSequence

        if prepOptions.unmask:
            unmaskedInputFile = getTempFile(rootDir=self.getGlobalTempDir())
            unmaskFasta(self.inSequence, unmaskedInputFile)
            self.inSequence = unmaskedInputFile

        if prepOptions.chunkSize <= 0:  # In this first case we don't need to break up the sequence
            self.addChildTarget(PreprocessChunk(prepOptions, [self.inSequence], 1.0, self.inSequence, outSeq))
        else:
            self.addChildTarget(PreprocessSequence(prepOptions, self.inSequence, outSeq))

        if lastIteration == False:
            self.setFollowOnTarget(
                BatchPreprocessor(self.prepXmlElems, outSeq, self.globalOutSequence, self.iteration + 1)
            )
        else:
            self.setFollowOnTarget(BatchPreprocessorEnd(self.globalOutSequence))
Example #22
    def run(self, fileStore):
        outputSequenceIDs = []
        if self.eventNames:
            assert len(self.eventNames) == len(self.inputSequenceIDs)
            configs = []
            for eventName in self.eventNames:
                conf = copy.deepcopy(self.configNode)
                for node in conf.findall("preprocessor"):
                    if getOptionalAttrib(node, "preprocessJob") == 'dna-brnn':
                        node.attrib["eventName"] = eventName
                # if we don't make different configs, the same reference somehow gets passed to multiple children below
                configs.append(conf)

        for i, inputSequenceID in enumerate(self.inputSequenceIDs):
            confNode = configs[i] if self.eventNames else self.configNode
            outputSequenceIDs.append(self.addChild(CactusPreprocessor2(inputSequenceID, confNode)).rv())
        return outputSequenceIDs
Example #23
def preprocess_input_sequences(job,
                               configWrapper,
                               project,
                               cactusWorkflowArguments,
                               pafMaskFilter=None,
                               referenceEvent=None):
    """ update the workflow arguments in place with unzipped version of any input fastas whose paths 
    end in .gz, 
    if there's a pafMaskFilter, softmasked regions are extracted from each sequence into a bed.
    Note that the beds will need unique ids prepended just like the fastas...
    """
    head_job = Job()
    job.addChild(head_job)
    graph_event = getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot,
                                                     "graphmap"),
                                    "assemblyName",
                                    default="_MINIGRAPH_")
    exp = cactusWorkflowArguments.experimentWrapper
    ingroupsAndOriginalIDs = [(g, exp.getSequenceID(g))
                              for g in exp.getGenomesWithSequence()
                              if g not in exp.getOutgroupGenomes()]
    mask_bed_ids = {}
    events = []
    updated_seq_ids = []
    for g, seqID in ingroupsAndOriginalIDs:
        zipped = project.inputSequenceMap[g].endswith('.gz')
        do_filter = pafMaskFilter and g not in [graph_event, referenceEvent]
        if zipped or do_filter:
            prepend_id_job = head_job.addChildJobFn(
                preprocess_input_sequence, g, seqID,
                project.inputSequenceMap[g], pafMaskFilter)
            updated_seq_id, mask_bed_id = prepend_id_job.rv(
                0), prepend_id_job.rv(1)
            if zipped:
                events.append(g)
                updated_seq_ids.append(updated_seq_id)
            if do_filter:
                mask_bed_ids[g] = mask_bed_id

    return head_job.addFollowOnJobFn(
        resolve_id_promises, events, updated_seq_ids,
        cactusWorkflowArguments).rv(), mask_bed_ids
Example #24
def minigraph_map_one(job, config, event_name, fa_file_id, gfa_file_id,
                      ignore_softmasked):
    """ Run minigraph to map a Fasta file to a GFA graph, producing a GAF output """

    work_dir = job.fileStore.getLocalTempDir()
    gfa_path = os.path.join(work_dir, "minigraph.gfa")
    fa_path = os.path.join(work_dir, "{}.fa".format(event_name))
    gaf_path = os.path.join(work_dir, "{}.gaf".format(event_name))

    job.fileStore.readGlobalFile(gfa_file_id, gfa_path)
    job.fileStore.readGlobalFile(fa_file_id, fa_path)

    # parse options from the config
    xml_node = findRequiredNode(config.xmlRoot, "graphmap")
    minigraph_opts = getOptionalAttrib(xml_node,
                                       "minigraphMapOptions",
                                       str,
                                       default="")
    opts_list = minigraph_opts.split()
    # add required options if not present
    if "-S" not in opts_list:
        opts_list += ["-S"]
    if "--write-mz" not in opts_list:
        opts_list += ["--write-mz"]
    if "-t" not in opts_list:
        opts_list += ["-t", str(int(job.cores))]

    cmd = [
        "minigraph",
        os.path.basename(gfa_path),
        os.path.basename(fa_path), "-o",
        os.path.basename(gaf_path)
    ] + opts_list

    if ignore_softmasked:
        cmd[2] = '-'
        cmd = [['cactus_softmask2hardmask', os.path.basename(fa_path)], cmd]

    # todo: pipe into gzip directly as these files can be huge!!! (requires gzip support be added to mzgaf2paf)
    cactus_call(work_dir=work_dir, parameters=cmd)

    return job.fileStore.writeGlobalFile(gaf_path)
Example #25
def split_gfa(job, config, gfa_id, paf_id, ref_contigs, other_contig,
              reference_event):
    """ Use rgfa-split to divide a GFA and PAF into chromosomes.  The GFA must be in minigraph RGFA output using
    the desired reference. """

    work_dir = job.fileStore.getLocalTempDir()
    gfa_path = os.path.join(work_dir, "mg.gfa")
    paf_path = os.path.join(work_dir, "mg.paf")
    out_prefix = os.path.join(work_dir, "split_")

    job.fileStore.readGlobalFile(gfa_id, gfa_path)
    job.fileStore.readGlobalFile(paf_id, paf_path)

    # get the minigraph "virtual" assembly name
    graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot,
                                                     "graphmap"),
                                    "assemblyName",
                                    default="_MINIGRAPH_")
    # and look up its unique id prefix.  this will be needed to pick its contigs out of the list
    mg_id = graph_event

    # get the specificity filters
    query_coverage = getOptionalAttrib(findRequiredNode(
        config.xmlRoot, "graphmap_split"),
                                       "minQueryCoverage",
                                       default="0")
    query_uniqueness = getOptionalAttrib(findRequiredNode(
        config.xmlRoot, "graphmap_split"),
                                         "minQueryUniqueness",
                                         default="0")
    amb_event = getOptionalAttrib(findRequiredNode(config.xmlRoot,
                                                   "graphmap_split"),
                                  "ambiguousName",
                                  default="_AMBIGUOUS_")

    cmd = [
        'rgfa-split', '-i', 'id={}|'.format(mg_id), '-G', '-g', gfa_path, '-p',
        paf_path, '-b', out_prefix, '-n', query_coverage, '-Q',
        query_uniqueness, '-a', amb_event
    ]
    if other_contig:
        cmd += ['-o', other_contig]
    if reference_event:
        cmd += ['-r', 'id={}|'.format(reference_event)]

    for contig in ref_contigs:
        cmd += ['-c', contig]

    cactus_call(parameters=cmd, work_dir=work_dir)

    output_id_map = {}
    for out_name in os.listdir(work_dir):
        file_name, ext = os.path.splitext(out_name)
        if file_name.startswith(os.path.basename(out_prefix)) and ext in [
                ".gfa", ".paf", ".fa_contigs"
        ]:
            name = file_name[len(os.path.basename(out_prefix)):]
            if name not in output_id_map:
                output_id_map[name] = {}
            output_id_map[name][ext[1:]] = job.fileStore.writeGlobalFile(
                os.path.join(work_dir, out_name))

    return output_id_map
Example #26
def runCactusGraphMap(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            paf_id, gfa_fa_id = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides.  this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile).  one way to fix would be to add support
            # for s3 paths and force wdl to use it.  a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(
                    prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames,
                                          options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir,
                                            'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #load cactus config
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            config.substituteAllPredefinedConstantsWithLiterals()

            # get the minigraph "virtual" assembly name
            graph_event = getOptionalAttrib(findRequiredNode(
                configNode, "refgraph"),
                                            "assemblyName",
                                            default="__MINIGRAPH_SEQUENCES__")

            # load the seqfile
            seqFile = SeqFile(options.seqFile)

            logger.info("Genomes for graphmap, {}".format(seqFile.pathMap))

            if not options.outputFasta and graph_event not in seqFile.pathMap:
                raise RuntimeError(
                    "{} assembly not found in seqfile so it must be specified with --outputFasta"
                    .format(graph_event))

            #import the graph
            gfa_id = toil.importFile(makeURL(options.minigraphGFA))

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            seqIDMap = {}
            for genome, seq in seqFile.pathMap.items():
                if genome != graph_event:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    seqIDMap[genome] = toil.importFile(seq)

            # run the workflow
            paf_id, gfa_fa_id = toil.start(
                Job.wrapJobFn(minigraph_workflow, options, config, seqIDMap,
                              gfa_id, graph_event))

        #export the paf
        toil.exportFile(paf_id, makeURL(options.outputPAF))
        if gfa_fa_id:
            toil.exportFile(gfa_fa_id, makeURL(options.outputFasta))

        # update the input seqfile (in place!)
        add_genome_to_seqfile(options.seqFile, makeURL(options.outputFasta),
                              graph_event)
Example #27
    def run(self, fileStore):
        # Parse the "preprocessor" config xml element
        assert self.iteration < len(self.prepXmlElems)

        lastIteration = self.iteration == len(self.prepXmlElems) - 1

        prepNode = self.prepXmlElems[self.iteration]
        if getOptionalAttrib(prepNode, "active", typeFn=bool, default=True):
            prepOptions = PreprocessorOptions(
                chunkSize=int(prepNode.get("chunkSize", default="-1")),
                preprocessJob=prepNode.attrib["preprocessJob"],
                memory=int(prepNode.get("memory", default=0)),
                cpu=int(prepNode.get("cpu", default=1)),
                check=bool(int(prepNode.get("check", default="0"))),
                proportionToSample=getOptionalAttrib(prepNode,
                                                     "proportionToSample",
                                                     typeFn=float,
                                                     default=1.0),
                unmask=getOptionalAttrib(prepNode,
                                         "unmask",
                                         typeFn=bool,
                                         default=False),
                lastzOptions=getOptionalAttrib(prepNode,
                                               "lastzOpts",
                                               default=""),
                minPeriod=getOptionalAttrib(prepNode,
                                            "minPeriod",
                                            typeFn=int,
                                            default=0),
                checkAssemblyHub=getOptionalAttrib(prepNode,
                                                   "checkAssemblyHub",
                                                   typeFn=bool,
                                                   default=False),
                gpuLastz=getOptionalAttrib(prepNode,
                                           "gpuLastz",
                                           typeFn=bool,
                                           default=False),
                dnabrnnOpts=getOptionalAttrib(prepNode,
                                              "dna-brnnOpts",
                                              default=""),
                dnabrnnLength=getOptionalAttrib(prepNode,
                                                "minLength",
                                                typeFn=int,
                                                default=1),
                dnabrnnMerge=getOptionalAttrib(prepNode,
                                               "mergeLength",
                                               typeFn=int,
                                               default=0),
                dnabrnnAction=getOptionalAttrib(prepNode,
                                                "action",
                                                typeFn=str,
                                                default="softmask"),
                dnabrnnInputBedID=getOptionalAttrib(prepNode,
                                                    "inputBedID",
                                                    typeFn=str,
                                                    default=None),
                dnabrnnEventName=getOptionalAttrib(prepNode,
                                                   "eventName",
                                                   typeFn=str,
                                                   default=None))

            if prepOptions.unmask:
                inSequence = fileStore.readGlobalFile(self.inSequenceID)
                unmaskedInputFile = fileStore.getLocalTempFile()
                unmaskFasta(inSequence, unmaskedInputFile)
                self.inSequenceID = fileStore.writeGlobalFile(
                    unmaskedInputFile)

            outSeqID = self.addChild(
                PreprocessSequence(prepOptions, self.inSequenceID)).rv()
        else:
            logger.info("Skipping inactive preprocessor {}".format(
                prepNode.attrib["preprocessJob"]))
            outSeqID = self.inSequenceID

        if lastIteration == False:
            return self.addFollowOn(
                BatchPreprocessor(self.prepXmlElems, outSeqID,
                                  self.iteration + 1)).rv()
        else:
            return outSeqID
Example #28
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("inSeqFile",
                        type=str,
                        nargs='?',
                        default=None,
                        help="Input Seq file")
    parser.add_argument(
        "outSeqFile",
        type=str,
        nargs='?',
        default=None,
        help="Output Seq file (ex generated with cactus-prepare)")
    parser.add_argument("--configFile",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))
    parser.add_argument(
        "--inputNames",
        nargs='*',
        help=
        'input genome names (not paths) to preprocess (all leaves from Input Seq file if none specified)'
    )
    parser.add_argument(
        "--inPaths",
        nargs='*',
        help=
        'Space-separated list of input fasta paths (to be used in place of --inSeqFile)'
    )
    parser.add_argument(
        "--outPaths",
        nargs='*',
        help=
        'Space-separated list of output fasta paths (one for each inPath, used in place of --outSeqFile)'
    )
    parser.add_argument("--maskAlpha",
                        action='store_true',
                        help='Use dna-brnn instead of lastz for repeatmasking')
    parser.add_argument(
        "--clipAlpha",
        action='store_true',
        help=
        'use dna-brnn instead of lastz for repeatmasking.  Also, clip sequence using given minimum length instead of softmasking'
    )
    parser.add_argument(
        "--ignore",
        nargs='*',
        help='Space-separate list of genomes from inSeqFile to ignore',
        default=[])
    parser.add_argument(
        "--maskPAF",
        type=str,
        help=
        'Incorporate coverage gaps from given PAF when masking.  Only implemented for dna-brnn masking'
    )
    parser.add_argument(
        "--brnnCores",
        type=int,
        help=
        'Specify number of cores for each dna-brnn job (overriding default value from the config)'
    )
    parser.add_argument(
        "--latest",
        dest="latest",
        action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument(
        "--containerImage",
        dest="containerImage",
        default=None,
        help="Use the specified pre-built container image "
        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # we have two modes: operate directly on paths or rely on the seqfiles.  they cannot be mixed
    if options.inSeqFile or options.outSeqFile:
        if not options.inSeqFile or not options.outSeqFile or options.inPaths or options.outPaths:
            raise RuntimeError(
                '--inSeqFile must be used in conjunction with --outSeqFile and not with --inPaths nor --outPaths'
            )
    elif options.inPaths or options.outPaths:
        if not options.inPaths or not options.outPaths or options.inSeqFile or options.outSeqFile:
            raise RuntimeError(
                '--inPaths must be used in conjunction with --outPaths and not with --inSeqFile nor --outSeqFile'
            )
        if len(options.inPaths) != len(options.outPaths):
            raise RuntimeError(
                '--inPaths and --outPaths must have the same number of arguments'
            )
    else:
        raise RuntimeError(
            '--inSeqFile/--outSeqFile/--inputNames or --inPaths/--outPaths required to specify input'
        )
    if options.maskAlpha and options.clipAlpha:
        raise RuntimeError(
            '--maskAlpha and --clipAlpha cannot be used together')
    if options.clipAlpha:
        options.maskAlpha = True
    if options.maskPAF and not options.inputNames and not options.inSeqFile:
        raise RuntimeError(
            '--maskPAF requires event names specified either with an input seqfile or with --inputNames'
        )
    if options.ignore and not options.clipAlpha:
        raise RuntimeError('--ignore can only be used with --clipAlpha')

    inSeqPaths = []
    outSeqPaths = []
    inNames = options.inputNames
    eventNames = []

    #load cactus config
    configNode = ET.parse(options.configFile).getroot()
    #we never want to preprocess minigraph sequences
    graph_event = getOptionalAttrib(findRequiredNode(configNode, "graphmap"),
                                    "assemblyName",
                                    default="_MINIGRAPH_")
    options.ignore.append(graph_event)

    # mine the paths out of the seqfiles
    if options.inSeqFile:
        inSeqFile = SeqFile(options.inSeqFile)
        outSeqFile = SeqFile(options.outSeqFile)

        if not inNames:
            inNames = [
                inSeqFile.tree.getName(node)
                for node in inSeqFile.tree.getLeaves()
            ]

        for inName in inNames:
            if inName in options.ignore:
                # "convenience" functionality: we let the --ignore option update the output seqfile
                # to reflect the fact that we're not touching the original input
                outSeqFile.pathMap[inName] = inSeqFile.pathMap[inName]
                continue
            if inName not in inSeqFile.pathMap or inName not in outSeqFile.pathMap:
                raise RuntimeError(
                    '{} not present in both the input and output seqfiles'.format(
                        inName))
            inPath = inSeqFile.pathMap[inName]
            outPath = outSeqFile.pathMap[inName]
            if os.path.isdir(inPath):
                # create the output directory if it doesn't already exist
                os.makedirs(outPath, exist_ok=True)
                assert os.path.isdir(inPath) == os.path.isdir(outPath)
                inSeqPaths += [
                    os.path.join(inPath, seqPath)
                    for seqPath in os.listdir(inPath)
                ]
                outSeqPaths += [
                    os.path.join(outPath, seqPath)
                    for seqPath in os.listdir(inPath)
                ]
            else:
                inSeqPaths += [inPath]
                outSeqPaths += [outPath]
            eventNames.append(inName)

        if options.ignore:
            # see comment above
            with open(options.outSeqFile, 'w') as outSF:
                outSF.write(str(outSeqFile))

    # we got path names directly from the command line
    else:
        inSeqPaths = options.inPaths
        outSeqPaths = options.outPaths

    with Toil(options) as toil:
        stageWorkflow(outputSequenceDir=None,
                      configFile=options.configFile,
                      inputSequences=inSeqPaths,
                      toil=toil,
                      restart=options.restart,
                      outputSequences=outSeqPaths,
                      maskAlpha=options.maskAlpha,
                      clipAlpha=options.clipAlpha,
                      maskPAF=options.maskPAF,
                      inputEventNames=eventNames,
                      brnnCores=options.brnnCores)
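
A minimal sketch of the --ignore convenience path handled above, using made-up genome names and paths (the real pathMap values come from the SeqFile objects): an ignored genome is not preprocessed, and its entry in the rewritten output seqfile simply points back at the untouched input path.

# Hypothetical path maps mirroring inSeqFile.pathMap / outSeqFile.pathMap above.
in_path_map = {'human': '/data/in/human.fa', '_MINIGRAPH_': '/data/in/minigraph.fa'}
out_path_map = {'human': '/data/out/human.pp.fa', '_MINIGRAPH_': '/data/out/minigraph.pp.fa'}
ignore = ['_MINIGRAPH_']

for name in in_path_map:
    if name in ignore:
        # not preprocessed: keep the original, untouched input path in the output seqfile
        out_path_map[name] = in_path_map[name]

assert out_path_map['_MINIGRAPH_'] == '/data/in/minigraph.fa'
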
Exemplo n.º 29
0
    def run(self):
        logger.info("Progressive Up: " + self.event)

        # open up the experiment
        # note that we copy the path into the options here
        self.options.experimentFile = self.project.expMap[self.event]
        expXml = ET.parse(self.options.experimentFile).getroot()
        experiment = ExperimentWrapper(expXml)
        configXml = ET.parse(experiment.getConfigPath()).getroot()
        configWrapper = ConfigWrapper(configXml)

        # need at least 3 processes for every event when using ktserver:
        # 1 proc to run jobs, 1 proc to run the server, 1 proc to run the secondary server
        if experiment.getDbType() == "kyoto_tycoon":            
            maxParallel = min(len(self.project.expMap),
                             configWrapper.getMaxParallelSubtrees()) 
            if self.options.batchSystem == "singleMachine":
                if int(self.options.maxThreads) < maxParallel * 3:
                    raise RuntimeError("At least %d threads are required (only %d were specified) to handle up to %d events using kyoto tycoon. Either increase the number of threads using the --maxThreads option or decrease the number of parallel jobs (currently %d) by adjusting max_parallel_subtrees in the config file" % (maxParallel * 3, self.options.maxThreads, maxParallel, configWrapper.getMaxParallelSubtrees()))
            else:
                if int(self.options.maxCpus) < maxParallel * 3:
                    raise RuntimeError("At least %d concurrent cpus are required to handle up to %d events using kyoto tycoon. Either increase the number of cpus using the --maxCpus option or decrease the number of parallel jobs (currently %d) by adjusting max_parallel_subtrees in the config file" % (maxParallel * 3, maxParallel, configWrapper.getMaxParallelSubtrees()))
                    
        # take union of command line options and config options for hal and reference
        if self.options.buildReference == False:
            refNode = findRequiredNode(configXml, "reference")
            self.options.buildReference = getOptionalAttrib(refNode, "buildReference", bool, False)
        halNode = findRequiredNode(configXml, "hal")
        if self.options.buildHal == False:
            self.options.buildHal = getOptionalAttrib(halNode, "buildHal", bool, False)
        if self.options.buildFasta == False:
            self.options.buildFasta = getOptionalAttrib(halNode, "buildFasta", bool, False)

        # get parameters that cactus_workflow stuff wants
        workFlowArgs = CactusWorkflowArguments(self.options)
        # copy over the options so we don't trail them around
        workFlowArgs.buildReference = self.options.buildReference
        workFlowArgs.buildHal = self.options.buildHal
        workFlowArgs.buildFasta = self.options.buildFasta
        workFlowArgs.overwrite = self.options.overwrite
        workFlowArgs.globalLeafEventSet = self.options.globalLeafEventSet
        
        experiment = ExperimentWrapper(workFlowArgs.experimentNode)

        donePath = os.path.join(os.path.dirname(workFlowArgs.experimentFile), "DONE")
        doneDone = os.path.isfile(donePath)
        refDone = not workFlowArgs.buildReference or os.path.isfile(experiment.getReferencePath())
        halDone = not workFlowArgs.buildHal or (os.path.isfile(experiment.getHALFastaPath()) and
                                                os.path.isfile(experiment.getHALPath()))
                                                               
        if not workFlowArgs.overwrite and doneDone and refDone and halDone:
            self.logToMaster("Skipping %s because it is already done and overwrite is disabled" %
                             self.event)
        else:
            system("rm -f %s" % donePath)
            # delete the database
            # if overwrite is specified (or the reference is not present)
            dbPath = os.path.join(experiment.getDbDir(), 
                                  experiment.getDbName())
            seqPath = os.path.join(experiment.getDbDir(), "sequences")
            system("rm -f %s* %s %s" % (dbPath, seqPath, 
                                        experiment.getReferencePath()))

            if workFlowArgs.configWrapper.getDoTrimStrategy() and workFlowArgs.outgroupEventNames is not None:
                # Use the trimming strategy to blast ingroups vs outgroups.
                self.addChildTarget(CactusTrimmingBlastPhase(cactusWorkflowArguments=workFlowArgs, phaseName="trimBlast"))
            else:
                self.addChildTarget(CactusSetupPhase(cactusWorkflowArguments=workFlowArgs,
                                                     phaseName="setup"))
        logger.info("Going to create alignments and define the cactus tree")

        self.setFollowOnTarget(FinishUp(workFlowArgs, self.project))
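
A rough worked example of the kyoto tycoon resource check above; the numbers are illustrative, and max_parallel_subtrees would normally come from the config file.

# Illustrative only: each event needs 3 processes when using ktserver
# (1 to run jobs, 1 to run the server, 1 to run the secondary server).
max_parallel_subtrees = 4            # hypothetical config value
procs_per_event = 3
required = max_parallel_subtrees * procs_per_event
print(required)                      # 12 -> --maxThreads (or --maxCpus) must be at least this
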
Exemplo n.º 30
0
def split_gfa(job, config, gfa_id, paf_ids, ref_contigs, other_contig, reference_event, mask_bed_id):
    """ Use rgfa-split to divide a GFA and PAF into chromosomes.  The GFA must be in minigraph RGFA output using
    the desired reference. """

    if not paf_ids:
        # we can bypass when, e.g., doing the second pass on ambiguous sequences but none are present
        return [None, None]

    if not gfa_id and not getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "remap", typeFn=bool, default=False):
        # also bypass if remapping is off in the config (we know it's the second pass because gfa_id is None)
        return [None, None]
    
    work_dir = job.fileStore.getLocalTempDir()
    gfa_path = os.path.join(work_dir, "mg.gfa")
    paf_path = os.path.join(work_dir, "mg.paf")
    out_prefix = os.path.join(work_dir, "split_")
    bed_path = os.path.join(work_dir, "mask.bed")
    log_path = os.path.join(work_dir, "split.log")
    if mask_bed_id:
        job.fileStore.readGlobalFile(mask_bed_id, bed_path)

    if gfa_id:
        job.fileStore.readGlobalFile(gfa_id, gfa_path)
        
    paf_paths = []
    for i, paf_id in enumerate(paf_ids):
        paf_paths.append('{}.{}'.format(paf_path, i) if len(paf_ids) > 1 else paf_path)
        job.fileStore.readGlobalFile(paf_id, paf_paths[-1])
    if len(paf_paths) > 1:
        catFiles(paf_paths, paf_path)
    
    # get the minigraph "virtual" assembly name
    graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "assemblyName", default="_MINIGRAPH_")
    # and look up its unique id prefix.  this will be needed to pick its contigs out of the list
    mg_id = graph_event

    # get the specificity filters
    query_coverage = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQueryCoverage", default="0")
    small_query_coverage = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQuerySmallCoverage", default="0")
    small_coverage_threshold = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQuerySmallThreshold", default="0")
    query_uniqueness = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQueryUniqueness", default="0")
    max_gap = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "maxGap", default="0")
    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "ambiguousName", default="_AMBIGUOUS_")

    cmd = ['rgfa-split',
           '-p', paf_path,
           '-b', out_prefix,
           '-n', query_coverage,
           '-N', small_query_coverage,
           '-T', small_coverage_threshold,
           '-Q', query_uniqueness,
           '-P', max_gap,
           '-a', amb_name,
           '-L', log_path]
    if gfa_id:
        cmd += ['-g', gfa_path, '-G']
    if other_contig:
        cmd += ['-o', other_contig]
    if reference_event:
        cmd += ['-r', 'id={}|'.format(reference_event)]
    if mask_bed_id:
        cmd += ['-B', bed_path]
    min_mapq = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minMAPQ")
    if min_mapq:
        cmd += ['-A', min_mapq]
    # optional stuff added to second pass:
    if not gfa_id:
        remap_opts = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "remapSplitOptions", default=None)
        if remap_opts:
            cmd += remap_opts.split(' ')        
    for contig in ref_contigs:
        cmd += ['-c', contig]

    cactus_call(parameters=cmd, work_dir=work_dir)

    output_id_map = {}
    for out_name in os.listdir(work_dir):
        file_name, ext = os.path.splitext(out_name)
        if file_name.startswith(os.path.basename(out_prefix)) and ext in [".gfa", ".paf", ".fa_contigs"] and \
           os.path.isfile(os.path.join(work_dir, file_name + ".fa_contigs")):
            name = file_name[len(os.path.basename(out_prefix)):]
            if name not in output_id_map:
                output_id_map[name] = {}
            if ext == '.paf':
                # apply the hacky naming correction so that subpaths have no special characters in the hal (to make hubs happy)
                # this gets undone by hal2vg
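                # e.g. (hypothetical name) a subrange query like "id=hs1|chr1:1001-2000" is rewritten
                # to "id=hs1|chr1_sub_1000_2000" (the start coordinate is shifted down by 1)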
                cactus_call(parameters=['sed', '-i', '-e', 's/\([^:]*\):\([0-9]*\)-\([0-9]*\)/echo "\\1_sub_$((\\2-1))_\\3"/e',
                                        '-e', 's/ /\t/g', os.path.join(work_dir, out_name)]) 
            output_id_map[name][ext[1:]] = job.fileStore.writeGlobalFile(os.path.join(work_dir, out_name))
            
    return output_id_map, job.fileStore.writeGlobalFile(log_path)
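
A hedged sketch of the return value assembled in the loop above: output_id_map keys are the split names produced by rgfa-split (chromosomes plus the ambiguous catch-all), and each value maps the file extension (minus the dot) to a Toil file-store id. The names and ids below are hypothetical.

# Hypothetical shape of output_id_map as built above (real values are Toil FileIDs).
output_id_map = {
    'chr1':        {'gfa': 'filestore-id-1', 'paf': 'filestore-id-2', 'fa_contigs': 'filestore-id-3'},
    '_AMBIGUOUS_': {'paf': 'filestore-id-4', 'fa_contigs': 'filestore-id-5'},  # no GFA for ambiguous
}
for split_name, files in output_id_map.items():
    for ext, file_id in files.items():
        print('{}\t.{}\t{}'.format(split_name, ext, file_id))
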
Exemplo n.º 31
0
def export_split_data(toil, input_seq_id_map, output_id_map, split_log_ids,
                      output_dir, config):
    """ download all the split data locally """

    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot,
                                                  "graphmap_split"),
                                 "ambiguousName",
                                 default="_AMBIGUOUS_")

    chrom_file_map = {}

    for ref_contig in output_id_map.keys():
        ref_contig_path = os.path.join(output_dir, ref_contig)
        if not os.path.isdir(
                ref_contig_path) and not ref_contig_path.startswith('s3://'):
            os.makedirs(ref_contig_path)

        # GFA: <output_dir>/<contig>/<contig>.gfa
        if 'gfa' in output_id_map[ref_contig]:
            # we do this check because no GFA is made for the ambiguous-sequences "contig"
            toil.exportFile(
                output_id_map[ref_contig]['gfa'],
                makeURL(
                    os.path.join(ref_contig_path,
                                 '{}.gfa'.format(ref_contig))))

        # PAF: <output_dir>/<contig>/<contig>.paf
        paf_path = os.path.join(ref_contig_path, '{}.paf'.format(ref_contig))
        toil.exportFile(output_id_map[ref_contig]['paf'], makeURL(paf_path))

        # Fasta: <output_dir>/<contig>/fasta/<event>_<contig>.fa ..
        seq_file_map = {}
        for event, ref_contig_fa_id in output_id_map[ref_contig]['fa'].items():
            fa_base = os.path.join(ref_contig_path, 'fasta')
            if not os.path.isdir(fa_base) and not fa_base.startswith('s3://'):
                os.makedirs(fa_base)
            fa_path = makeURL(
                os.path.join(fa_base, '{}_{}.fa'.format(event, ref_contig)))
            if input_seq_id_map[event][0].endswith('.gz'):
                fa_path += '.gz'
            seq_file_map[event] = fa_path
            toil.exportFile(ref_contig_fa_id, fa_path)

        # Seqfile: <output_dir>/seqfiles/<contig>.seqfile
        seq_file_path = os.path.join(output_dir, 'seqfiles',
                                     '{}.seqfile'.format(ref_contig))
        if seq_file_path.startswith('s3://'):
            seq_file_temp_path = getTempFile()
        else:
            seq_file_temp_path = seq_file_path
            if not os.path.isdir(os.path.dirname(seq_file_path)):
                os.makedirs(os.path.dirname(seq_file_path))
        with open(seq_file_temp_path, 'w') as seq_file:
            for event, fa_path in seq_file_map.items():
                # cactus can't handle empty fastas.  if there are no sequences for a sample for this
                # contig, just don't add it.
                if output_id_map[ref_contig]['fa'][event].size > 0:
                    seq_file.write('{}\t{}\n'.format(event, fa_path))
        if seq_file_path.startswith('s3://'):
            write_s3(seq_file_temp_path, seq_file_path)

        # Top-level seqfile
        chrom_file_map[ref_contig] = seq_file_path, paf_path

    # Chromfile: <output_dir>/chromfile.txt
    chrom_file_path = os.path.join(output_dir, 'chromfile.txt')
    if chrom_file_path.startswith('s3://'):
        chrom_file_temp_path = getTempFile()
    else:
        chrom_file_temp_path = chrom_file_path
    with open(chrom_file_temp_path, 'w') as chromfile:
        for ref_contig, seqfile_paf in chrom_file_map.items():
            if ref_contig != amb_name:
                seqfile, paf = seqfile_paf[0], seqfile_paf[1]
                if seqfile.startswith('s3://'):
                    # no point keeping an absolute s3 reference, as cactus-align requires seqfiles to be passed locally
                    seqfile = 'seqfiles/{}'.format(os.path.basename(seqfile))
                chromfile.write('{}\t{}\t{}\n'.format(ref_contig, seqfile,
                                                      paf))
    if chrom_file_path.startswith('s3://'):
        write_s3(chrom_file_temp_path, chrom_file_path)

    toil.exportFile(split_log_ids[0],
                    makeURL(os.path.join(output_dir, 'minigraph.split.log')))
    if split_log_ids[1]:
        toil.exportFile(
            split_log_ids[1],
            makeURL(os.path.join(output_dir, 'minimap2.ambiguous.split.log')))
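
Collecting the per-section comments above, the exported directory layout looks roughly like the sketch below; contig and event names are hypothetical, and the fasta suffix gains ".gz" when the corresponding input was gzipped.

# Rough layout written by export_split_data (paths relative to output_dir):
#   chromfile.txt                        one line per non-ambiguous chromosome: name, seqfile, paf
#   seqfiles/<contig>.seqfile            event -> fasta path for that chromosome
#   <contig>/<contig>.gfa                absent for the ambiguous "contig"
#   <contig>/<contig>.paf
#   <contig>/fasta/<event>_<contig>.fa   (.fa.gz if the input sequence was gzipped)
#   minigraph.split.log
#   minimap2.ambiguous.split.log         only when a second-pass log id is present
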
Exemplo n.º 32
0
def combine_paf_splits(job, options, config, seq_id_map, original_id_map, orig_amb_entry,
                       remap_id_map, amb_name, graph_event):
    """ pull out PAF entries for contigs that were ambiguous in the first round but assigned by minimap2
    then add them to the chromosome PAFs     
    """

    if amb_name not in original_id_map:
        return original_id_map

    work_dir = job.fileStore.getLocalTempDir()
    amb_paf_path = os.path.join(work_dir, 'amb.paf')
    job.fileStore.readGlobalFile(orig_amb_entry['paf'], amb_paf_path, mutable=True)

    # use_minimap_paf = True: return the minimap2 mappings for ambiguous contigs in the final output
    # use_minimap_paf = False: ambiguous contigs are assigned to chromosomes based on minimap2, but their minigraph
    #                          alignments are returned in the final paf
    use_minimap_paf = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "useMinimapPAF",
                                        typeFn=bool, default=False)

    # it's simpler not to support both codepaths right now.  the main issue is that -u can cause contigs to be split
    # in which case they get renamed, so pulling them in from the existing PAF would require a pass to resolve all the
    # offsets
    if not use_minimap_paf and '-u' in getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "remapSplitOptions",
                                                         default=""):
        raise RuntimeError("useMinimapPAF must be set when -u present in remapSplitOptions")

    for ref_contig in remap_id_map.keys():
        if ref_contig != amb_name and ref_contig in original_id_map:

            # make a set of all minigraph nodes in this contig
            mg_fa_path = os.path.join(work_dir, '{}.{}.fa'.format(graph_event, ref_contig))
            if seq_id_map[graph_event][0].endswith('.gz'):
                mg_fa_path += '.gz'
            mg_contigs_path = os.path.join(work_dir, '{}.contigs'.format(graph_event))
            job.fileStore.readGlobalFile(original_id_map[ref_contig]['fa'][graph_event], mg_fa_path, mutable=True)
            cactus_call(parameters=[['zcat' if mg_fa_path.endswith('.gz') else 'cat', mg_fa_path],
                                    ['grep', '>'], ['cut', '-c', '2-']], outfile=mg_contigs_path)
            mg_contig_set = set()
            with open(mg_contigs_path, 'r') as mg_contigs_file:
                for line in mg_contigs_file:
                    mg_contig_set.add('id={}|{}'.format(graph_event, line.strip()))
            os.remove(mg_fa_path)
            os.remove(mg_contigs_path)

            #make a set of all the query contigs that we want to remove from ambiguous and add to this contig
            query_contig_set = set()

            for event in remap_id_map[ref_contig]['fa']:
                if event != graph_event and remap_id_map[ref_contig]['fa'][event].size > 0:
                    # read the contigs assigned to this sample for this chromosome by scanning fasta headers
                    tmp_fa_path = os.path.join(work_dir, 'tmp.fa')
                    if seq_id_map[event][0].endswith('.gz'):
                        tmp_fa_path += '.gz'
                    if os.path.isfile(tmp_fa_path):
                        os.remove(tmp_fa_path)
                    job.fileStore.readGlobalFile(remap_id_map[ref_contig]['fa'][event], tmp_fa_path, mutable=True)
                    contigs_path = os.path.join(work_dir, '{}.contigs'.format(event))
                    cactus_call(parameters=[['zcat' if tmp_fa_path.endswith('.gz') else 'cat', tmp_fa_path],
                                            ['grep', '>'], ['cut', '-c', '2-']], outfile=contigs_path)
                    # add them to the grep
                    with open(contigs_path, 'r') as contigs_file:
                        for line in contigs_file:
                            query_contig_set.add('id={}|{}'.format(event, line.strip()))

            if query_contig_set:
                # pull out remapped contigs into this path
                new_contig_path = os.path.join(work_dir, '{}.remap.paf'.format(ref_contig))
                do_append = False
                if ref_contig in original_id_map and 'paf' in original_id_map[ref_contig]:
                    job.fileStore.readGlobalFile(original_id_map[ref_contig]['paf'], new_contig_path, mutable=True)
                    do_append = True
                    
                # make an updated ambiguous paf with the contigs removed in this path                
                temp_contig_path = os.path.join(work_dir, amb_paf_path + '.temp.remove')                    
                with open(new_contig_path, 'a' if do_append else 'w') as new_contig_file, \
                     open(amb_paf_path, 'r') as amb_paf_file, \
                     open(temp_contig_path, 'w') as temp_contig_file:
                    # scan the ambiguous paf from minigraph
                    for line in amb_paf_file:
                        toks = line.split('\t')
                        if len(toks) > 5 and toks[0] in query_contig_set:
                            if toks[5] in mg_contig_set and not use_minimap_paf:
                                # move the alignment line if both the query and target belong to this reference contig
                                new_contig_file.write(line)
                        else:
                            # leave the contig in ambiguous
                            temp_contig_file.write(line)
                    if use_minimap_paf:
                        # if we're taking the mappings from minimap2, append them here (as they weren't added in
                        # the loop above)
                        minimap_paf_path = os.path.join(work_dir, '{}.minimap.paf'.format(ref_contig))
                        job.fileStore.readGlobalFile(remap_id_map[ref_contig]['paf'], minimap_paf_path)
                        with open(minimap_paf_path, 'r') as minimap_paf_file:
                            for line in minimap_paf_file:
                                toks = line.split('\t')
                                if len(toks) > 5:
                                    toks[5] = 'id={}|{}'.format(options.reference, toks[5])
                                new_contig_file.write('\t'.join(toks))
                        
                # update the map
                original_id_map[ref_contig]['paf'] = job.fileStore.writeGlobalFile(new_contig_path)
                # update the ambiguous paf
                cactus_call(parameters=['mv', temp_contig_path, amb_paf_path])

    # update the ambiguous paf
    if amb_name in original_id_map and original_id_map[amb_name]:
        original_id_map[amb_name]['paf'] = job.fileStore.writeGlobalFile(amb_paf_path)
    else:
        assert os.path.getsize(amb_paf_path) == 0
    
    return original_id_map
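
A minimal sketch of the line-routing rule applied in the loop above. PAF column 1 is the query name and column 6 is the target name; the sets, names, and the sample line here are hypothetical.

# Hypothetical inputs mirroring the per-chromosome loop above.
query_contig_set = {'id=hs1|contig7'}         # queries minimap2 reassigned to this chromosome
mg_contig_set = {'id=_MINIGRAPH_|s101'}       # minigraph nodes belonging to this chromosome
use_minimap_paf = False

def route_paf_line(line):
    """Return where one minigraph PAF line ends up: 'chromosome', 'ambiguous', or 'dropped'."""
    toks = line.split('\t')
    if len(toks) > 5 and toks[0] in query_contig_set:
        if toks[5] in mg_contig_set and not use_minimap_paf:
            return 'chromosome'   # keep the original minigraph alignment for this chromosome
        return 'dropped'          # superseded by the minimap2 alignment, or target on another chromosome
    return 'ambiguous'            # untouched: stays in the ambiguous PAF

sample = 'id=hs1|contig7\t100\t0\t100\t+\tid=_MINIGRAPH_|s101\t200\t0\t100\t90\t100\t60'
print(route_paf_line(sample))     # -> chromosome
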
Exemplo n.º 33
0
def split_gfa(job, config, gfa_id, paf_ids, ref_contigs, other_contig,
              reference_event, mask_bed_id):
    """ Use rgfa-split to divide a GFA and PAF into chromosomes.  The GFA must be in minigraph RGFA output using
    the desired reference. """

    if not paf_ids:
        # we can bypass when, e.g., doing the second pass on ambiguous sequences but none are present
        return [None, None]

    work_dir = job.fileStore.getLocalTempDir()
    gfa_path = os.path.join(work_dir, "mg.gfa")
    paf_path = os.path.join(work_dir, "mg.paf")
    out_prefix = os.path.join(work_dir, "split_")
    bed_path = os.path.join(work_dir, "mask.bed")
    log_path = os.path.join(work_dir, "split.log")
    if mask_bed_id:
        job.fileStore.readGlobalFile(mask_bed_id, bed_path)

    if gfa_id:
        job.fileStore.readGlobalFile(gfa_id, gfa_path)

    paf_paths = []
    for i, paf_id in enumerate(paf_ids):
        paf_paths.append(
            '{}.{}'.format(paf_path, i) if len(paf_ids) > 1 else paf_path)
        job.fileStore.readGlobalFile(paf_id, paf_paths[-1])
    if len(paf_paths) > 1:
        catFiles(paf_paths, paf_path)

    # get the minigraph "virtual" assembly name
    graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot,
                                                     "graphmap"),
                                    "assemblyName",
                                    default="_MINIGRAPH_")
    # and look up its unique id prefix.  this will be needed to pick its contigs out of the list
    mg_id = graph_event

    # get the specificity filters
    query_coverage = getOptionalAttrib(findRequiredNode(
        config.xmlRoot, "graphmap_split"),
                                       "minQueryCoverage",
                                       default="0")
    small_query_coverage = getOptionalAttrib(findRequiredNode(
        config.xmlRoot, "graphmap_split"),
                                             "minQuerySmallCoverage",
                                             default="0")
    small_coverage_threshold = getOptionalAttrib(findRequiredNode(
        config.xmlRoot, "graphmap_split"),
                                                 "minQuerySmallThreshold",
                                                 default="0")
    query_uniqueness = getOptionalAttrib(findRequiredNode(
        config.xmlRoot, "graphmap_split"),
                                         "minQueryUniqueness",
                                         default="0")
    max_gap = getOptionalAttrib(findRequiredNode(config.xmlRoot,
                                                 "graphmap_split"),
                                "maxGap",
                                default="0")
    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot,
                                                  "graphmap_split"),
                                 "ambiguousName",
                                 default="_AMBIGUOUS_")

    cmd = [
        'rgfa-split', '-p', paf_path, '-b', out_prefix, '-n', query_coverage,
        '-N', small_query_coverage, '-T', small_coverage_threshold, '-Q',
        query_uniqueness, '-P', max_gap, '-a', amb_name, '-L', log_path
    ]
    if gfa_id:
        cmd += ['-g', gfa_path, '-G']
    if other_contig:
        cmd += ['-o', other_contig]
    if reference_event:
        cmd += ['-r', 'id={}|'.format(reference_event)]
    if mask_bed_id:
        cmd += ['-B', bed_path]
    min_mapq = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"),
                                 "minMAPQ")
    if min_mapq:
        cmd += ['-A', min_mapq]

    for contig in ref_contigs:
        cmd += ['-c', contig]

    cactus_call(parameters=cmd, work_dir=work_dir)

    output_id_map = {}
    for out_name in os.listdir(work_dir):
        file_name, ext = os.path.splitext(out_name)
        if file_name.startswith(os.path.basename(out_prefix)) and ext in [
                ".gfa", ".paf", ".fa_contigs"
        ]:
            name = file_name[len(os.path.basename(out_prefix)):]
            if name not in output_id_map:
                output_id_map[name] = {}
            output_id_map[name][ext[1:]] = job.fileStore.writeGlobalFile(
                os.path.join(work_dir, out_name))

    return output_id_map, job.fileStore.writeGlobalFile(log_path)
Exemplo n.º 34
0
def runCactusGraphMapSplit(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            wf_output = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            #load cactus config
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            config.substituteAllPredefinedConstantsWithLiterals()

            # load up the contigs if any
            ref_contigs = set(options.refContigs)
            # todo: use import?
            if options.refContigsFile:
                with open(options.refContigsFile, 'r') as rc_file:
                    for line in rc_file:
                        if len(line.strip()):
                            ref_contigs.add(line.strip().split()[0])

            if options.otherContig:
                assert options.otherContig not in ref_contigs

            # get the minigraph "virtual" assembly name
            graph_event = getOptionalAttrib(findRequiredNode(
                configNode, "graphmap"),
                                            "assemblyName",
                                            default="_MINIGRAPH_")

            # load the seqfile
            seqFile = SeqFile(options.seqFile)

            #import the graph
            gfa_id = toil.importFile(makeURL(options.minigraphGFA))

            #import the paf
            paf_id = toil.importFile(makeURL(options.graphmapPAF))

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            seqIDMap = {}
            leaves = set([
                seqFile.tree.getName(node)
                for node in seqFile.tree.getLeaves()
            ])

            if graph_event not in leaves:
                raise RuntimeError(
                    "Minigraph name {} not found in seqfile".format(
                        graph_event))
            if options.reference and options.reference not in leaves:
                raise RuntimeError(
                    "Name given with --reference {} not found in seqfile".
                    format(options.reference))

            for genome, seq in seqFile.pathMap.items():
                if genome in leaves:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    logger.info("Importing {}".format(seq))
                    seqIDMap[genome] = (seq, toil.importFile(seq))

            # run the workflow
            wf_output = toil.start(
                Job.wrapJobFn(graphmap_split_workflow, options, config,
                              seqIDMap, gfa_id, options.minigraphGFA, paf_id,
                              options.graphmapPAF, ref_contigs,
                              options.otherContig))

        #export the split data
        export_split_data(toil, wf_output[0], wf_output[1], wf_output[2:],
                          options.outDir, config)
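
A small hedged example of the --refContigsFile handling above (the file contents are hypothetical): blank lines are skipped, and only the first whitespace-separated token of each remaining line is kept.

# Hypothetical refContigsFile contents; mirrors the parsing loop above.
ref_contigs_text = "chr1\t248956422\nchr2\t242193529\n\nchrM\t16569\n"
ref_contigs = set()
for line in ref_contigs_text.splitlines():
    if len(line.strip()):
        ref_contigs.add(line.strip().split()[0])
print(sorted(ref_contigs))   # ['chr1', 'chr2', 'chrM']
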