Example #1
def validateDataset( datasetPath, dbsUrl):
    """
    _validateDataset_
    
    Utility method to check that the provided datasetPath
    exists in the DBS instance at dbsUrl
    
    """
    
    datasetDetails = DatasetConventions.parseDatasetPath(datasetPath)
    for key in ['Primary', 'DataTier', 'Processed']:
        if datasetDetails[key] == None:
            msg = "Invalid Dataset Name: \n ==> %s\n" % datasetPath
            msg += "Does not contain %s information" % key
            raise WorkflowMakerError(msg)
                

    datasets = []
    try:
        reader = DBSReader(dbsUrl)
        datasets = reader.matchProcessedDatasets(
            datasetDetails['Primary'],
            datasetDetails['DataTier'],
            datasetDetails['Processed'])
    except Exception, ex:
        msg = "Error calling DBS to validate dataset:\n%s\n" % datasetPath
        msg += str(ex)
        raise WorkflowMakerError(msg)
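The helpers used above (DatasetConventions, DBSReader, WorkflowMakerError) come from the surrounding CMS ProdCommon/ProdAgent code base and are not shown in the excerpt. As a rough, self-contained illustration of the same idea, the sketch below validates the /Primary/Processed/DataTier path format with a stand-in parser; the names and the error type are hypothetical, and the DBS lookup is deliberately left out.

import re

class DatasetValidationError(Exception):
    """Hypothetical stand-in for WorkflowMakerError."""

def parse_dataset_path(dataset_path):
    # A dataset path looks like /Primary/Processed/DataTier.
    match = re.match(r"^/([^/]+)/([^/]+)/([^/]+)$", dataset_path)
    if match is None:
        return {'Primary': None, 'Processed': None, 'DataTier': None}
    primary, processed, tier = match.groups()
    return {'Primary': primary, 'Processed': processed, 'DataTier': tier}

def validate_dataset_path(dataset_path):
    # Same per-component check as validateDataset above, minus the DBS call.
    details = parse_dataset_path(dataset_path)
    for key in ('Primary', 'DataTier', 'Processed'):
        if details[key] is None:
            raise DatasetValidationError(
                "Invalid Dataset Name: %s (missing %s)" % (dataset_path, key))
    return details

For example, validate_dataset_path('/Cosmics/Run2010A-v1/RAW') returns the parsed pieces, while a malformed path raises DatasetValidationError.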
Example #2
    def addInputDataset(self, datasetPath):
        """
        _addInputDataset_

        If this workflow processes a dataset, set that here

        NOTE: It is also possible to specify
            - Split Type (file or event)
            - Split Size (int)
            - input DBS
        Not sure how many of these we want to use.
        For now, they can be added to the inputDataset dictionary
        """
        datasetBits = DatasetConventions.parseDatasetPath(datasetPath)
        self.inputDataset.update(datasetBits)
        self.inputDataset['IsUsed'] = True
        self.inputDataset['DatasetName'] = datasetPath
        
        return
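addInputDataset simply merges the parsed path components into the maker's inputDataset dictionary and flags it as used. A minimal stand-alone sketch of that bookkeeping (the class and the inline split are hypothetical stand-ins for WorkflowMaker and DatasetConventions.parseDatasetPath):

class InputDatasetHolder(object):
    # Trimmed-down, hypothetical stand-in for the WorkflowMaker state.
    def __init__(self):
        self.inputDataset = {'IsUsed': False, 'DatasetName': None}

    def addInputDataset(self, datasetPath):
        # Split '/Primary/Processed/DataTier' into its components.
        primary, processed, tier = datasetPath.strip('/').split('/')
        self.inputDataset.update(
            {'Primary': primary, 'Processed': processed, 'DataTier': tier})
        self.inputDataset['IsUsed'] = True
        self.inputDataset['DatasetName'] = datasetPath

After holder.addInputDataset('/Cosmics/Run2010A-v1/RAW'), inputDataset carries the three path components plus the full dataset name.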
Example #3
    def addPileupDataset(self, datasetName, filesPerJob=10, targetModule=None):
        """
        _addPileupDataset_

        Add a dataset to provide pileup overlap.
        filesPerJob should be 1 in 99.9 % of cases

        """
        pileupDataset = {}
        pileupDataset['Primary'] = None
        pileupDataset['Processed'] = None
        pileupDataset['DataTier'] = None
        datasetBits = DatasetConventions.parseDatasetPath(datasetName)
        pileupDataset.update(datasetBits)
        pileupDataset['FilesPerJob'] = filesPerJob
        # Target module could be 'MixingModule' or 'DataMixingModule' for
        # the moment. If None, MixingModule will be used.
        pileupDataset['TargetModule'] = targetModule
        self.pileupDatasets.append(pileupDataset)
        return
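Each call appends one plain dictionary to pileupDatasets. A stand-alone restatement of what that entry contains, using a hypothetical helper and an inline path split in place of DatasetConventions.parseDatasetPath:

def build_pileup_entry(datasetName, filesPerJob=10, targetModule=None):
    # Mirrors the bookkeeping in addPileupDataset above; the real method
    # appends the resulting dict to self.pileupDatasets.
    primary, processed, tier = datasetName.strip('/').split('/')
    return {
        'Primary': primary,
        'Processed': processed,
        'DataTier': tier,
        'FilesPerJob': filesPerJob,
        # None means the downstream tools fall back to 'MixingModule'.
        'TargetModule': targetModule,
    }

Leaving targetModule as None therefore selects MixingModule, while passing 'DataMixingModule' switches the overlap to the data-mixing path.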
Example #4
    def addPileupDataset(self, datasetName, filesPerJob = 10,
            targetModule=None):
        """
        _addPileupDataset_

        Add a dataset to provide pileup overlap.
        filesPerJob should be 1 in 99.9 % of cases

        """
        pileupDataset = {}
        pileupDataset['Primary'] = None
        pileupDataset['Processed'] = None
        pileupDataset['DataTier'] = None
        datasetBits = DatasetConventions.parseDatasetPath(datasetName)
        pileupDataset.update(datasetBits)
        pileupDataset['FilesPerJob'] = filesPerJob
        # Target module could be 'MixingModule' or 'DataMixingModule' for
        # the moment. If None, MixingModule will be used.
        pileupDataset['TargetModule'] = targetModule
        self.pileupDatasets.append(pileupDataset)
        return
Example #5
    msg = "--split-size option not provided: This is required"
    raise RuntimeError, msg

try:
    splitSize = int(splitSize)
except ValueError, ex:
    msg = "--split-size argument is not an integer: %s\n" % splitSize
    raise RuntimeError, msg

#channel0 = DatasetConventions.parseDatasetPath(dataset)['Primary']

if channel == None:
    #  //
    # // Assume same as input
    #//
    channel = DatasetConventions.parseDatasetPath(dataset)['Primary']
    



#  //
# // Checking arguments against naming conventions
#//
if not (re.findall("^v[0-9]+$", processingVersion)):
    msg = "processing_version '" + processingVersion + \
        "' violates naming conventions!\n" \
        "Processing version should match this regexp ^v[0-9]+$ " \
        "(see https://twiki.cern.ch/twiki/bin/view/CMS/DMWMPG_PrimaryDatasets)"
    raise RuntimeError, msg

if re.findall("[-]+", acquisitionEra):
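The snippet is cut off mid-check, but the pattern is clear: each naming-convention rule is a regular expression, and a violation raises RuntimeError. A self-contained restatement of the two visible checks (the acquisition-era message, and anything beyond the "no dashes" rule, are assumptions based on what is shown):

import re

def check_naming_conventions(processingVersion, acquisitionEra):
    # Processing version must look like v1, v2, ... (see the
    # DMWMPG_PrimaryDatasets twiki referenced above).
    if not re.match(r"^v[0-9]+$", processingVersion):
        raise RuntimeError(
            "processing_version '%s' violates naming conventions; "
            "it should match ^v[0-9]+$" % processingVersion)
    # The original goes on to reject acquisition eras containing '-'.
    if re.findall("[-]+", acquisitionEra):
        raise RuntimeError(
            "acquisition_era '%s' must not contain '-'" % acquisitionEra)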
Example #6
def createMergeJobWorkflow(procSpec, isFastMerge = True, doCleanUp = True, littleE = False):
    """
    _createMergeJobWorkflow_

    Given a Processing Workflow, generate a set of Merge Job
    workflows that can be used to generate actual merge jobs 
    (as opposed to creating datasets like createMergeDatasetWorkflow)

    returns a dictionary mapping each input (i.e. MergeSensor-watched) dataset name
    to a workflow spec instance

    """
    mergeDatasetWF = createMergeDatasetWorkflow(procSpec, isFastMerge)
    mergeDatasets = mergeDatasetWF.outputDatasets()

    results = {}

    procSpecName = procSpec.workflowName()
    

    for dataset in mergeDatasets:
        inputDataset = dataset['ParentDataset']

        newWF = WorkflowSpec()
        newWF.parameters.update(procSpec.parameters)
        newWF.setWorkflowName(procSpecName)
        newWF.parameters['WorkflowType'] = "Merge"
        

        cmsRunNode = newWF.payload
        cmsRunNode.name = "cmsRun1"
        cmsRunNode.type = "CMSSW"
        cmsRunNode.application["Project"] = "CMSSW"
        cmsRunNode.application["Version"] = dataset['ApplicationVersion']
        cmsRunNode.application["Architecture"] = "slc3_ia32_gcc323"

        #  //
        # // Hack to forward UserSandbox to Merge Jobs
        #//
        userSandbox = dataset.get("UserSandbox", None)
        if userSandbox != None:
            cmsRunNode.userSandbox = userSandbox

        #if isFastMerge == True:
        #    if littleE:
        #        cmsRunNode.application["Executable"] = "edmFastMerge"
        #    else:
        #        cmsRunNode.application["Executable"] = _FastMergeBinary
        #    outputModuleName = "EdmFastMerge"
        #else:
        cmsRunNode.application["Executable"] = "cmsRun"
        outputModuleName = "Merged"

        #  //
        # // Input Dataset
        #//
        datasetBits = DatasetConventions.parseDatasetPath(inputDataset)
        inDataset = cmsRunNode.addInputDataset(datasetBits['Primary'],
                                               datasetBits['Processed'])
        inDataset["DataTier"] = datasetBits['DataTier']

        #  //
        # // Output Dataset
        #//
        
        outputDataset = cmsRunNode.addOutputDataset(
            dataset['PrimaryDataset'], 
            dataset['ProcessedDataset'], 
            outputModuleName)

        outputDataset["DataTier"] = dataset['DataTier']
        outputDataset["PSetHash"] = dataset['PSetHash']

        outputDataset["ApplicationName"] = \
                    cmsRunNode.application["Executable"]
        outputDataset["ApplicationProject"] = \
                    cmsRunNode.application["Project"]
        outputDataset["ApplicationVersion"] = \
                    cmsRunNode.application["Version"]
        outputDataset["ApplicationFamily"] = outputModuleName
        outputDataset["PhysicsGroup"] = \
                      procSpec.parameters.get('PhysicsGroup', None)
        outputDataset['ParentDataset'] = inputDataset
                
        
        #  //
        # // Add Stage Out node
        #//
        WorkflowTools.addStageOutNode(cmsRunNode, "stageOut1")
        if doCleanUp == True:
            WorkflowTools.addCleanUpNode(cmsRunNode, "cleanUp1")

        #  //
        # // Add log archive node
        #//
        WorkflowTools.addLogArchNode(cmsRunNode, "logArchive")

        WorkflowTools.generateFilenames(newWF)

        
        results[inputDataset] = newWF

    return results
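WorkflowSpec, WorkflowTools and createMergeDatasetWorkflow all come from the ProdCommon code base, so the function cannot be reproduced in runnable form here. The sketch below only restates, with plain dictionaries, the metadata each merge output dataset ends up carrying and how the result is keyed by the watched input dataset; the field names are taken from the code above, everything else is hypothetical.

def merge_output_metadata(dataset, physicsGroup=None):
    # dataset is one entry from mergeDatasetWF.outputDatasets(): a dict
    # with the keys used by createMergeJobWorkflow above.
    return {
        'PrimaryDataset': dataset['PrimaryDataset'],
        'ProcessedDataset': dataset['ProcessedDataset'],
        'DataTier': dataset['DataTier'],
        'PSetHash': dataset['PSetHash'],
        'ApplicationName': 'cmsRun',
        'ApplicationProject': 'CMSSW',
        'ApplicationVersion': dataset['ApplicationVersion'],
        'ApplicationFamily': 'Merged',
        'PhysicsGroup': physicsGroup,
        'ParentDataset': dataset['ParentDataset'],
    }

def merge_specs_by_input(mergeDatasets, physicsGroup=None):
    # Mirrors the {inputDataset: newWF} mapping returned above, with the
    # metadata dict standing in for the real WorkflowSpec instance.
    return {d['ParentDataset']: merge_output_metadata(d, physicsGroup)
            for d in mergeDatasets}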
Example #7
def createHarvestingWorkflow(dataset, site, cmsPath, scramArch,
                             cmsswVersion, globalTag, configFile = None,
                             DQMServer = None, proxyLocation = None, 
                             DQMCopyToCERN = None, runNumber = None,
                             doStageOut = None):
    """
    _createHarvestingWorkflow_

    Create a Harvesting workflow to extract DQM information from
    a dataset

    Enters an essentially empty process that will be updated
    at runtime to use the harvesting cfg from the release.

    """

    datasetPieces = DatasetConventions.parseDatasetPath(dataset)

    physicsGroup = "OfflineDQM"
    category = "DQM"
    
    if runNumber == None:
        requestId = "OfflineDQM"
        label = "%s-%s-%s" % (datasetPieces['Primary'], datasetPieces['Processed'],
                          datasetPieces['DataTier'])
        channel = "DQMHarvest"
    else:
        requestId = "%s-%s" % (datasetPieces["Primary"], datasetPieces["DataTier"])
        label = "DQMHarvesting"
        channel = "Run%s" % runNumber

    logging.debug("path, arch, ver: %s, %s, %s" % (cmsPath, scramArch, cmsswVersion))

    if configFile != None:
        cfgWrapper = configFromFile(cmsPath, scramArch,
                                    cmsswVersion, configFile)
    else:
        cfgWrapper = configOnFly(cmsPath, scramArch,
                                 cmsswVersion)
        
    #  //
    # // Pass in global tag
    #//
    cfgWrapper.conditionsTag = globalTag


    maker = WorkflowMaker(requestId, channel, label )
    maker.setCMSSWVersion(cmsswVersion)
    maker.setPhysicsGroup(physicsGroup)
    maker.setConfiguration(cfgWrapper, Type = "instance")
    maker.changeCategory(category)
    maker.setPSetHash("NO_HASH")
    maker.addInputDataset(dataset)
    maker.setActivity('harvesting')

    spec = maker.makeWorkflow()
    spec.parameters['WorkflowType'] = "Harvesting"
    spec.parameters['DBSURL'] = "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"
    spec.parameters['OnlySites'] = site
    if DQMServer != None :
        spec.parameters['DQMServer'] = DQMServer
    if proxyLocation != None :
        spec.parameters['proxyLocation'] = proxyLocation
    if DQMCopyToCERN != None :
        spec.parameters['DQMCopyToCERN'] = DQMCopyToCERN
    if doStageOut is not None:
        spec.parameters['DoStageOut'] = doStageOut

    spec.payload.scriptControls['PostTask'].append(
        "JobCreator.RuntimeTools.RuntimeOfflineDQM")

    if configFile == None:
        preExecScript = spec.payload.scriptControls["PreExe"]
        preExecScript.append("JobCreator.RuntimeTools.RuntimeOfflineDQMSetup")


    return spec
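One detail worth pulling out is how the request id, label and channel are chosen: without a run number the workflow is labelled from the dataset pieces, with a run number it becomes a per-run harvest. A stand-alone restatement of just that branch (datasetPieces is the dictionary produced by DatasetConventions.parseDatasetPath):

def harvesting_names(datasetPieces, runNumber=None):
    # Same requestId/label/channel logic as createHarvestingWorkflow above.
    if runNumber is None:
        requestId = "OfflineDQM"
        label = "%s-%s-%s" % (datasetPieces['Primary'],
                              datasetPieces['Processed'],
                              datasetPieces['DataTier'])
        channel = "DQMHarvest"
    else:
        requestId = "%s-%s" % (datasetPieces['Primary'],
                               datasetPieces['DataTier'])
        label = "DQMHarvesting"
        channel = "Run%s" % runNumber
    return requestId, label, channel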
Example #8
def createMergeJobWorkflow(procSpec,
                           isFastMerge=True,
                           doCleanUp=True,
                           littleE=False):
    """
    _createMergeJobWorkflow_

    Given a Processing Workflow, generate a set of Merge Job
    workflows that can be used to generate actual merge jobs 
    (as opposed to creating datasets like createMergeDatasetWorkflow)

    returns a dictionary mapping each input (i.e. MergeSensor-watched) dataset name
    to a workflow spec instance

    """
    mergeDatasetWF = createMergeDatasetWorkflow(procSpec, isFastMerge)
    mergeDatasets = mergeDatasetWF.outputDatasets()

    results = {}

    procSpecName = procSpec.workflowName()

    for dataset in mergeDatasets:
        inputDataset = dataset['ParentDataset']

        newWF = WorkflowSpec()
        newWF.parameters.update(procSpec.parameters)
        newWF.setWorkflowName(procSpecName)
        newWF.parameters['WorkflowType'] = "Merge"

        cmsRunNode = newWF.payload
        cmsRunNode.name = "cmsRun1"
        cmsRunNode.type = "CMSSW"
        cmsRunNode.application["Project"] = "CMSSW"
        cmsRunNode.application["Version"] = dataset['ApplicationVersion']
        cmsRunNode.application["Architecture"] = "slc3_ia32_gcc323"

        #  //
        # // Hack to forward UserSandbox to Merge Jobs
        #//
        userSandbox = dataset.get("UserSandbox", None)
        if userSandbox != None:
            cmsRunNode.userSandbox = userSandbox

        #if isFastMerge == True:
        #    if littleE:
        #        cmsRunNode.application["Executable"] = "edmFastMerge"
        #    else:
        #        cmsRunNode.application["Executable"] = _FastMergeBinary
        #    outputModuleName = "EdmFastMerge"
        #else:
        cmsRunNode.application["Executable"] = "cmsRun"
        outputModuleName = "Merged"

        #  //
        # // Input Dataset
        #//
        datasetBits = DatasetConventions.parseDatasetPath(inputDataset)
        inDataset = cmsRunNode.addInputDataset(datasetBits['Primary'],
                                               datasetBits['Processed'])
        inDataset["DataTier"] = datasetBits['DataTier']

        #  //
        # // Output Dataset
        #//

        outputDataset = cmsRunNode.addOutputDataset(
            dataset['PrimaryDataset'], dataset['ProcessedDataset'],
            outputModuleName)

        outputDataset["DataTier"] = dataset['DataTier']
        outputDataset["PSetHash"] = dataset['PSetHash']

        outputDataset["ApplicationName"] = \
                    cmsRunNode.application["Executable"]
        outputDataset["ApplicationProject"] = \
                    cmsRunNode.application["Project"]
        outputDataset["ApplicationVersion"] = \
                    cmsRunNode.application["Version"]
        outputDataset["ApplicationFamily"] = outputModuleName
        outputDataset["PhysicsGroup"] = \
                      procSpec.parameters.get('PhysicsGroup', None)
        outputDataset['ParentDataset'] = inputDataset

        #  //
        # // Add Stage Out node
        #//
        WorkflowTools.addStageOutNode(cmsRunNode, "stageOut1")
        if doCleanUp == True:
            WorkflowTools.addCleanUpNode(cmsRunNode, "cleanUp1")

        #  //
        # // Add log archive node
        #//
        WorkflowTools.addLogArchNode(cmsRunNode, "logArchive")

        WorkflowTools.generateFilenames(newWF)

        results[inputDataset] = newWF

    return results