def validateDataset(datasetPath, dbsUrl):
    """
    _validateDataset_

    Util method to check that the datasetPath provided
    exists in the dbsUrl provided

    """
    datasetDetails = DatasetConventions.parseDatasetPath(datasetPath)
    for key in ['Primary', 'DataTier', 'Processed']:
        if datasetDetails[key] is None:
            msg = "Invalid Dataset Name: \n ==> %s\n" % datasetPath
            msg += "Does not contain %s information" % key
            raise WorkflowMakerError(msg)

    datasets = []
    try:
        reader = DBSReader(dbsUrl)
        datasets = reader.matchProcessedDatasets(
            datasetDetails['Primary'],
            datasetDetails['DataTier'],
            datasetDetails['Processed'])
    except Exception, ex:
        msg = "Error calling DBS to validate dataset:\n%s\n" % datasetPath
        msg += str(ex)
        raise WorkflowMakerError(msg)
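# Illustrative sketch, not part of the original module: how validateDataset
# is expected to be called. The dataset path below is a made-up example of
# the /Primary/Processed/DataTier convention; the DBS URL is the global DBS
# endpoint used elsewhere in this code.
def _exampleValidateDataset():
    # Raises WorkflowMakerError if a path component is missing or the
    # DBS lookup fails; returns None on success.
    dbsUrl = "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"
    validateDataset("/MinBias/CMSSW_3_1_0-STARTUP31X_V1-v1/GEN-SIM-RAW", dbsUrl)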
def addInputDataset(self, datasetPath):
    """
    _addInputDataset_

    If this workflow processes a dataset, set that here

    NOTE: It is also possible to specify:
      - Split Type (file or event)
      - Split Size (int)
      - input DBS

    Not sure how many of these we want to use.
    For now, they can be added to the inputDataset dictionary

    """
    datasetBits = DatasetConventions.parseDatasetPath(datasetPath)
    self.inputDataset.update(datasetBits)
    self.inputDataset['IsUsed'] = True
    self.inputDataset['DatasetName'] = datasetPath
    return
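# Illustrative sketch (assumed names): registering an input dataset on a
# WorkflowMaker instance. The 'SplitType'/'SplitSize' keys are assumptions
# standing in for the optional split settings the docstring above mentions.
def _exampleAddInputDataset(maker):
    maker.addInputDataset("/MinBias/CMSSW_3_1_0-STARTUP31X_V1-v1/GEN-SIM-RAW")
    # Optional extras can be added straight to the inputDataset dictionary:
    maker.inputDataset['SplitType'] = "file"   # assumed key
    maker.inputDataset['SplitSize'] = 1        # assumed key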
def addPileupDataset(self, datasetName, filesPerJob=10, targetModule=None):
    """
    _addPileupDataset_

    Add a dataset to provide pileup overlap.
    filesPerJob should be 1 in 99.9 % of cases

    """
    pileupDataset = {}
    pileupDataset['Primary'] = None
    pileupDataset['Processed'] = None
    pileupDataset['DataTier'] = None
    datasetBits = DatasetConventions.parseDatasetPath(datasetName)
    pileupDataset.update(datasetBits)
    pileupDataset['FilesPerJob'] = filesPerJob
    # Target module could be 'MixingModule' or 'DataMixingModule' for
    # the moment. If None, MixingModule will be used.
    pileupDataset['TargetModule'] = targetModule
    self.pileupDatasets.append(pileupDataset)
    return
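# Illustrative sketch (made-up dataset name): adding a pileup dataset that
# feeds the DataMixingModule rather than the default MixingModule, with the
# filesPerJob=1 setting the docstring recommends for almost all cases.
def _exampleAddPileupDataset(maker):
    maker.addPileupDataset(
        "/MinBias/Summer09-STARTUP31X_V2-v1/GEN-SIM-RAW",
        filesPerJob=1,
        targetModule="DataMixingModule")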
msg = "--split-size option not provided: This is required" raise RuntimeError, msg try: splitSize = int(splitSize) except ValueError, ex: msg = "--split-size argument is not an integer: %s\n" % splitSize raise RuntimeError, msg #channel0 = DatasetConventions.parseDatasetPath(dataset)['Primary'] if channel == None: # // # // Assume same as input #// channel = DatasetConventions.parseDatasetPath(dataset)['Primary'] # // # // Checking arguments against naming conventions #// if not (re.findall("^v[0-9]+$", processingVersion)): msg = "processing_version '" + processingVersion + \ "' violates naming conventions!\n" \ "Processing version should match this regexp ^v[0-9]+$ " \ "(see https://twiki.cern.ch/twiki/bin/view/CMS/DMWMPG_PrimaryDatasets)" raise RuntimeError, msg if re.findall("[-]+", acquisitionEra):
def createMergeJobWorkflow(procSpec, isFastMerge=True, doCleanUp=True,
                           littleE=False):
    """
    _createMergeJobWorkflow_

    Given a Processing Workflow, generate a set of Merge Job
    workflows that can be used to generate actual merge jobs
    (as opposed to creating datasets like createMergeDatasetWorkflow)

    returns a dictionary of (input, i.e. MergeSensor watched) dataset
    name to workflow spec instances

    """
    mergeDatasetWF = createMergeDatasetWorkflow(procSpec, isFastMerge)
    mergeDatasets = mergeDatasetWF.outputDatasets()

    results = {}

    procSpecName = procSpec.workflowName()

    for dataset in mergeDatasets:
        inputDataset = dataset['ParentDataset']

        newWF = WorkflowSpec()
        newWF.parameters.update(procSpec.parameters)
        newWF.setWorkflowName(procSpecName)
        newWF.parameters['WorkflowType'] = "Merge"

        cmsRunNode = newWF.payload
        cmsRunNode.name = "cmsRun1"
        cmsRunNode.type = "CMSSW"
        cmsRunNode.application["Project"] = "CMSSW"
        cmsRunNode.application["Version"] = dataset['ApplicationVersion']
        cmsRunNode.application["Architecture"] = "slc3_ia32_gcc323"

        #  //
        # // Hack to forward UserSandbox to Merge Jobs
        #//
        userSandbox = dataset.get("UserSandbox", None)
        if userSandbox is not None:
            cmsRunNode.userSandbox = userSandbox

        #if isFastMerge == True:
        #    if littleE:
        #        cmsRunNode.application["Executable"] = "edmFastMerge"
        #    else:
        #        cmsRunNode.application["Executable"] = _FastMergeBinary
        #    outputModuleName = "EdmFastMerge"
        #else:
        cmsRunNode.application["Executable"] = "cmsRun"
        outputModuleName = "Merged"

        #  //
        # // Input Dataset
        #//
        datasetBits = DatasetConventions.parseDatasetPath(inputDataset)
        inDataset = cmsRunNode.addInputDataset(datasetBits['Primary'],
                                               datasetBits['Processed'])
        inDataset["DataTier"] = datasetBits['DataTier']

        #  //
        # // Output Dataset
        #//
        outputDataset = cmsRunNode.addOutputDataset(
            dataset['PrimaryDataset'],
            dataset['ProcessedDataset'],
            outputModuleName)

        outputDataset["DataTier"] = dataset['DataTier']
        outputDataset["PSetHash"] = dataset['PSetHash']
        outputDataset["ApplicationName"] = cmsRunNode.application["Executable"]
        outputDataset["ApplicationProject"] = cmsRunNode.application["Project"]
        outputDataset["ApplicationVersion"] = cmsRunNode.application["Version"]
        outputDataset["ApplicationFamily"] = outputModuleName
        outputDataset["PhysicsGroup"] = \
            procSpec.parameters.get('PhysicsGroup', None)
        outputDataset['ParentDataset'] = inputDataset

        #  //
        # // Add Stage Out node
        #//
        WorkflowTools.addStageOutNode(cmsRunNode, "stageOut1")
        if doCleanUp == True:
            WorkflowTools.addCleanUpNode(cmsRunNode, "cleanUp1")

        #  //
        # // Add log archive node
        #//
        WorkflowTools.addLogArchNode(cmsRunNode, "logArchive")

        WorkflowTools.generateFilenames(newWF)

        results[inputDataset] = newWF

    return results
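# Illustrative sketch (assumed spec I/O API): generating merge workflows from
# a saved processing workflow spec and writing one spec file per watched
# input dataset. WorkflowSpec.load/save and the filename scheme are
# assumptions for this example only.
def _exampleCreateMergeJobWorkflows(specFile):
    procSpec = WorkflowSpec()
    procSpec.load(specFile)
    mergeWorkflows = createMergeJobWorkflow(procSpec)
    for inputDataset, mergeWF in mergeWorkflows.items():
        # One merge workflow per MergeSensor-watched dataset
        mergeWF.save("%s-Merge.xml" % inputDataset.replace("/", "_"))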
def createHarvestingWorkflow(dataset, site, cmsPath, scramArch,
                             cmsswVersion, globalTag, configFile=None,
                             DQMServer=None, proxyLocation=None,
                             DQMCopyToCERN=None, runNumber=None,
                             doStageOut=None):
    """
    _createHarvestingWorkflow_

    Create a Harvesting workflow to extract DQM information from
    a dataset

    Enters an essentially empty process that will be updated
    at runtime to use the harvesting cfg from the release.

    """
    datasetPieces = DatasetConventions.parseDatasetPath(dataset)

    physicsGroup = "OfflineDQM"
    category = "DQM"

    if runNumber is None:
        requestId = "OfflineDQM"
        label = "%s-%s-%s" % (datasetPieces['Primary'],
                              datasetPieces['Processed'],
                              datasetPieces['DataTier'])
        channel = "DQMHarvest"
    else:
        requestId = "%s-%s" % (datasetPieces["Primary"],
                               datasetPieces["DataTier"])
        label = "DQMHarvesting"
        channel = "Run%s" % runNumber

    logging.debug("path, arch, ver: %s, %s, %s" % (
        cmsPath, scramArch, cmsswVersion))

    if configFile is not None:
        cfgWrapper = configFromFile(cmsPath, scramArch, cmsswVersion,
                                    configFile)
    else:
        cfgWrapper = configOnFly(cmsPath, scramArch, cmsswVersion)

    #  //
    # // Pass in global tag
    #//
    cfgWrapper.conditionsTag = globalTag

    maker = WorkflowMaker(requestId, channel, label)

    maker.setCMSSWVersion(cmsswVersion)
    maker.setPhysicsGroup(physicsGroup)
    maker.setConfiguration(cfgWrapper, Type="instance")
    maker.changeCategory(category)
    maker.setPSetHash("NO_HASH")
    maker.addInputDataset(dataset)
    maker.setActivity('harvesting')

    spec = maker.makeWorkflow()
    spec.parameters['WorkflowType'] = "Harvesting"
    spec.parameters['DBSURL'] = "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"
    spec.parameters['OnlySites'] = site

    if DQMServer is not None:
        spec.parameters['DQMServer'] = DQMServer
    if proxyLocation is not None:
        spec.parameters['proxyLocation'] = proxyLocation
    if DQMCopyToCERN is not None:
        spec.parameters['DQMCopyToCERN'] = DQMCopyToCERN
    if doStageOut is not None:
        spec.parameters['DoStageOut'] = doStageOut

    spec.payload.scriptControls['PostTask'].append(
        "JobCreator.RuntimeTools.RuntimeOfflineDQM")

    if configFile is None:
        preExecScript = spec.payload.scriptControls["PreExe"]
        preExecScript.append("JobCreator.RuntimeTools.RuntimeOfflineDQMSetup")

    return spec
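# Illustrative sketch (placeholder argument values): building a run-scoped
# harvesting workflow. The dataset, site, paths, release and global tag below
# are made-up examples, not values from the original module.
def _exampleCreateHarvestingWorkflow():
    spec = createHarvestingWorkflow(
        dataset="/MinimumBias/Commissioning10-v4/RECO",
        site="srm-cms.cern.ch",
        cmsPath="/opt/cms",
        scramArch="slc5_ia32_gcc434",
        cmsswVersion="CMSSW_3_5_4",
        globalTag="GR10_P_V4::All",
        runNumber=132440)
    # With runNumber set, the workflow channel becomes "Run132440"
    return spec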