def _getSubmissionJobs(self, maxsample):
    # Get list of submittable jobs
    readyList = self.jobDB.getJobs(ClassSelector(JobClass.READY))
    retryOK = readyList
    defaultJob = Job()
    if self._job_retries >= 0:
        retryOK = lfilter(lambda x: self.jobDB.get(x, defaultJob).attempt - 1 < self._job_retries, readyList)
    modOK = lfilter(self._task.canSubmit, readyList)
    jobList = set.intersection(set(retryOK), set(modOK))

    if self._showBlocker and readyList and not jobList:  # No submission but ready jobs
        err = []
        err += utils.QM((len(retryOK) > 0) and (len(modOK) == 0), [], ['have hit their maximum number of retries'])
        err += utils.QM((len(retryOK) == 0) and (len(modOK) > 0), [], ['are vetoed by the task module'])
        self._log_user_time.warning('All remaining jobs %s!',
            str.join(utils.QM(retryOK or modOK, ' or ', ' and '), err))
    self._showBlocker = not (len(readyList) > 0 and len(jobList) == 0)

    # Determine number of jobs to submit
    submit = len(jobList)
    if self._njobs_inqueue > 0:
        submit = min(submit, self._njobs_inqueue - self.jobDB.getJobsN(ClassSelector(JobClass.ATWMS)))
    if self._njobs_inflight > 0:
        submit = min(submit, self._njobs_inflight - self.jobDB.getJobsN(ClassSelector(JobClass.PROCESSING)))
    if self._continuous and (maxsample > 0):
        submit = min(submit, maxsample)
    submit = max(submit, 0)

    if self._do_shuffle:
        return self._sample(jobList, submit)
    return sorted(jobList)[:submit]
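# utils.QM(cond, a, b) is grid-control's ternary helper: it returns a when cond is
# true, else b. A minimal standalone sketch of how the blocker warning above
# assembles its message, with QM inlined for illustration (the stand-in QM and the
# sample values below are not part of the original code):
def QM(cond, a, b):  # simplified stand-in for utils.QM
    if cond:
        return a
    return b

retryOK, modOK = [], []  # neither retry-eligible nor task-approved jobs remain
err = []
err += QM((len(retryOK) > 0) and (len(modOK) == 0), [], ['have hit their maximum number of retries'])
err += QM((len(retryOK) == 0) and (len(modOK) > 0), [], ['are vetoed by the task module'])
print('All remaining jobs %s!' % str.join(QM(retryOK or modOK, ' or ', ' and '), err))
# -> All remaining jobs have hit their maximum number of retries and are vetoed by the task module!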
def getSubmissionJobs(self, maxsample, static = {'showBlocker': True}):
    # Get list of submittable jobs
    readyList = self.jobDB.getJobs(ClassSelector(JobClass.READY))
    retryOK = readyList
    defaultJob = Job()
    if self.maxRetry >= 0:
        retryOK = filter(lambda x: self.jobDB.get(x, defaultJob).attempt - 1 < self.maxRetry, readyList)
    modOK = filter(self._task.canSubmit, readyList)
    jobList = set.intersection(set(retryOK), set(modOK))

    if static['showBlocker'] and len(readyList) > 0 and len(jobList) == 0:  # No submission but ready jobs
        err = []
        err += utils.QM(len(retryOK) > 0 and len(modOK) == 0, [], ['have hit their maximum number of retries'])
        err += utils.QM(len(retryOK) == 0 and len(modOK) > 0, [], ['are vetoed by the task module'])
        utils.vprint('All remaining jobs %s!' % str.join(utils.QM(retryOK or modOK, ' or ', ' and '), err), -1, True)
    static['showBlocker'] = not (len(readyList) > 0 and len(jobList) == 0)

    # Determine number of jobs to submit
    submit = len(jobList)
    if self.inQueue > 0:
        submit = min(submit, self.inQueue - self.jobDB.getJobsN(ClassSelector(JobClass.ATWMS)))
    if self.inFlight > 0:
        submit = min(submit, self.inFlight - self.jobDB.getJobsN(ClassSelector(JobClass.PROCESSING)))
    if self.continuous:
        submit = min(submit, maxsample)
    submit = max(submit, 0)

    if self.doShuffle:
        return self.sample(jobList, submit)
    else:
        return sorted(jobList)[:submit]
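# Note on the 'static' keyword argument in the variant above: Python evaluates
# default argument values once, at function definition time, so the shared dict
# persists across calls and emulates a C-style static variable. The newer
# _getSubmissionJobs variants replace it with the self._showBlocker attribute.
# A minimal standalone sketch of the idiom (names here are illustrative only):
def _warn_once(message, static={'armed': True}):
    if static['armed']:
        print('WARNING: %s' % message)
    static['armed'] = False  # the mutation survives into the next call

_warn_once('no submittable jobs')  # prints the warning
_warn_once('no submittable jobs')  # silent - the default dict remembered the state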
def _getSubmissionJobs(self, maxsample):
    # Get list of submittable jobs
    readyList = self.jobDB.getJobs(ClassSelector(JobClass.READY))
    retryOK = readyList
    defaultJob = Job()
    if self._job_retries >= 0:
        retryOK = lfilter(lambda x: self.jobDB.get(x, defaultJob).attempt - 1 < self._job_retries, readyList)
    modOK = lfilter(self._task.canSubmit, readyList)
    jobList = set.intersection(set(retryOK), set(modOK))

    if self._showBlocker and readyList and not jobList:  # No submission but ready jobs
        err = []
        err += utils.QM((len(retryOK) > 0) and (len(modOK) == 0), [], ['have hit their maximum number of retries'])
        err += utils.QM((len(retryOK) == 0) and (len(modOK) > 0), [], ['are vetoed by the task module'])
        self._log_user_time.warning('All remaining jobs %s!',
            str.join(utils.QM(retryOK or modOK, ' or ', ' and '), err))
    self._showBlocker = not (len(readyList) > 0 and len(jobList) == 0)

    # Determine number of jobs to submit
    submit = len(jobList)
    if self._njobs_inqueue > 0:
        submit = min(submit, self._njobs_inqueue - self.jobDB.getJobsN(ClassSelector(JobClass.ATWMS)))
    if self._njobs_inflight > 0:
        submit = min(submit, self._njobs_inflight - self.jobDB.getJobsN(ClassSelector(JobClass.PROCESSING)))
    if self._chunks_enabled and (maxsample > 0):
        submit = min(submit, maxsample)
    submit = max(submit, 0)

    if self._do_shuffle:
        return self._sample(jobList, submit)
    return sorted(jobList)[:submit]
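# All three variants above share the same candidate selection: a job must be both
# below the retry limit and accepted by the task module, and the number actually
# submitted is capped by the free queue/flight slots. A minimal self-contained
# sketch of that logic, assuming plain dicts in place of the jobDB and task
# objects (all names and values here are illustrative, not grid-control API):
def select_submission_jobs(attempts, can_submit, max_retry, slots_free):
    # attempts: job id -> number of previous attempts; can_submit: job id -> bool
    retry_ok = set(jid for (jid, n) in attempts.items() if max_retry < 0 or n < max_retry)
    mod_ok = set(jid for jid in attempts if can_submit[jid])
    candidates = retry_ok & mod_ok  # both filters must pass
    return sorted(candidates)[:max(slots_free, 0)]

attempts = {1: 0, 2: 3, 3: 1}
can_submit = {1: True, 2: True, 3: False}
print(select_submission_jobs(attempts, can_submit, max_retry=2, slots_free=5))  # [1]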
# Stdlib imports used below; the remaining helpers (utils, getConfig, FileMutex,
# DataProvider, DBSApiv2, hasDataset, APIError, NullHandler, DBS3LiteClient,
# DBS3MigrationQueue, MigrationTask, AlreadyQueued, do_migration and
# generateDBS3BlockDumps) come from the surrounding grid-control package.
import logging, operator, optparse, os, sys, time
from hashlib import md5

def main():
    usage = '%s [OPTIONS] <config file / work directory>' % sys.argv[0]
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-G', '--globaltag', dest='globaltag', default='crab2_tag', help='Specify global tag')
    parser.add_option('-F', '--input', dest='inputFile', default=None,
        help='Specify dbs input file to use instead of scanning job output')
#    parser.add_option('-k', '--key-select', dest='dataset key select', default='',
#        help='Specify dataset keys to process')
    parser.add_option('-c', '--continue-migration', dest='continue_migration', default=False,
        action='store_true', help='Continue an already started migration')

    ogDiscover = optparse.OptionGroup(parser, 'Discovery options - ignored in case dbs input file is specified', '')
    ogDiscover.add_option('-n', '--name', dest='dataset name pattern', default='',
        help='Specify dbs path name - Example: DataSet_@NICK@_@VAR@')
    ogDiscover.add_option('-T', '--datatype', dest='datatype', default=None,
        help='Supply dataset type in case cmssw report did not specify it - valid values: "mc" or "data"')
    ogDiscover.add_option('-m', '--merge', dest='merge parents', default=False, action='store_true',
        help='Merge output files from different parent blocks into a single block [Default: Keep boundaries]')
    ogDiscover.add_option('-j', '--jobhash', dest='useJobHash', default=False, action='store_true',
        help='Use hash of all config files in job for dataset key calculation')
    ogDiscover.add_option('-u', '--unique-cfg', dest='uniqueCfg', default=False, action='store_true',
        help='Circumvent edmConfigHash collisions so each dataset is stored with unique config information')
    ogDiscover.add_option('-P', '--parent', dest='parent source', default='',
        help='Override parent information source - to bootstrap a reprocessing on local files')
    ogDiscover.add_option('-H', '--hash-keys', dest='dataset hash keys', default='',
        help='Include additional variables in dataset hash calculation')
    parser.add_option_group(ogDiscover)

    ogDiscover2 = optparse.OptionGroup(parser, 'Discovery options II - only available when config file is used', '')
    ogDiscover2.add_option('-J', '--job-selector', dest='selected', default=None, help='Specify dataset(s) to process')
    parser.add_option_group(ogDiscover2)

    ogMode = optparse.OptionGroup(parser, 'Processing mode', '')
    ogMode.add_option('-b', '--batch', dest='batch', default=False, action='store_true',
        help='Enable non-interactive batch mode [Default: Interactive mode]')
    ogMode.add_option('-d', '--discovery', dest='discovery', default=False, action='store_true',
        help='Enable discovery mode - just collect file information and exit')
    ogMode.add_option('', '--tempdir', dest='tmpDir', default='', help='Override temp directory')
    ogMode.add_option('-i', '--no-import', dest='doImport', default=True, action='store_false',
        help='Disable import of new datasets into target DBS instance - only temporary xml files are created, ' +
            'which can be added later via datasetDBSTool.py [Default: Import datasets]')
    parser.add_option_group(ogMode)

    ogInc = optparse.OptionGroup(parser, 'Incremental adding of files to DBS', '')
    ogInc.add_option('-I', '--incremental', dest='incremental', default=False, action='store_true',
        help='Skip import of existing files - Warning: this destroys coherent block structure!')
#    ogInc.add_option('-o', '--open-blocks', dest='closeBlock', default=True, action='store_false',
#        help='Keep blocks open for addition of further files [Default: Close blocks]')
    parser.add_option_group(ogInc)

    ogInst = optparse.OptionGroup(parser, 'DBS instance handling', '')
    ogInst.add_option('-t', '--target-instance', dest='dbsTarget', default='https://cmsweb.cern.ch/dbs/prod/phys03',
        help='Specify target dbs instance url')
    ogInst.add_option('-s', '--source-instance', dest='dbsSource', default='https://cmsweb.cern.ch/dbs/prod/global',
        help='Specify source dbs instance url(s), where parent datasets are taken from')
    parser.add_option_group(ogInst)

    ogDbg = optparse.OptionGroup(parser, 'Display options', '')
    ogDbg.add_option('-D', '--display-dataset', dest='display_data', default=None,
        help='Display information associated with dataset key(s) (accepts "all")')
    ogDbg.add_option('-C', '--display-config', dest='display_cfg', default=None,
        help='Display information associated with config hash(es) (accepts "all")')
    ogDbg.add_option('-v', '--verbose', dest='verbosity', default=0, action='count', help='Increase verbosity')
    parser.add_option_group(ogDbg)

    (opts, args) = parser.parse_args()
    utils.verbosity(opts.verbosity)
    setattr(opts, 'include parent infos', True)
    setattr(opts, 'importLumi', True)
    setattr(opts, 'closeBlock', True)  # the '-o / --open-blocks' option above is disabled, so keep blocks closed
    setattr(opts, 'dataset hash keys', getattr(opts, 'dataset hash keys').replace(',', ' '))
    if opts.useJobHash:
        setattr(opts, 'dataset hash keys', getattr(opts, 'dataset hash keys') + ' CMSSW_CONFIG_JOBHASH')

    # 0) Get work directory, create dbs dump directory
    if len(args) != 1:
        utils.exitWithUsage(usage, 'Neither work directory nor config file specified!')
    if os.path.isdir(args[0]):
        opts.workDir = os.path.abspath(os.path.normpath(args[0]))
    else:
        opts.workDir = getConfig(configFile=args[0]).getWorkPath()
    if not opts.tmpDir:
        opts.tmpDir = os.path.join(opts.workDir, 'dbs')
    if not os.path.exists(opts.tmpDir):
        os.mkdir(opts.tmpDir)
    # Lock file in case several instances of this program are running
    mutex = FileMutex(os.path.join(opts.tmpDir, 'datasetDBSAdd.lock'))

    # 1) Get dataset information
    if opts.inputFile:
        provider = DataProvider.getInstance('ListProvider', getConfig(), opts.inputFile, None)
    else:
        config = getConfig(configDict = {'dataset': dict(parser.values.__dict__)})
        if opts.discovery:
            config.set('dataset name pattern', '@DS_KEY@')
        provider = DataProvider.getInstance('DBSInfoProvider', config, args[0], None)

    provider.saveState(os.path.join(opts.tmpDir, 'dbs.dat'))
    if opts.discovery:
        sys.exit(os.EX_OK)
    blocks = provider.getBlocks()

    # 2) Filter datasets
    if opts.incremental:
        # Query target DBS for all found datasets and perform dataset resync with "supposed" state
        dNames = set(map(lambda b: b[DataProvider.Dataset], blocks))
        dNames = filter(lambda ds: hasDataset(opts.dbsTarget, ds), dNames)
        config = getConfig(configDict = {None: {'dbs instance': opts.dbsTarget}})
        oldBlocks = reduce(operator.add, map(lambda ds: DBSApiv2(config, None, ds, None).getBlocks(), dNames), [])
        (blocksAdded, blocksMissing, blocksChanged) = DataProvider.resyncSources(oldBlocks, blocks)
        if len(blocksMissing) or len(blocksChanged):
            if not utils.getUserBool(' * WARNING: Block structure has changed! Continue?', False):
                sys.exit(os.EX_OK)
        # Search for blocks which were partially added and generate "pseudo"-blocks with left over files
        setOldBlocks = set(map(lambda x: x[DataProvider.BlockName], oldBlocks))
        setAddedBlocks = set(map(lambda x: x[DataProvider.BlockName], blocksAdded))
        blockCollision = set.intersection(setOldBlocks, setAddedBlocks)
        if blockCollision and opts.closeBlock:  # Blocks are closed and contents have changed
            for block in blocksAdded:
                if block[DataProvider.BlockName] in blockCollision:
                    block[DataProvider.BlockName] = utils.strGuid(md5(str(time.time())).hexdigest())
        blocks = blocksAdded

    # 3) Display dataset properties
    if opts.display_data or opts.display_cfg:
        raise APIError('Not yet reimplemented')

    # Set up logging
    logging.basicConfig(format='%(levelname)s: %(message)s')
    logger = logging.getLogger('dbs3-migration')
    logger.addHandler(NullHandler())
    logger.setLevel(logging.DEBUG)

    # Set up dbs clients
    dbs3_target_client = DBS3LiteClient(url=opts.dbsTarget)
    dbs3_source_client = DBS3LiteClient(url=opts.dbsSource)
    dbs3_migration_queue = DBS3MigrationQueue()

    for blockDump in generateDBS3BlockDumps(opts, blocks):
        if not opts.continue_migration:
            # Initiate the dbs3 to dbs3 migration of parent blocks
            logger.debug('Checking parentage for block: %s' % blockDump['block']['block_name'])
            unique_parent_lfns = set(parent[u'parent_logical_file_name'] for parent in blockDump[u'file_parent_list'])
            unique_blocks = set(block['block_name'] for parent_lfn in unique_parent_lfns
                for block in dbs3_source_client.listBlocks(logical_file_name=parent_lfn))
            for block_to_migrate in unique_blocks:
                if dbs3_target_client.listBlocks(block_name=block_to_migrate):
                    # Block already at destination
                    logger.debug('Block %s is already at destination' % block_to_migrate)
                    continue
                migration_task = MigrationTask(block_name=block_to_migrate,
                    migration_url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader', dbs_client=dbs3_target_client)
                try:
                    dbs3_migration_queue.add_migration_task(migration_task)
                except AlreadyQueued as aq:
                    logger.debug(aq.message)
            dbs3_migration_queue.save_to_disk(os.path.join(opts.tmpDir, 'dbs3_migration.pkl'))
        else:
            try:
                dbs3_migration_queue = DBS3MigrationQueue.read_from_disk(os.path.join(opts.tmpDir, 'dbs3_migration.pkl'))
            except IOError as io_err:
                msg = "Probably, there is no DBS 3 migration for this dataset ongoing, Dude!"
                logger.exception('%s\n%s' % (io_err.message, msg))
                raise

        # Wait for all parent blocks to be migrated to dbs3
        do_migration(dbs3_migration_queue)
        # Insert block into dbs3
        dbs3_target_client.insertBulkBlock(blockDump)
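# Step 2) above renames partially imported blocks via
# utils.strGuid(md5(str(time.time())).hexdigest()), i.e. a fresh GUID-style name
# derived from the current time. A minimal sketch of what such a helper might
# look like (str_guid here is an assumed reimplementation for illustration, not
# the grid-control original):
import time
from hashlib import md5

def str_guid(digest):
    # Format a 32 character hex digest as an 8-4-4-4-12 GUID string
    return '%s-%s-%s-%s-%s' % (digest[:8], digest[8:12], digest[12:16], digest[16:20], digest[20:])

print(str_guid(md5(str(time.time()).encode()).hexdigest()))
# e.g. '3f2b8c1a-9d4e-4b0f-a1c2-0d5e6f7a8b9c'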