Example No. 1
def dataset_show_diff(options):
	if len(options.args) != 2:
		options.parser.exit_with_usage(options.parser.usage('data'))

	provider_a = DataProvider.load_from_file(options.args[0])
	provider_b = DataProvider.load_from_file(options.args[1])
	block_resync_tuple = DataProvider.resync_blocks(provider_a.get_block_list_cached(show_stats=False),
		provider_b.get_block_list_cached(show_stats=False))
	(block_list_added, block_list_missing, block_list_matching) = block_resync_tuple

	def _dataset_iter_matching_blocks():
		for (block_old, block_new, _, _) in block_list_matching:
			def _format_change(old, new):
				if old != new:
					return '%s -> %s' % (old, new)
				return old
			block_old[DataProvider.NFiles] = _format_change(len(block_old.get(DataProvider.FileList, [])),
				len(block_new.get(DataProvider.FileList, [])))
			block_old[DataProvider.NEntries] = _format_change(block_old[DataProvider.NEntries],
				block_new[DataProvider.NEntries])
			yield block_old

	header_list = [(DataProvider.Dataset, 'Dataset'), (DataProvider.BlockName, 'Block'),
		(DataProvider.NFiles, '#Files'), (DataProvider.NEntries, '#Entries')]
	if block_list_added:
		ConsoleTable.create(header_list, dataset_iter_blocks(block_list_added), title='Added blocks')
	if block_list_missing:
		ConsoleTable.create(header_list, dataset_iter_blocks(block_list_missing), title='Removed blocks')
	if block_list_matching:
		ConsoleTable.create(header_list, _dataset_iter_matching_blocks(), title='Matching blocks')
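The two block lists above are compared with DataProvider.resync_blocks, which yields added, missing and matching blocks. As a rough standalone illustration of that three-way diff (not grid-control's actual implementation), a plain-dict sketch with hypothetical 'dataset'/'block' keys could look like this:

def diff_block_lists(blocks_a, blocks_b):
	# key blocks by (dataset, block name), mirroring the header_list columns above
	key = lambda b: (b['dataset'], b['block'])
	map_a = dict((key(b), b) for b in blocks_a)
	map_b = dict((key(b), b) for b in blocks_b)
	added = [map_b[k] for k in map_b if k not in map_a]
	missing = [map_a[k] for k in map_a if k not in map_b]
	matching = [(map_a[k], map_b[k]) for k in map_a if k in map_b]
	return (added, missing, matching)

old = [{'dataset': '/A/B/C', 'block': 'b1', 'files': 3}]
new = [{'dataset': '/A/B/C', 'block': 'b1', 'files': 4},
	{'dataset': '/A/B/C', 'block': 'b2', 'files': 1}]
print(diff_block_lists(old, new))  # b2 added, nothing missing, b1 matching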
Example No. 2
def discover_blocks(options):
    # Get work directory, create dbs dump directory
    if os.path.isdir(options.args[0]):
        work_dn = os.path.abspath(os.path.normpath(options.args[0]))
    else:
        work_dn = gc_create_config(config_file=options.args[0]).get_work_path()
    if not options.opts.tempdir:
        options.opts.tempdir = os.path.join(work_dn, 'dbs')
    if not os.path.exists(options.opts.tempdir):
        os.mkdir(options.opts.tempdir)

    # get provider with dataset information
    config = gc_create_config(config_dict={'dataset': options.config_dict},
                              load_old_config=False)
    if options.opts.input_file:
        provider = DataProvider.create_instance('ListProvider', config,
                                                'dataset',
                                                options.opts.input_file)
    else:
        provider = DataProvider.create_instance('DBSInfoProvider', config,
                                                'dataset', options.args[0])

    blocks = provider.get_block_list_cached(show_stats=False)
    DataProvider.save_to_file(os.path.join(options.opts.tempdir, 'dbs.dat'),
                              blocks)
    if options.opts.discovery:
        sys.exit(os.EX_OK)
    return blocks
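The first half of discover_blocks only resolves where the DBS dump should go. A minimal standard-library sketch of that directory handling (the config-file branch is left out, and the work directory below is created with tempfile just for the demo):

import os
import tempfile

def resolve_dump_dir(work_dn, tempdir=None):
	# mirror the logic above: default the dump directory to <workdir>/dbs
	tempdir = tempdir or os.path.join(work_dn, 'dbs')
	if not os.path.exists(tempdir):
		os.mkdir(tempdir)
	return tempdir

work_dn = tempfile.mkdtemp()  # stand-in for a real grid-control work directory
print(resolve_dump_dir(work_dn))  # -> <work_dn>/dbs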
Example No. 3
	def __init__(self, config, datasetExpr, datasetNick = None):
		self._changeTrigger = triggerResync(['datasets', 'parameters'])
		self._lumi_filter = config.getLookup('lumi filter', {}, parser = parseLumiFilter, strfun = strLumi, onChange = self._changeTrigger)
		if not self._lumi_filter.empty():
			config.set('dataset processor', 'LumiDataProcessor', '+=')
		DataProvider.__init__(self, config, datasetExpr, datasetNick)
		# LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
		self._lumi_query = config.getBool('lumi metadata', not self._lumi_filter.empty(), onChange = self._changeTrigger)
		config.set('phedex sites matcher mode', 'shell', '?=')
		# PhEDEx blacklist: 'T1_*_Disk' nodes allow user jobs - other T1s don't!
		self._phedexFilter = config.getFilter('phedex sites', '-* T1_*_Disk T2_* T3_*',
			defaultMatcher = 'blackwhite', defaultFilter = 'strict', onChange = self._changeTrigger)
		self._onlyComplete = config.getBool('only complete sites', True, onChange = self._changeTrigger)
		self._locationFormat = config.getEnum('location format', CMSLocationFormat, CMSLocationFormat.hostname, onChange = self._changeTrigger)
		self._pjrc = JSONRestClient(url = 'https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas')
		self._sitedb = SiteDB()

		(self._datasetPath, self._datasetInstance, self._datasetBlock) = optSplit(datasetExpr, '@#')
		instance_default = config.get('dbs instance', '', onChange = self._changeTrigger)
		self._datasetInstance = self._datasetInstance or instance_default
		if not self._datasetInstance:
			self._datasetInstance = 'prod/global'
		elif '/' not in self._datasetInstance:
			self._datasetInstance = 'prod/%s' % self._datasetInstance
		self._datasetBlock = self._datasetBlock or 'all'
		self.onlyValid = config.getBool('only valid', True, onChange = self._changeTrigger)
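The '@' and '#' parts of the dataset expression select the DBS instance and a block; the constructor then falls back to 'prod/global' and 'all'. A simplified stand-in for that parsing and default resolution (this regex-based split is an assumption, not grid-control's optSplit):

import re

def parse_dataset_expr(expr):
	# assume the canonical 'path[@instance][#block]' ordering
	path, instance, block = re.match(r'^([^@#]*)(?:@([^#]*))?(?:#(.*))?$', expr).groups()
	instance = (instance or '').strip()
	if not instance:
		instance = 'prod/global'
	elif '/' not in instance:
		instance = 'prod/%s' % instance
	return (path.strip(), instance, (block or '').strip() or 'all')

print(parse_dataset_expr('/SingleMuon/Run2016B-v1/AOD@phys03'))
# -> ('/SingleMuon/Run2016B-v1/AOD', 'prod/phys03', 'all')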
Example No. 4
def discover_blocks(options):
    # Get work directory, create dbs dump directory
    if os.path.isdir(options.args[0]):
        workDir = os.path.abspath(os.path.normpath(options.args[0]))
    else:
        workDir = getConfig(configFile=options.args[0]).getWorkPath()
    if not options.opts.tempdir:
        options.opts.tempdir = os.path.join(workDir, 'dbs')
    if not os.path.exists(options.opts.tempdir):
        os.mkdir(options.opts.tempdir)

    # get provider with dataset information
    if options.opts.input_file:
        provider = DataProvider.createInstance('ListProvider', getConfig(),
                                               options.opts.input_file, None)
    else:
        config = getConfig(configDict={'dataset': options.config_dict})
        provider = DataProvider.createInstance('DBSInfoProvider', config,
                                               options.args[0], None)

    blocks = provider.getBlocks(show_stats=False)
    DataProvider.saveToFile(os.path.join(options.opts.tempdir, 'dbs.dat'),
                            blocks)
    if options.opts.discovery:
        sys.exit(os.EX_OK)
    return blocks
Example No. 5
	def __init__(self, dataDir, srcName, dataProvider, dataSplitter, dataProc, repository, keepOld = True):
		LimitedResyncParameterSource.__init__(self)
		(self._dn, self._name, self._data_provider, self._data_splitter, self._part_proc, self._keepOld) = \
			(dataDir, srcName, dataProvider, dataSplitter, dataProc, keepOld)
		repository['dataset:%s' % srcName] = self
		self.resyncSetup(interval = -1)

		if not dataProvider: # debug mode - used by scripts - disables resync
			self._maxN = self._data_splitter.getMaxJobs()
			return

		# look for aborted resyncs - and try to restore old state if possible
		if self._existsDataPath('cache.dat.resync') and self._existsDataPath('map.tar.resync'):
			utils.renameFile(self._getDataPath('cache.dat.resync'), self._getDataPath('cache.dat'))
			utils.renameFile(self._getDataPath('map.tar.resync'), self._getDataPath('map.tar'))
		elif self._existsDataPath('cache.dat.resync') or self._existsDataPath('map.tar.resync'):
			raise DatasetError('Found broken resync state')

		if self._existsDataPath('cache.dat') and self._existsDataPath('map.tar'):
			self._data_splitter.importPartitions(self._getDataPath('map.tar'))
		else:
			DataProvider.saveToFile(self._getDataPath('cache.dat'), self._data_provider.getBlocks(show_stats = False))
			self._data_splitter.splitDataset(self._getDataPath('map.tar'), self._data_provider.getBlocks(show_stats = False))

		self._maxN = self._data_splitter.getMaxJobs()
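The resync-recovery block above either promotes a complete pair of '.resync' files or refuses to continue on a half-written pair. The same pattern with plain os calls (directory and exception type are placeholders):

import os

def restore_resync_state(work_dn):
	def _path(name):
		return os.path.join(work_dn, name)
	have_cache = os.path.exists(_path('cache.dat.resync'))
	have_map = os.path.exists(_path('map.tar.resync'))
	if have_cache and have_map:  # complete pair left over -> promote it
		os.rename(_path('cache.dat.resync'), _path('cache.dat'))
		os.rename(_path('map.tar.resync'), _path('map.tar'))
	elif have_cache or have_map:  # only half a pair -> broken resync state
		raise RuntimeError('Found broken resync state')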
Example No. 6
	def __init__(self, config, datasource_name, dataset_expr, dataset_nick=None, dataset_proc=None):
		dataset_config = config.change_view(default_on_change=TriggerResync(['datasets', 'parameters']))
		self._lumi_filter = dataset_config.get_lookup(['lumi filter', '%s lumi filter' % datasource_name],
			default={}, parser=parse_lumi_filter, strfun=str_lumi)
		if not self._lumi_filter.empty():
			config.set('%s processor' % datasource_name, 'LumiDataProcessor', '+=')
		DataProvider.__init__(self, config, datasource_name, dataset_expr, dataset_nick, dataset_proc)
		# LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
		self._lumi_query = dataset_config.get_bool(
			['lumi metadata', '%s lumi metadata' % datasource_name], default=not self._lumi_filter.empty())
		config.set('phedex sites matcher mode', 'ShellStyleMatcher', '?=')
		# PhEDEx blacklist: 'T1_*_Disk' nodes allow user jobs - other T1s don't!
		self._phedex_filter = dataset_config.get_filter('phedex sites', '-* T1_*_Disk T2_* T3_*',
			default_matcher='BlackWhiteMatcher', default_filter='StrictListFilter')
		self._only_complete = dataset_config.get_bool('only complete sites', True)
		self._only_valid = dataset_config.get_bool('only valid', True)
		self._location_format = dataset_config.get_enum('location format',
			CMSLocationFormat, CMSLocationFormat.hostname)
		self._pjrc = JSONRestClient(url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas')
		self._sitedb = SiteDB()

		dataset_expr_parts = split_opt(dataset_expr, '@#')
		(self._dataset_path, self._dataset_instance, self._dataset_block_selector) = dataset_expr_parts
		instance_default = dataset_config.get('dbs instance', '')
		self._dataset_instance = self._dataset_instance or instance_default
		if not self._dataset_instance:
			self._dataset_instance = 'prod/global'
		elif '/' not in self._dataset_instance:
			self._dataset_instance = 'prod/%s' % self._dataset_instance
		self._dataset_block_selector = self._dataset_block_selector or 'all'
Example No. 7
    def __init__(self, config, datasetExpr, datasetNick=None, datasetID=0):
        self._lumi_filter = parseLumiFilter(config.get('lumi filter', ''))
        if self._lumi_filter:
            config.set('dataset processor', 'LumiDataProcessor', '+=')
        DataProvider.__init__(self, config, datasetExpr, datasetNick,
                              datasetID)
        # PhEDEx blacklist: 'T1_DE_KIT', 'T1_US_FNAL' and '*_Disk' allow user jobs - other T1s don't!
        self._lumi_query = config.getBool('lumi metadata',
                                          self._lumi_filter != [])
        self._phedexFilter = config.getFilter('phedex sites',
                                              '-T3_US_FNALLPC',
                                              defaultMatcher='blackwhite',
                                              defaultFilter='weak')
        self._phedexT1Filter = config.getFilter('phedex t1 accept',
                                                'T1_DE_KIT T1_US_FNAL',
                                                defaultMatcher='blackwhite',
                                                defaultFilter='weak')
        self._phedexT1Mode = config.get('phedex t1 mode', 'disk').lower()
        self.onlyComplete = config.getBool('only complete sites', True)
        self._locationFormat = config.getEnum('location format',
                                              CMSLocationFormat,
                                              CMSLocationFormat.hostname)

        (self._datasetPath, self._url,
         self._datasetBlock) = utils.optSplit(datasetExpr, '@#')
        self._url = self._url or config.get('dbs instance', '')
        self._datasetBlock = self._datasetBlock or 'all'
        self.onlyValid = config.getBool('only valid', True)
Example No. 8
def discover_blocks(options):
	# Get work directory, create dbs dump directory
	if os.path.isdir(options.args[0]):
		work_dn = os.path.abspath(os.path.normpath(options.args[0]))
	else:
		work_dn = gc_create_config(config_file=options.args[0]).get_work_path()
	if not options.opts.tempdir:
		options.opts.tempdir = os.path.join(work_dn, 'dbs')
	if not os.path.exists(options.opts.tempdir):
		os.mkdir(options.opts.tempdir)

	# get provider with dataset information
	config = gc_create_config(config_dict={'dataset': options.config_dict}, load_old_config=False)
	if options.opts.input_file:
		provider = DataProvider.create_instance('ListProvider',
			config, 'dataset', options.opts.input_file)
	else:
		provider = DataProvider.create_instance('DBSInfoProvider',
			config, 'dataset', options.args[0])

	blocks = provider.get_block_list_cached(show_stats=False)
	DataProvider.save_to_file(os.path.join(options.opts.tempdir, 'dbs.dat'), blocks)
	if options.opts.discovery:
		sys.exit(os.EX_OK)
	return blocks
Example No. 9
def save_dataset(opts, provider):
	print('')
	blocks = provider.getBlocks()
	if opts.ordered:
		sort_inplace(blocks, key = itemgetter(DataProvider.Dataset, DataProvider.BlockName))
		for b in blocks:
			sort_inplace(b[DataProvider.FileList], key = itemgetter(DataProvider.URL))
	DataProvider.saveToFile(opts.save, blocks)
	print('Dataset information saved to ./%s' % opts.save)
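The --ordered option just sorts blocks and their file lists before saving. The same sort on plain dicts (string keys here stand in for the DataProvider enum keys):

from operator import itemgetter

blocks = [
	{'Dataset': '/B', 'BlockName': 'b2', 'FileList': [{'URL': 'f2'}, {'URL': 'f1'}]},
	{'Dataset': '/A', 'BlockName': 'b1', 'FileList': [{'URL': 'f3'}]},
]
blocks.sort(key=itemgetter('Dataset', 'BlockName'))
for block in blocks:
	block['FileList'].sort(key=itemgetter('URL'))
print([(b['Dataset'], b['BlockName']) for b in blocks])  # [('/A', 'b1'), ('/B', 'b2')]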
Example No. 10
	def _check_lumi_filter(self, block, idx_runs, idx_lumi):
		lumi_filter = self._lumi_filter.lookup(block[DataProvider.Nickname], is_selector=False)
		if not lumi_filter:
			return
		if (self._lumi_strict == LumiMode.strict) and ((idx_runs is None) or (idx_lumi is None)):
			raise DatasetError('Strict lumi filter active but ' +
				'dataset %s does not provide lumi information!' % DataProvider.get_block_id(block))
		elif (self._lumi_strict == LumiMode.weak) and (idx_runs is None):
			raise DatasetError('Weak lumi filter active but ' +
				'dataset %s does not provide run information!' % DataProvider.get_block_id(block))
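In strict mode both run and lumi columns must be present; in weak mode only the run column. A self-contained sketch of that check (LumiMode and the exception type are stand-ins):

class LumiMode:
	strict = 'strict'
	weak = 'weak'

def check_lumi_info(mode, idx_runs, idx_lumi, block_id='<dataset#block>'):
	if (mode == LumiMode.strict) and ((idx_runs is None) or (idx_lumi is None)):
		raise ValueError('Strict lumi filter active but %s does not provide lumi information!' % block_id)
	if (mode == LumiMode.weak) and (idx_runs is None):
		raise ValueError('Weak lumi filter active but %s does not provide run information!' % block_id)

check_lumi_info(LumiMode.weak, idx_runs=0, idx_lumi=None)  # passes: run metadata is enough in weak mode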
Example No. 11
	def _check_lumi_filter(self, block, idx_runs, idx_lumi):
		lumi_filter = self._lumi_filter.lookup(block[DataProvider.Nickname], is_selector=False)
		if not lumi_filter:
			return
		if (self._lumi_strict == LumiMode.strict) and ((idx_runs is None) or (idx_lumi is None)):
			raise DatasetError('Strict lumi filter active but ' +
				'dataset %s does not provide lumi information!' % DataProvider.get_block_id(block))
		elif (self._lumi_strict == LumiMode.weak) and (idx_runs is None):
			raise DatasetError('Weak lumi filter active but ' +
				'dataset %s does not provide run information!' % DataProvider.get_block_id(block))
Example No. 12
    def __init__(self,
                 config,
                 datasource_name,
                 dataset_expr,
                 dataset_nick=None,
                 dataset_proc=None):
        dataset_config = config.change_view(
            default_on_change=TriggerResync(['datasets', 'parameters']))
        self._lumi_filter = dataset_config.get_lookup(
            ['lumi filter', '%s lumi filter' % datasource_name],
            default={},
            parser=parse_lumi_filter,
            strfun=str_lumi)
        if not self._lumi_filter.empty():
            config.set('%s processor' % datasource_name, 'LumiDataProcessor',
                       '+=')
        DataProvider.__init__(self, config, datasource_name, dataset_expr,
                              dataset_nick, dataset_proc)
        # LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
        self._lumi_query = dataset_config.get_bool(
            ['lumi metadata',
             '%s lumi metadata' % datasource_name],
            default=not self._lumi_filter.empty())
        config.set('phedex sites matcher mode', 'ShellStyleMatcher', '?=')
        # PhEDEx blacklist: 'T1_*_Disk' nodes allow user jobs - other T1s don't!
        self._phedex_filter = dataset_config.get_filter(
            'phedex sites',
            '-* T1_*_Disk T2_* T3_*',
            default_matcher='BlackWhiteMatcher',
            default_filter='StrictListFilter')
        self._only_complete = dataset_config.get_bool('only complete sites',
                                                      True)
        self._only_valid = dataset_config.get_bool('only valid', True)
        self._allow_phedex = dataset_config.get_bool('allow phedex', True)
        self._location_format = dataset_config.get_enum(
            'location format', CMSLocationFormat, CMSLocationFormat.hostname)
        self._pjrc = JSONRestClient(
            url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas'
        )
        self._sitedb = SiteDB()

        dataset_expr_parts = split_opt(dataset_expr, '@#')
        (self._dataset_path, self._dataset_instance,
         self._dataset_block_selector) = dataset_expr_parts
        instance_default = dataset_config.get('dbs instance', '')
        self._dataset_instance = self._dataset_instance or instance_default
        if not self._dataset_instance:
            self._dataset_instance = 'prod/global'
        elif '/' not in self._dataset_instance:
            self._dataset_instance = 'prod/%s' % self._dataset_instance
        self._dataset_block_selector = self._dataset_block_selector or 'all'
Example No. 13
	def __init__(self, dataDir, srcName, dataProvider, dataSplitter, dataProc, keepOld = True):
		ParameterSource.__init__(self)
		(self._dataDir, self._srcName, self._dataProvider, self._dataSplitter, self._part_proc) = \
			(dataDir, srcName, dataProvider, dataSplitter, dataProc)

		if not dataProvider:
			pass # debug mode - used by scripts - disables resync
		elif os.path.exists(self.getDataPath('cache.dat')) and os.path.exists(self.getDataPath('map.tar')):
			self._dataSplitter.importPartitions(self.getDataPath('map.tar'))
		else:
			DataProvider.saveToFile(self.getDataPath('cache.dat'), self._dataProvider.getBlocks(silent = False))
			self._dataSplitter.splitDataset(self.getDataPath('map.tar'), self._dataProvider.getBlocks())

		self._maxN = self._dataSplitter.getMaxJobs()
		self._keepOld = keepOld
Example No. 14
def create_dbs3_json_blocks(opts, dataset_blocks):
    dbs3_proto_block_iter = create_dbs3_proto_blocks(opts, dataset_blocks)
    for (block, block_dump, block_size, dataset_type) in dbs3_proto_block_iter:
        dataset = block[DataProvider.Dataset]
        try:
            primary_dataset, processed_dataset, data_tier = dataset[1:].split(
                '/')
        except Exception:
            raise DatasetError('Dataset name %s is not a valid DBS name!' %
                               dataset)

        # add primary dataset information
        block_dump['primds'] = {
            'primary_ds_type': dataset_type,
            'primary_ds_name': primary_dataset
        }

        # add dataset information
        block_dump['dataset'] = {
            'dataset': dataset,
            'processed_ds_name': processed_dataset,
            'data_tier_name': data_tier,
            'physics_group_name': None,
            'dataset_access_type': 'VALID',
            'xtcrosssection':
            None,  # TODO: Add to metadata from FrameWorkJobReport, if possible!
        }

        # add block information
        site_db = CRIC()
        try:
            origin_site_name = site_db.se_to_cms_name(
                block[DataProvider.Locations][0])[0]
        except IndexError:
            clear_current_exception()
            origin_site_name = 'UNKNOWN'

        block_dump['block'] = {
            'block_name': DataProvider.get_block_id(block),
            'block_size': block_size,
            'file_count': len(block[DataProvider.FileList]),
            'origin_site_name': origin_site_name
        }
        if opts.do_close_blocks:
            block_dump['block']['open_for_writing'] = 0
        else:
            block_dump['block']['open_for_writing'] = 1

        # add acquisition_era; the value 'CRAB' is important because of checks within DBS 3
        block_dump['acquisition_era'] = {
            'acquisition_era_name': 'CRAB',
            'start_date': 0
        }
        # add processing_era
        block_dump['processing_era'] = {
            'processing_version': 1,
            'description': 'grid-control'
        }

        yield validate_dbs3_json('blockBulk', block_dump)
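A DBS dataset name must have exactly three '/'-separated parts; that is all the try/except around dataset[1:].split('/') checks. For example (hypothetical dataset name):

dataset = '/SingleMuon/Run2016B-v1/MINIAOD'
try:
	primary_dataset, processed_dataset, data_tier = dataset[1:].split('/')
except ValueError:
	raise ValueError('Dataset name %s is not a valid DBS name!' % dataset)
print(primary_dataset, processed_dataset, data_tier)  # SingleMuon Run2016B-v1 MINIAOD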
Example No. 15
 def _displaySetup(self, dsPath, head):
     if os.path.exists(dsPath):
         nickNames = set()
         for block in DataProvider.loadFromFile(dsPath).getBlocks():
             nickNames.add(block[DataProvider.Nickname])
         utils.vprint('Mapping between nickname and other settings:\n', -1)
         report = []
         for nick in sorted(nickNames):
             lumi_filter_str = formatLumi(
                 self._nmLumi.lookup(nick, '', is_selector=False))
             if len(lumi_filter_str) > 4:
                 nice_lumi_filter = '%s ... %s (%d entries)' % (
                     lumi_filter_str[0], lumi_filter_str[-1],
                     len(lumi_filter_str))
             else:
                 nice_lumi_filter = str.join(', ', lumi_filter_str)
             config_files = self._nmCfg.lookup(nick, '', is_selector=False)
             tmp = {
                 0: nick,
                 1: str.join(', ', imap(os.path.basename, config_files)),
                 2: nice_lumi_filter
             }
             lookupvars = {'DATASETNICK': nick}
             for src in self._pm.lookupSources:
                 src.fillParameterInfo(None, lookupvars)
             tmp.update(lookupvars)
             report.append(tmp)
         utils.printTabular(head, report, 'cl')
         utils.vprint(level=-1)
Example No. 16
	def _resync(self):
		if self._data_provider:
			activity = Activity('Performing resync of datasource %r' % self._name)
			# Get old and new dataset information
			ds_old = DataProvider.loadFromFile(self._getDataPath('cache.dat')).getBlocks(show_stats = False)
			self._data_provider.clearCache()
			ds_new = self._data_provider.getBlocks(show_stats = False)
			self._data_provider.saveToFile(self._getDataPath('cache-new.dat'), ds_new)

			# Use old splitting information to synchronize with new dataset infos
			old_maxN = self._data_splitter.getMaxJobs()
			jobChanges = self._data_splitter.resyncMapping(self._getDataPath('map-new.tar'), ds_old, ds_new)
			activity.finish()
			if jobChanges is not None:
				# Move current splitting to backup and use the new splitting from now on
				def backupRename(old, cur, new):
					if self._keepOld:
						os.rename(self._getDataPath(cur), self._getDataPath(old))
					os.rename(self._getDataPath(new), self._getDataPath(cur))
				backupRename(  'map-old-%d.tar' % time.time(),   'map.tar',   'map-new.tar')
				backupRename('cache-old-%d.dat' % time.time(), 'cache.dat', 'cache-new.dat')
				self._data_splitter.importPartitions(self._getDataPath('map.tar'))
				self._maxN = self._data_splitter.getMaxJobs()
				self._log.debug('Dataset resync finished: %d -> %d partitions', old_maxN, self._maxN)
				return (set(jobChanges[0]), set(jobChanges[1]), old_maxN != self._maxN)
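backupRename keeps the previous map/cache as timestamped backups before promoting the freshly written files. The same idea as a standalone helper (paths are hypothetical):

import os
import time

def promote_with_backup(work_dn, new_name, cur_name, keep_old=True):
	if keep_old:  # e.g. map.tar -> map-old-1700000000.tar
		(stem, ext) = os.path.splitext(cur_name)
		backup_name = '%s-old-%d%s' % (stem, time.time(), ext)
		os.rename(os.path.join(work_dn, cur_name), os.path.join(work_dn, backup_name))
	os.rename(os.path.join(work_dn, new_name), os.path.join(work_dn, cur_name))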
Example No. 17
def create_dbs3_proto_blocks(opts, dataset_blocks):
	for dataset in dataset_blocks:
		missing_info_blocks = []
		dataset_types = set()
		for block in dataset_blocks[dataset]:
			block_dump = {'dataset_conf_list': [], 'files': [], 'file_conf_list': [], 'file_parent_list': []}
			(block_size, block_dataset_types) = create_dbs3_json_files(opts, block, block_dump)
			if len(block_dataset_types) > 1:
				raise Exception('Data and MC files are mixed in block %s' % DataProvider.bName(block))
			elif len(block_dataset_types) == 1:
				yield (block, block_dump, block_size, block_dataset_types.pop())
			else:
				missing_info_blocks.append((block, block_dump, block_size))
			dataset_types.update(block_dataset_types) # collect dataset types in this dataset for blocks with missing type information

		if missing_info_blocks:
			if len(dataset_types) > 1:
				raise Exception('Data and MC files are mixed in dataset %s! Unable to determine dataset type for blocks without type info' % dataset)
			elif len(dataset_types) == 0:
				if not opts.datatype:
					raise Exception('Please supply dataset type via --datatype!')
				dataset_type = opts.datatype
			else:
				dataset_type = dataset_types.pop()
			for (block, block_dump, block_size) in missing_info_blocks:
				yield (block, block_dump, block_size, dataset_type)
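Blocks without type information inherit the single type found elsewhere in the dataset, or fall back to an explicit --datatype. That resolution step in isolation (fallback stands in for opts.datatype):

def resolve_dataset_type(dataset_types, fallback=None):
	if len(dataset_types) > 1:
		raise ValueError('Data and MC files are mixed - unable to pick a single dataset type!')
	if not dataset_types:
		if not fallback:
			raise ValueError('Please supply dataset type via --datatype!')
		return fallback
	return dataset_types.pop()

print(resolve_dataset_type(set(['mc'])))    # -> 'mc'
print(resolve_dataset_type(set(), 'data'))  # -> 'data'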
Example No. 18
def create_dbs3_proto_blocks(opts, dataset_blocks):
	for dataset in dataset_blocks:
		missing_info_blocks = []
		dataset_types = set()
		for block in dataset_blocks[dataset]:
			block_dump = {'dataset_conf_list': [], 'files': [], 'file_conf_list': [], 'file_parent_list': []}
			(block_size, block_dataset_types) = create_dbs3_json_files(opts, block, block_dump)
			if len(block_dataset_types) > 1:
				raise Exception('Data and MC files are mixed in block %s' % DataProvider.get_block_id(block))
			elif len(block_dataset_types) == 1:
				yield (block, block_dump, block_size, block_dataset_types.pop())
			else:
				missing_info_blocks.append((block, block_dump, block_size))
			# collect dataset types in this dataset for blocks with missing type information
			dataset_types.update(block_dataset_types)

		if missing_info_blocks:
			if len(dataset_types) > 1:
				raise Exception(('Data and MC files are mixed in dataset %s! ' +
					'Unable to determine dataset type for blocks without type info') % dataset)
			elif len(dataset_types) == 0:
				if not opts.datatype:
					raise Exception('Please supply dataset type via --datatype!')
				dataset_type = opts.datatype
			else:
				dataset_type = dataset_types.pop()
			for (block, block_dump, block_size) in missing_info_blocks:
				yield (block, block_dump, block_size, dataset_type)
Example No. 19
	def _resync_psrc(self):
		activity = Activity('Performing resync of datasource %r' % self.get_datasource_name())
		# Get old and new dataset information
		provider_old = DataProvider.load_from_file(self._get_data_path('cache.dat'))
		block_list_old = provider_old.get_block_list_cached(show_stats=False)
		self._provider.clear_cache()
		block_list_new = self._provider.get_block_list_cached(show_stats=False)
		self._provider.save_to_file(self._get_data_path('cache-new.dat'), block_list_new)

		# Use old splitting information to synchronize with new dataset infos
		partition_len_old = self.get_parameter_len()
		partition_changes = self._resync_partitions(
			self._get_data_path('map-new.tar'), block_list_old, block_list_new)
		activity.finish()
		if partition_changes is not None:
			# Move current splitting to backup and use the new splitting from now on
			def _rename_with_backup(new, cur, old):
				if self._keep_old:
					os.rename(self._get_data_path(cur), self._get_data_path(old))
				os.rename(self._get_data_path(new), self._get_data_path(cur))
			_rename_with_backup('map-new.tar', 'map.tar', 'map-old-%d.tar' % time.time())
			_rename_with_backup('cache-new.dat', 'cache.dat', 'cache-old-%d.dat' % time.time())
			self._set_reader(DataSplitter.load_partitions(self._get_data_path('map.tar')))
			self._log.debug('Dataset resync finished: %d -> %d partitions', partition_len_old, self._len)
			(pnum_list_redo, pnum_list_disable) = partition_changes
			return (set(pnum_list_redo), set(pnum_list_disable), partition_len_old != self._len)
Example No. 20
    def _display_setup(self, dataset_fn, head):
        if os.path.exists(dataset_fn):
            nick_name_set = set()
            for block in DataProvider.load_from_file(
                    dataset_fn).get_block_list_cached(show_stats=False):
                nick_name_set.add(block[DataProvider.Nickname])
            self._log.info('Mapping between nickname and other settings:')
            report = []

            def _get_dataset_lookup_psrc(psrc):
                is_lookup_cls = isinstance(
                    psrc,
                    ParameterSource.get_class('LookupBaseParameterSource'))
                return is_lookup_cls and ('DATASETNICK'
                                          in psrc.get_parameter_deps())

            ps_lookup = lfilter(_get_dataset_lookup_psrc,
                                self._source.get_used_psrc_list())
            for nick in sorted(nick_name_set):
                tmp = {'DATASETNICK': nick}
                for src in ps_lookup:
                    src.fill_parameter_content(None, tmp)
                tmp[1] = str.join(
                    ', ',
                    imap(os.path.basename,
                         self._nm_cfg.lookup(nick, '', is_selector=False)))
                tmp[2] = str_lumi_nice(
                    self._nm_lumi.lookup(nick, '', is_selector=False))
                report.append(tmp)
            ConsoleTable.create(head, report, 'cl')
Example No. 21
 def _displaySetup(self, dsPath, head):
     if os.path.exists(dsPath):
         nickNames = set()
         for block in DataProvider.loadFromFile(dsPath).getBlocks():
             nickNames.add(block[DataProvider.Nickname])
         log = logging.getLogger('user')
         log.info('Mapping between nickname and other settings:')
         report = []
         (ps_basic, ps_nested) = self._pfactory.getLookupSources()
         if ps_nested:
             log.info(
                 'This list doesn\'t show "nickname constants" with multiple values!'
             )
         for nick in sorted(nickNames):
             tmp = {'DATASETNICK': nick}
             for src in ps_basic:
                 src.fillParameterInfo(None, tmp)
             tmp[1] = str.join(
                 ', ',
                 imap(os.path.basename,
                      self._nmCfg.lookup(nick, '', is_selector=False)))
             tmp[2] = formatLumiNice(
                 self._nmLumi.lookup(nick, '', is_selector=False))
             report.append(tmp)
         utils.printTabular(head, report, 'cl')
Example No. 22
	def resync(self):
		(result_redo, result_disable, result_sizeChange) = ParameterSource.resync(self)
		if self.resyncEnabled() and self._dataProvider:
			# Get old and new dataset information
			old = DataProvider.loadFromFile(self.getDataPath('cache.dat')).getBlocks()
			self._dataProvider.clearCache()
			new = self._dataProvider.getBlocks()
			self._dataProvider.saveToFile(self.getDataPath('cache-new.dat'), new)

			# Use old splitting information to synchronize with new dataset infos
			jobChanges = self._dataSplitter.resyncMapping(self.getDataPath('map-new.tar'), old, new)
			if jobChanges:
				# Move current splitting to backup and use the new splitting from now on
				def backupRename(old, cur, new):
					if self._keepOld:
						os.rename(self.getDataPath(cur), self.getDataPath(old))
					os.rename(self.getDataPath(new), self.getDataPath(cur))
				backupRename(  'map-old-%d.tar' % time.time(),   'map.tar',   'map-new.tar')
				backupRename('cache-old-%d.dat' % time.time(), 'cache.dat', 'cache-new.dat')
				old_maxN = self._dataSplitter.getMaxJobs()
				self._dataSplitter.importPartitions(self.getDataPath('map.tar'))
				self._maxN = self._dataSplitter.getMaxJobs()
				result_redo.update(jobChanges[0])
				result_disable.update(jobChanges[1])
				result_sizeChange = result_sizeChange or (old_maxN != self._maxN)
			self.resyncFinished()
		return (result_redo, result_disable, result_sizeChange)
Example No. 23
 def getEntries(self, path, metadata, events, seList, objStore):
     datacachePath = os.path.join(objStore.get('GC_WORKDIR', ''),
                                  'datacache.dat')
     source = utils.QM((self._source == '')
                       and os.path.exists(datacachePath), datacachePath,
                       self._source)
     if source and (source not in self._lfnMap):
         pSource = DataProvider.createInstance('ListProvider',
                                               createConfig(), source)
         for (n, fl) in imap(
                 lambda b:
             (b[DataProvider.Dataset], b[DataProvider.FileList]),
                 pSource.getBlocks()):
             self._lfnMap.setdefault(source, {}).update(
                 dict(
                     imap(
                         lambda fi:
                         (self.lfnTrans(fi[DataProvider.URL]), n), fl)))
     pList = set()
     for key in ifilter(lambda k: k in metadata, self._parentKeys):
         pList.update(
             imap(
                 lambda pPath: self._lfnMap.get(source, {}).get(
                     self.lfnTrans(pPath)), metadata[key]))
     metadata['PARENT_PATH'] = lfilter(identity, pList)
     yield (path, metadata, events, seList, objStore)
Example No. 24
	def setupJobParameters(self, config, pm):
		config = config.addSections(['dataset']).addTags([self])
		self.dataSplitter = None
		self.dataRefresh = None
		self.dataset = config.get('dataset', '').strip()
		if self.dataset == '':
			return
		config.set('se output pattern', '@NICK@_job_@MY_JOBID@_@X@', override = False)
		config.set('default lookup', 'DATASETNICK', override = False)

		defaultProvider = config.get('dataset provider', 'ListProvider')
		dataProvider = DataProvider.create(config, self.dataset, defaultProvider)
		splitterName = config.get('dataset splitter', 'FileBoundarySplitter')
		splitterClass = dataProvider.checkSplitter(DataSplitter.getClass(splitterName))
		self.dataSplitter = splitterClass(config)
		self.checkSE = config.getBool('dataset storage check', True, onChange = None)

		# Create and register dataset parameter plugin
		paramSource = DataParameterSource(config.getWorkPath(), 'data',
			dataProvider, self.dataSplitter, self.initDataProcessor())
		DataParameterSource.datasetsAvailable['data'] = paramSource

		# Select dataset refresh rate
		self.dataRefresh = config.getTime('dataset refresh', -1, onChange = None)
		if self.dataRefresh > 0:
			paramSource.resyncSetup(interval = max(self.dataRefresh, dataProvider.queryLimit()))
			utils.vprint('Dataset source will be queried every %s' % utils.strTime(self.dataRefresh), -1)
		else:
			paramSource.resyncSetup(interval = 0)
		def externalRefresh(sig, frame):
			paramSource.resyncSetup(force = True)
		signal.signal(signal.SIGUSR2, externalRefresh)

		if self.dataSplitter.getMaxJobs() == 0:
			raise UserError('There are no events to process')
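The SIGUSR2 handler lets a running instance be told to force a dataset resync from outside. A minimal version of that hook (the handler below only records the request instead of calling resyncSetup):

import os
import signal

refresh_requests = []

def _external_refresh(signum, frame):
	# in setupJobParameters this would call paramSource.resyncSetup(force=True)
	refresh_requests.append(signum)

signal.signal(signal.SIGUSR2, _external_refresh)
print('send SIGUSR2 to pid %d to request a dataset refresh' % os.getpid())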
Example No. 25
    def __init__(self, config, datasetExpr, datasetNick=None, datasetID=0):
        changeTrigger = triggerResync(['datasets', 'parameters'])
        self._lumi_filter = config.getLookup('lumi filter', {},
                                             parser=parseLumiFilter,
                                             strfun=strLumi,
                                             onChange=changeTrigger)
        if not self._lumi_filter.empty():
            config.set('dataset processor', 'LumiDataProcessor', '+=')
        DataProvider.__init__(self, config, datasetExpr, datasetNick,
                              datasetID)
        # LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
        self._lumi_query = config.getBool('lumi metadata',
                                          not self._lumi_filter.empty(),
                                          onChange=changeTrigger)
        # PhEDEx blacklist: 'T1_DE_KIT', 'T1_US_FNAL' and '*_Disk' allow user jobs - other T1s don't!
        self._phedexFilter = config.getFilter('phedex sites',
                                              '-T3_US_FNALLPC',
                                              defaultMatcher='blackwhite',
                                              defaultFilter='weak',
                                              onChange=changeTrigger)
        self._phedexT1Filter = config.getFilter('phedex t1 accept',
                                                'T1_DE_KIT T1_US_FNAL',
                                                defaultMatcher='blackwhite',
                                                defaultFilter='weak',
                                                onChange=changeTrigger)
        self._phedexT1Mode = config.getEnum('phedex t1 mode',
                                            PhedexT1Mode,
                                            PhedexT1Mode.disk,
                                            onChange=changeTrigger)
        self.onlyComplete = config.getBool('only complete sites',
                                           True,
                                           onChange=changeTrigger)
        self._locationFormat = config.getEnum('location format',
                                              CMSLocationFormat,
                                              CMSLocationFormat.hostname,
                                              onChange=changeTrigger)
        self._pjrc = JSONRestClient(
            url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas'
        )

        (self._datasetPath, self._url,
         self._datasetBlock) = optSplit(datasetExpr, '@#')
        self._url = self._url or config.get('dbs instance', '')
        self._datasetBlock = self._datasetBlock or 'all'
        self.onlyValid = config.getBool('only valid',
                                        True,
                                        onChange=changeTrigger)
Example No. 26
    def __init__(self, config, datasetExpr, datasetNick=None):
        self._changeTrigger = triggerResync(['datasets', 'parameters'])
        self._lumi_filter = config.getLookup('lumi filter', {},
                                             parser=parseLumiFilter,
                                             strfun=strLumi,
                                             onChange=self._changeTrigger)
        if not self._lumi_filter.empty():
            config.set('dataset processor', 'LumiDataProcessor', '+=')
        DataProvider.__init__(self, config, datasetExpr, datasetNick)
        # LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
        self._lumi_query = config.getBool('lumi metadata',
                                          not self._lumi_filter.empty(),
                                          onChange=self._changeTrigger)
        config.set('phedex sites matcher mode', 'shell', '?=')
        # PhEDEx blacklist: 'T1_*_Disk' nodes allow user jobs - other T1s don't!
        self._phedexFilter = config.getFilter('phedex sites',
                                              '-* T1_*_Disk T2_* T3_*',
                                              defaultMatcher='blackwhite',
                                              defaultFilter='strict',
                                              onChange=self._changeTrigger)
        self._onlyComplete = config.getBool('only complete sites',
                                            True,
                                            onChange=self._changeTrigger)
        self._locationFormat = config.getEnum('location format',
                                              CMSLocationFormat,
                                              CMSLocationFormat.hostname,
                                              onChange=self._changeTrigger)
        self._pjrc = JSONRestClient(
            url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas'
        )
        self._sitedb = SiteDB()

        (self._datasetPath, self._datasetInstance,
         self._datasetBlock) = optSplit(datasetExpr, '@#')
        instance_default = config.get('dbs instance',
                                      '',
                                      onChange=self._changeTrigger)
        self._datasetInstance = self._datasetInstance or instance_default
        if not self._datasetInstance:
            self._datasetInstance = 'prod/global'
        elif '/' not in self._datasetInstance:
            self._datasetInstance = 'prod/%s' % self._datasetInstance
        self._datasetBlock = self._datasetBlock or 'all'
        self.onlyValid = config.getBool('only valid',
                                        True,
                                        onChange=self._changeTrigger)
Example No. 27
	def _read_plfnp_map(self, config, parent_dataset_expr):
		if parent_dataset_expr and (parent_dataset_expr not in self._plfnp2pdn_cache):
			# read parent source and fill lfnMap with parent_lfn_parts -> parent dataset name mapping
			map_plfnp2pdn = self._plfnp2pdn_cache.setdefault(parent_dataset_expr, {})
			for block in DataProvider.iter_blocks_from_expr(self._empty_config, parent_dataset_expr):
				for fi in block[DataProvider.FileList]:
					map_plfnp2pdn[self._get_lfnp(fi[DataProvider.URL])] = block[DataProvider.Dataset]
		return self._plfnp2pdn_cache.get(parent_dataset_expr, {})  # return cached mapping
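The parent lookup is built once per parent dataset expression and then served from _plfnp2pdn_cache. The caching shape in isolation (plain-dict blocks and string keys are stand-ins for the DataProvider structures):

class ParentLookupCache:
	def __init__(self):
		self._cache = {}  # parent dataset expression -> {parent lfn -> parent dataset name}

	def get_map(self, parent_expr, iter_blocks):
		if parent_expr and (parent_expr not in self._cache):
			mapping = self._cache.setdefault(parent_expr, {})
			for block in iter_blocks(parent_expr):
				for fi in block['FileList']:
					mapping[fi['URL']] = block['Dataset']
		return self._cache.get(parent_expr, {})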
Example No. 28
    def __init__(self, config, name):
        head = [(0, "Nickname")]

        # Mapping between nickname and config files:
        cfgList = config.get("nickname config", "")
        self.nmCfg = config.getDict(
            "nickname config", {}, parser=lambda x: map(str.strip, x.split(",")), str=lambda x: str.join(",", x)
        )[0]
        if cfgList:
            if "config file" in config.getOptions():
                raise ConfigError("Please use 'nickname config' instead of 'config file'")
            allConfigFiles = utils.flatten(self.nmCfg.values())
            config.set("config file", str.join("\n", allConfigFiles))
            head.append((1, "Config file"))

            # Mapping between nickname and constants:
        self.nmCName = map(str.strip, config.get("nickname constants", "").split())
        self.nmConst = {}
        for var in self.nmCName:
            tmp = config.getDict(var, {})[0]
            for (nick, value) in tmp.items():
                if value:
                    self.nmConst.setdefault(nick, {})[var] = value
                else:
                    self.nmConst.setdefault(nick, {})[var] = ""
            head.append((var, var))

            # Mapping between nickname and lumi filter:
        if "lumi filter" in config.getOptions():
            raise ConfigError("Please use 'nickname lumi filter' instead of 'lumi filter'")
        lumiParse = lambda x: formatLumi(parseLumiFilter(x))
        self.nmLumi = config.getDict("nickname lumi filter", {}, parser=lumiParse)[0]
        if self.nmLumi:
            for dataset in config.get("dataset", "").splitlines():
                (datasetNick, datasetProvider, datasetExpr) = DataProvider.parseDatasetExpr(config, dataset, None)
                config.set(
                    "dataset %s" % datasetNick,
                    "lumi filter",
                    str.join(",", utils.flatten(fromNM(self.nmLumi, datasetNick, []))),
                )
            config.set("lumi filter", str.join(",", self.nmLumi.get(None, [])))
            head.append((2, "Lumi filter"))

        utils.vprint("Mapping between nickname and other settings:\n", -1)

        def report():
            for nick in sorted(set(self.nmCfg.keys() + self.nmConst.keys() + self.nmLumi.keys())):
                tmp = {
                    0: nick,
                    1: str.join(", ", map(os.path.basename, self.nmCfg.get(nick, ""))),
                    2: self.displayLumi(self.nmLumi.get(nick, "")),
                }
                yield utils.mergeDicts([tmp, self.nmConst.get(nick, {})])

        utils.printTabular(head, report(), "cl")
        utils.vprint(level=-1)
        CMSSW.__init__(self, config, name)
Example No. 29
def dataset_show_diff(options):
    if len(options.args) != 2:
        options.parser.exit_with_usage(options.parser.usage('data'))

    provider_a = DataProvider.load_from_file(options.args[0])
    provider_b = DataProvider.load_from_file(options.args[1])
    block_resync_tuple = DataProvider.resync_blocks(
        provider_a.get_block_list_cached(show_stats=False),
        provider_b.get_block_list_cached(show_stats=False))
    (block_list_added, block_list_missing,
     block_list_matching) = block_resync_tuple

    def _dataset_iter_matching_blocks():
        for (block_old, block_new, _, _) in block_list_matching:

            def _format_change(old, new):
                if old != new:
                    return '%s -> %s' % (old, new)
                return old

            block_old[DataProvider.NFiles] = _format_change(
                len(block_old.get(DataProvider.FileList, [])),
                len(block_new.get(DataProvider.FileList, [])))
            block_old[DataProvider.NEntries] = _format_change(
                block_old[DataProvider.NEntries],
                block_new[DataProvider.NEntries])
            yield block_old

    header_list = [(DataProvider.Dataset, 'Dataset'),
                   (DataProvider.BlockName, 'Block'),
                   (DataProvider.NFiles, '#Files'),
                   (DataProvider.NEntries, '#Entries')]
    if block_list_added:
        ConsoleTable.create(header_list,
                            dataset_iter_blocks(block_list_added),
                            title='Added blocks')
    if block_list_missing:
        ConsoleTable.create(header_list,
                            dataset_iter_blocks(block_list_missing),
                            title='Removed blocks')
    if block_list_matching:
        ConsoleTable.create(header_list,
                            _dataset_iter_matching_blocks(),
                            title='Matching blocks')
Example No. 30
	def __init__(self, config, datasetExpr, datasetNick = None, datasetID = 0):
		self._lumi_filter = parseLumiFilter(config.get('lumi filter', ''))
		if self._lumi_filter:
			config.set('dataset processor', 'LumiDataProcessor', '+=')
		DataProvider.__init__(self, config, datasetExpr, datasetNick, datasetID)
		# PhEDEx blacklist: 'T1_DE_KIT', 'T1_US_FNAL' and '*_Disk' allow user jobs - other T1s don't!
		self._lumi_query = config.getBool('lumi metadata', self._lumi_filter != [])
		self._phedexFilter = config.getFilter('phedex sites', '-T3_US_FNALLPC',
			defaultMatcher = 'blackwhite', defaultFilter = 'weak')
		self._phedexT1Filter = config.getFilter('phedex t1 accept', 'T1_DE_KIT T1_US_FNAL',
			defaultMatcher = 'blackwhite', defaultFilter = 'weak')
		self._phedexT1Mode = config.get('phedex t1 mode', 'disk').lower()
		self.onlyComplete = config.getBool('only complete sites', True)
		self._locationFormat = config.getEnum('location format', CMSLocationFormat, CMSLocationFormat.hostname)

		(self._datasetPath, self._url, self._datasetBlock) = utils.optSplit(datasetExpr, '@#')
		self._url = self._url or config.get('dbs instance', '')
		self._datasetBlock = self._datasetBlock or 'all'
		self.onlyValid = config.getBool('only valid', True)
Example No. 31
    def __init__(self,
                 dataDir,
                 srcName,
                 dataProvider,
                 dataSplitter,
                 dataProc,
                 repository,
                 keepOld=True):
        LimitedResyncParameterSource.__init__(self)
        (self._dn, self._name, self._data_provider, self._data_splitter, self._part_proc, self._keepOld) = \
         (dataDir, srcName, dataProvider, dataSplitter, dataProc, keepOld)
        repository['dataset:%s' % srcName] = self
        self.resyncSetup(interval=-1)

        if not dataProvider:  # debug mode - used by scripts - disables resync
            self._maxN = self._data_splitter.getMaxJobs()
            return

        # look for aborted resyncs - and try to restore old state if possible
        if self._existsDataPath('cache.dat.resync') and self._existsDataPath(
                'map.tar.resync'):
            utils.renameFile(self._getDataPath('cache.dat.resync'),
                             self._getDataPath('cache.dat'))
            utils.renameFile(self._getDataPath('map.tar.resync'),
                             self._getDataPath('map.tar'))
        elif self._existsDataPath('cache.dat.resync') or self._existsDataPath(
                'map.tar.resync'):
            raise DatasetError('Found broken resync state')

        if self._existsDataPath('cache.dat') and self._existsDataPath(
                'map.tar'):
            self._data_splitter.importPartitions(self._getDataPath('map.tar'))
        else:
            DataProvider.saveToFile(
                self._getDataPath('cache.dat'),
                self._data_provider.getBlocks(show_stats=False))
            self._data_splitter.splitDataset(
                self._getDataPath('map.tar'),
                self._data_provider.getBlocks(show_stats=False))

        self._maxN = self._data_splitter.getMaxJobs()
Example No. 32
	def getEntries(self, path, metadata, events, seList, objStore):
		datacachePath = os.path.join(objStore.get('GC_WORKDIR', ''), 'datacache.dat')
		source = utils.QM((self.source == '') and os.path.exists(datacachePath), datacachePath, self.source)
		if source and (source not in self.lfnMap):
			pSource = DataProvider.create(createConfigFactory().getConfig(), source, 'ListProvider')
			for (n, fl) in map(lambda b: (b[DataProvider.Dataset], b[DataProvider.FileList]), pSource.getBlocks()):
				self.lfnMap.setdefault(source, {}).update(dict(map(lambda fi: (self.lfnTrans(fi[DataProvider.URL]), n), fl)))
		pList = set()
		for key in filter(lambda k: k in metadata, self.parentKeys):
			pList.update(map(lambda pPath: self.lfnMap.get(source, {}).get(self.lfnTrans(pPath)), metadata[key]))
		metadata['PARENT_PATH'] = filter(lambda x: x, pList)
		yield (path, metadata, events, seList, objStore)
Example No. 33
def dataset_show_removed(options):
	if len(options.args) < 2:
		options.parser.exit_with_usage(options.parser.usage('data'))

	block_list_missing = []
	provider_old = DataProvider.load_from_file(options.args[0])
	for dataset_fn in options.args[1:]:
		provider_new = DataProvider.load_from_file(dataset_fn)
		block_resync_tuple = DataProvider.resync_blocks(
			provider_old.get_block_list_cached(show_stats=False),
			provider_new.get_block_list_cached(show_stats=False))
		for block in block_resync_tuple[1]:  # iterate missing block list
			tmp = dict(block)
			tmp[DataProvider.RemovedIn] = dataset_fn
			block_list_missing.append(tmp)
		provider_old = provider_new
	if block_list_missing:
		ConsoleTable.create([(DataProvider.Dataset, 'Dataset'), (DataProvider.BlockName, 'Block'),
			(DataProvider.NFiles, '#Files'), (DataProvider.NEntries, '#Entries'),
			(DataProvider.RemovedIn, 'Removed in file')],
			dataset_iter_blocks(block_list_missing), title='Removed blocks')
Example No. 34
	def __init__(self, config, datasetExpr, datasetNick = None, datasetID = 0):
		changeTrigger = triggerResync(['datasets', 'parameters'])
		self._lumi_filter = config.getLookup('lumi filter', {}, parser = parseLumiFilter, strfun = strLumi, onChange = changeTrigger)
		if not self._lumi_filter.empty():
			config.set('dataset processor', 'LumiDataProcessor', '+=')
		DataProvider.__init__(self, config, datasetExpr, datasetNick, datasetID)
		# LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
		self._lumi_query = config.getBool('lumi metadata', not self._lumi_filter.empty(), onChange = changeTrigger)
		# PhEDEx blacklist: 'T1_DE_KIT', 'T1_US_FNAL' and '*_Disk' allow user jobs - other T1s don't!
		self._phedexFilter = config.getFilter('phedex sites', '-T3_US_FNALLPC',
			defaultMatcher = 'blackwhite', defaultFilter = 'weak', onChange = changeTrigger)
		self._phedexT1Filter = config.getFilter('phedex t1 accept', 'T1_DE_KIT T1_US_FNAL',
			defaultMatcher = 'blackwhite', defaultFilter = 'weak', onChange = changeTrigger)
		self._phedexT1Mode = config.getEnum('phedex t1 mode', PhedexT1Mode, PhedexT1Mode.disk, onChange = changeTrigger)
		self.onlyComplete = config.getBool('only complete sites', True, onChange = changeTrigger)
		self._locationFormat = config.getEnum('location format', CMSLocationFormat, CMSLocationFormat.hostname, onChange = changeTrigger)
		self._pjrc = JSONRestClient(url = 'https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas')

		(self._datasetPath, self._url, self._datasetBlock) = optSplit(datasetExpr, '@#')
		self._url = self._url or config.get('dbs instance', '')
		self._datasetBlock = self._datasetBlock or 'all'
		self.onlyValid = config.getBool('only valid', True, onChange = changeTrigger)
Example No. 35
 def _read_plfnp_map(self, config, parent_dataset_expr):
     if parent_dataset_expr and (parent_dataset_expr
                                 not in self._plfnp2pdn_cache):
         # read parent source and fill lfnMap with parent_lfn_parts -> parent dataset name mapping
         map_plfnp2pdn = self._plfnp2pdn_cache.setdefault(
             parent_dataset_expr, {})
         for block in DataProvider.iter_blocks_from_expr(
                 self._empty_config, parent_dataset_expr):
             for fi in block[DataProvider.FileList]:
                 map_plfnp2pdn[self._get_lfnp(
                     fi[DataProvider.URL])] = block[DataProvider.Dataset]
     return self._plfnp2pdn_cache.get(parent_dataset_expr,
                                      {})  # return cached mapping
Example No. 36
def discover_blocks(options):
	# Get work directory, create dbs dump directory
	if os.path.isdir(options.args[0]):
		workDir = os.path.abspath(os.path.normpath(options.args[0]))
	else:
		workDir = getConfig(configFile = options.args[0]).getWorkPath()
	if not options.opts.tempdir:
		options.opts.tempdir = os.path.join(workDir, 'dbs')
	if not os.path.exists(options.opts.tempdir):
		os.mkdir(options.opts.tempdir)

	# get provider with dataset information
	if options.opts.input_file:
		provider = DataProvider.createInstance('ListProvider', getConfig(), options.opts.input_file, None)
	else:
		config = getConfig(configDict = {'dataset': options.config_dict})
		provider = DataProvider.createInstance('DBSInfoProvider', config, options.args[0], None)

	blocks = provider.getBlocks(show_stats = False)
	DataProvider.saveToFile(os.path.join(options.opts.tempdir, 'dbs.dat'), blocks)
	if options.opts.discovery:
		sys.exit(os.EX_OK)
	return blocks
Example No. 37
	def _init_reader(self):
		# look for aborted inits / resyncs - and try to restore old state if possible
		if self._exists_data_path('map.tar.resync') and self._exists_data_path('cache.dat.resync'):
			rename_file(self._get_data_path('cache.dat.resync'), self._get_data_path('cache.dat'))
			rename_file(self._get_data_path('map.tar.resync'), self._get_data_path('map.tar'))
		elif self._exists_data_path('map.tar.resync') or self._exists_data_path('cache.dat.resync'):
			raise DatasetError('Found broken dataset partition resync state in work directory')

		if self._exists_data_path('map.tar') and not self._exists_data_path('cache.dat'):
			raise DatasetError('Found broken dataset partition in work directory')
		elif not self._exists_data_path('map.tar'):
			# create initial partition map file
			if not self._exists_data_path('cache.dat'):
				provider = self._provider
			else:
				provider = DataProvider.load_from_file(self._get_data_path('cache.dat'))
			block_iter = DataProvider.save_to_file_iter(self._get_data_path('cache.dat.init'),
				provider.get_block_list_cached(show_stats=True))
			partition_iter = self._splitter.split_partitions(block_iter)
			DataSplitter.save_partitions(self._get_data_path('map.tar.init'), partition_iter)
			rename_file(self._get_data_path('cache.dat.init'), self._get_data_path('cache.dat'))
			rename_file(self._get_data_path('map.tar.init'), self._get_data_path('map.tar'))
		return DataSplitter.load_partitions(self._get_data_path('map.tar'))
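Both the cache dump and the partition map are first written under '.init' names and only renamed into place afterwards, so an interrupted initialization never leaves a map.tar without its cache.dat. The rename step alone (the writer callables are hypothetical placeholders):

import os

def finalize_init(work_dn, write_cache, write_map):
	cache_tmp = os.path.join(work_dn, 'cache.dat.init')
	map_tmp = os.path.join(work_dn, 'map.tar.init')
	write_cache(cache_tmp)  # e.g. dump the block list here
	write_map(map_tmp)      # e.g. write the partition map here
	os.rename(cache_tmp, os.path.join(work_dn, 'cache.dat'))
	os.rename(map_tmp, os.path.join(work_dn, 'map.tar'))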
Example No. 38
    def _init_reader(self):
        # look for aborted inits / resyncs - and try to restore old state if possible
        if self._exists_data_path('map.tar.resync') and self._exists_data_path(
                'cache.dat.resync'):
            rename_file(self._get_data_path('cache.dat.resync'),
                        self._get_data_path('cache.dat'))
            rename_file(self._get_data_path('map.tar.resync'),
                        self._get_data_path('map.tar'))
        elif self._exists_data_path(
                'map.tar.resync') or self._exists_data_path(
                    'cache.dat.resync'):
            raise DatasetError(
                'Found broken dataset partition resync state in work directory'
            )

        if self._exists_data_path(
                'map.tar') and not self._exists_data_path('cache.dat'):
            raise DatasetError(
                'Found broken dataset partition in work directory')
        elif not self._exists_data_path('map.tar'):
            # create initial partition map file
            if not self._exists_data_path('cache.dat'):
                provider = self._provider
            else:
                provider = DataProvider.load_from_file(
                    self._get_data_path('cache.dat'))
            block_iter = DataProvider.save_to_file_iter(
                self._get_data_path('cache.dat.init'),
                provider.get_block_list_cached(show_stats=True))
            partition_iter = self._splitter.split_partitions(block_iter)
            DataSplitter.save_partitions(self._get_data_path('map.tar.init'),
                                         partition_iter)
            rename_file(self._get_data_path('cache.dat.init'),
                        self._get_data_path('cache.dat'))
            rename_file(self._get_data_path('map.tar.init'),
                        self._get_data_path('map.tar'))
        return DataSplitter.load_partitions(self._get_data_path('map.tar'))
Example No. 39
    def _readParents(self, config, source):
        # read parent source and fill lfnMap with parent_lfn_refs -> parent dataset name mapping
        if source and (source not in self._lfnMapCache):
            block_iter = DataProvider.getBlocksFromExpr(config, source)
            for (dsName, fl) in imap(lambda b: (b[DataProvider.Dataset], b[DataProvider.FileList]), block_iter):
                self._lfnMapCache.setdefault(source, {}).update(
                    dict(imap(lambda fi: (self._lfnTrans(fi[DataProvider.URL]), dsName), fl)))
        return self._lfnMapCache.get(source, {})
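
Even with cleaner wrapping, the nested imap/lambda construction above is dense. The same LFN-to-dataset mapping can be built with plain loops; this is only an equivalent sketch, assuming the same _lfnMapCache and _lfnTrans members exist:

    def _read_parents_sketch(self, config, source):
        # fill the per-source cache with parent_lfn -> parent dataset name entries
        if source and (source not in self._lfnMapCache):
            lfn_map = self._lfnMapCache.setdefault(source, {})
            for block in DataProvider.getBlocksFromExpr(config, source):
                ds_name = block[DataProvider.Dataset]
                for fi in block[DataProvider.FileList]:
                    lfn_map[self._lfnTrans(fi[DataProvider.URL])] = ds_name
        return self._lfnMapCache.get(source, {})
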
Exemplo n.º 40
def list_config_entries(dataset_list, block_list, opts, provider):
	dataset_config_str_list = []
	max_nick_len = 0
	for ds_info in merge_blocks(block_list):
		max_nick_len = max(max_nick_len, len(ds_info.get(DataProvider.Nickname, '')))
	for ds_info in merge_blocks(block_list):
		nickname = ds_info.get(DataProvider.Nickname, '').rjust(max_nick_len)
		provider_name_list = DataProvider.get_class(ds_info[DataProvider.Provider]).get_class_name_list()
		provider_name = sorted(provider_name_list, key=len)[0]
		query = ds_info[DataProvider.Query]
		if provider_name == 'list':
			query += ' %% %s' % ds_info[DataProvider.Dataset]
		dataset_config_str = '\t%s : %s : %s' % (nickname, provider_name, query)
		dataset_config_str_list.append(dataset_config_str)
	logging.getLogger('script').info('\ndataset =\n' + str.join('\n', dataset_config_str_list) + '\n')
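
The log line above emits a ready-to-paste configuration fragment; with the 'list' provider the dataset name is appended after a '%' separator. A sample of what the generated output looks like (nicknames and paths here are made up):

    dataset =
    	 mc_qcd : list : /path/to/datasets.dat % /QCD/Sample/Block
    	data_mu : list : /path/to/datasets.dat % /SingleMu/Sample/Block
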
Exemplo n.º 41
	def __init__(self, config, datasetExpr, datasetNick, datasetID = 0):
		DataProvider.__init__(self, config, datasetExpr, datasetNick, datasetID)
		# PhEDex blacklist: 'T1_DE_KIT', 'T1_US_FNAL' and '*_Disk' allow user jobs - other T1s don't!
		self.phedexBL = config.getList('phedex sites', ['-T3_US_FNALLPC'])
		self.phedexWL = config.getList('phedex t1 accept', ['T1_DE_KIT', 'T1_US_FNAL'])
		self.phedexT1 = config.get('phedex t1 mode', 'disk').lower()
		self.onlyComplete = config.getBool('only complete sites', True)
		self.locationFormat = config.get('location format', 'hostname').lower() # hostname or sitedb
		if self.locationFormat not in ['hostname', 'sitedb', 'both']:
			raise ConfigError('Invalid location format: %s' % self.locationFormat)

		(self.datasetPath, self.url, self.datasetBlock) = utils.optSplit(datasetExpr, '@#')
		self.url = QM(self.url, self.url, config.get('dbs instance', ''))
		self.datasetBlock = QM(self.datasetBlock, self.datasetBlock, 'all')
		self.includeLumi = config.getBool('keep lumi metadata', False)
		self.onlyValid = config.getBool('only valid', True)
		self.checkUnique = config.getBool('check unique', True)

		# This works in tandem with the active task module (cmssw.py supports only the [section] lumi filter!)
		self.selectedLumis = parseLumiFilter(config.get('lumi filter', ''))
		if self.selectedLumis:
			utils.vprint('Runs/lumi section filter enabled! (%d entries)' % len(self.selectedLumis), -1, once = True)
			utils.vprint('\tThe following runs and lumi sections are selected:', 1, once = True)
			utils.vprint('\t' + utils.wrapList(formatLumi(self.selectedLumis), 65, ',\n\t'), 1, once = True)
Exemplo n.º 42
def dataset_show_removed(options):
    if len(options.args) < 2:
        options.parser.exit_with_usage(options.parser.usage('data'))

    block_list_missing = []
    provider_old = DataProvider.load_from_file(options.args[0])
    for dataset_fn in options.args[1:]:
        provider_new = DataProvider.load_from_file(dataset_fn)
        block_resync_tuple = DataProvider.resync_blocks(
            provider_old.get_block_list_cached(show_stats=False),
            provider_new.get_block_list_cached(show_stats=False))
        for block in block_resync_tuple[1]:  # iterate missing block list
            tmp = dict(block)
            tmp[DataProvider.RemovedIn] = dataset_fn
            block_list_missing.append(tmp)
        provider_old = provider_new
    if block_list_missing:
        ConsoleTable.create([(DataProvider.Dataset, 'Dataset'),
                             (DataProvider.BlockName, 'Block'),
                             (DataProvider.NFiles, '#Files'),
                             (DataProvider.NEntries, '#Entries'),
                             (DataProvider.RemovedIn, 'Removed in file')],
                            dataset_iter_blocks(block_list_missing),
                            title='Removed blocks')
Exemplo n.º 43
    def __init__(self,
                 dataDir,
                 srcName,
                 dataProvider,
                 dataSplitter,
                 dataProc,
                 keepOld=True):
        ParameterSource.__init__(self)
        (self._dataDir, self._srcName, self._dataProvider, self._dataSplitter, self._part_proc) = \
         (dataDir, srcName, dataProvider, dataSplitter, dataProc)

        if not dataProvider:
            pass  # debug mode - used by scripts - disables resync
        elif os.path.exists(self.getDataPath('cache.dat')) and os.path.exists(
                self.getDataPath('map.tar')):
            # resume only when both the dataset cache and the partition map exist
            self._dataSplitter.importPartitions(self.getDataPath('map.tar'))
        else:
            DataProvider.saveToFile(self.getDataPath('cache.dat'),
                                    self._dataProvider.getBlocks(silent=False))
            self._dataSplitter.splitDataset(self.getDataPath('map.tar'),
                                            self._dataProvider.getBlocks())

        self._maxN = self._dataSplitter.getMaxJobs()
        self._keepOld = keepOld
Exemplo n.º 44
	def setupJobParameters(self, config, pm):
		config = config.changeView(viewClass = TaggedConfigView, addSections = ['dataset'], addTags = [self])
		self.dataSplitter = None
		self.dataRefresh = None
		self._forceRefresh = config.getState('resync', detail = 'dataset', default = False)
		def userRefresh(config, old_obj, cur_obj, cur_entry, obj2str):
			if ((old_obj == '') and (cur_obj != '')):
				raise UserError('It is currently not possible to attach a dataset to a non-dataset task!')
			self._forceRefresh = True
			return cur_obj
		self.dataset = config.get('dataset', '', onChange = userRefresh).strip()
		if self.dataset == '':
			return
		config.set('se output pattern', '@NICK@_job_@GC_JOB_ID@_@X@')
		config.set('default lookup', 'DATASETNICK')

		defaultProvider = config.get('dataset provider', 'ListProvider')
		dataProvider = DataProvider.create(config, self.dataset, defaultProvider)
		splitterName = config.get('dataset splitter', 'FileBoundarySplitter')
		splitterClass = dataProvider.checkSplitter(DataSplitter.getClass(splitterName))
		self.dataSplitter = splitterClass(config)

		# Create and register dataset parameter source
		paramSplitProcessor = config.getCompositePlugin('dataset processor',
			'BasicDataSplitProcessor SECheckSplitProcessor', 'MultiDataSplitProcessor',
			cls = DataSplitProcessor).getInstance(config)
		paramSource = DataParameterSource(config.getWorkPath(), 'data',
			dataProvider, self.dataSplitter, paramSplitProcessor)
		DataParameterSource.datasetsAvailable['data'] = paramSource

		# Select dataset refresh rate
		self.dataRefresh = config.getTime('dataset refresh', -1, onChange = None)
		if self.dataRefresh > 0:
			paramSource.resyncSetup(interval = max(self.dataRefresh, dataProvider.queryLimit()))
			utils.vprint('Dataset source will be queried every %s' % utils.strTime(self.dataRefresh), -1)
		else:
			paramSource.resyncSetup(interval = 0)
		if self._forceRefresh:
			paramSource.resyncSetup(force = True)
		def externalRefresh(sig, frame):
			paramSource.resyncSetup(force = True)
		signal.signal(signal.SIGUSR2, externalRefresh)

		if self.dataSplitter.getMaxJobs() == 0:
			raise UserError('There are no events to process')
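
The SIGUSR2 handler registered at the end of setupJobParameters lets a running grid-control instance pick up dataset changes on demand. A small sketch of triggering that from another Python process; gc_pid is a placeholder for the actual process id (from the shell, 'kill -USR2 <pid>' does the same):

    import os
    import signal

    def request_dataset_resync(gc_pid):
        # delivers SIGUSR2 to the grid-control process, whose handler
        # then calls paramSource.resyncSetup(force = True)
        os.kill(gc_pid, signal.SIGUSR2)
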
Exemplo n.º 45
	def _displaySetup(self, dsPath, head):
		if os.path.exists(dsPath):
			nickNames = set()
			for block in DataProvider.loadFromFile(dsPath).getBlocks(show_stats = False):
				nickNames.add(block[DataProvider.Nickname])
			log = logging.getLogger('user')
			log.info('Mapping between nickname and other settings:')
			report = []
			(ps_basic, ps_nested) = self._pfactory.getLookupSources()
			if ps_nested:
				log.info('This list doesn\'t show "nickname constants" with multiple values!')
			for nick in sorted(nickNames):
				tmp = {'DATASETNICK': nick}
				for src in ps_basic:
					src.fillParameterInfo(None, tmp)
				tmp[1] = str.join(', ', imap(os.path.basename, self._nmCfg.lookup(nick, '', is_selector = False)))
				tmp[2] = formatLumiNice(self._nmLumi.lookup(nick, '', is_selector = False))
				report.append(tmp)
			utils.printTabular(head, report, 'cl')
Exemplo n.º 46
def list_config_entries(dataset_list, block_list, opts, provider):
    dataset_config_str_list = []
    max_nick_len = 0
    for ds_info in merge_blocks(block_list):
        max_nick_len = max(max_nick_len,
                           len(ds_info.get(DataProvider.Nickname, '')))
    for ds_info in merge_blocks(block_list):
        nickname = ds_info.get(DataProvider.Nickname, '').rjust(max_nick_len)
        provider_name_list = DataProvider.get_class(
            ds_info[DataProvider.Provider]).get_class_name_list()
        provider_name = sorted(provider_name_list, key=len)[0]
        query = ds_info[DataProvider.Query]
        if provider_name == 'list':
            query += ' %% %s' % ds_info[DataProvider.Dataset]
        dataset_config_str = '\t%s : %s : %s' % (nickname, provider_name,
                                                 query)
        dataset_config_str_list.append(dataset_config_str)
    logging.getLogger('script').info('\ndataset =\n' +
                                     str.join('\n', dataset_config_str_list) +
                                     '\n')
Exemplo n.º 47
def create_dbs3_json_blocks(opts, dataset_blocks):
	dbs3_proto_block_iter = create_dbs3_proto_blocks(opts, dataset_blocks)
	for (block, block_dump, block_size, dataset_type) in dbs3_proto_block_iter:
		dataset = block[DataProvider.Dataset]
		try:
			primary_dataset, processed_dataset, data_tier = dataset[1:].split('/')
		except Exception:
			raise DatasetError('Dataset name %s is not a valid DBS name!' % dataset)

		# add primary dataset information
		block_dump['primds'] = {'primary_ds_type': dataset_type, 'primary_ds_name': primary_dataset}

		# add dataset information
		block_dump['dataset'] = {
			'dataset': dataset, 'processed_ds_name': processed_dataset, 'data_tier_name': data_tier,
			'physics_group_name': None, 'dataset_access_type': 'VALID',
			'xtcrosssection': None,  # TODO: Add to metadata from FrameworkJobReport, if possible!
		}

		# add block information
		site_db = SiteDB()
		try:
			origin_site_name = site_db.se_to_cms_name(block[DataProvider.Locations][0])[0]
		except IndexError:
			clear_current_exception()
			origin_site_name = 'UNKNOWN'

		block_dump['block'] = {'block_name': DataProvider.get_block_id(block), 'block_size': block_size,
			'file_count': len(block[DataProvider.FileList]), 'origin_site_name': origin_site_name}
		if opts.do_close_blocks:
			block_dump['block']['open_for_writing'] = 0
		else:
			block_dump['block']['open_for_writing'] = 1

		# add acquisition_era, CRAB is important because of checks within DBS 3
		block_dump['acquisition_era'] = {'acquisition_era_name': 'CRAB', 'start_date': 0}
		# add processing_era
		block_dump['processing_era'] = {'processing_version': 1, 'description': 'grid-control'}

		yield validate_dbs3_json('blockBulk', block_dump)
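
The split at the top of the loop relies on the canonical three-level DBS dataset name '/primary/processed/tier'; anything else raises the DatasetError. A standalone illustration (the dataset name here is made up):

    dataset = '/SingleMuon/Run2016B-PromptReco-v1/AOD'
    primary_dataset, processed_dataset, data_tier = dataset[1:].split('/')
    # -> ('SingleMuon', 'Run2016B-PromptReco-v1', 'AOD')
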
Exemplo n.º 48
	def _displaySetup(self, dsPath, head):
		if os.path.exists(dsPath):
			nickNames = set()
			for block in DataProvider.loadFromFile(dsPath).getBlocks():
				nickNames.add(block[DataProvider.Nickname])
			utils.vprint('Mapping between nickname and other settings:\n', -1)
			report = []
			for nick in sorted(nickNames):
				lumi_filter_str = formatLumi(self._nmLumi.lookup(nick, '', is_selector = False))
				if len(lumi_filter_str) > 4:
					nice_lumi_filter = '%s ... %s (%d entries)' % (lumi_filter_str[0], lumi_filter_str[-1], len(lumi_filter_str))
				else:
					nice_lumi_filter = str.join(', ', lumi_filter_str)
				config_files = self._nmCfg.lookup(nick, '', is_selector = False)
				tmp = {0: nick, 1: str.join(', ', imap(os.path.basename, config_files)), 2: nice_lumi_filter}
				lookupvars = {'DATASETNICK': nick}
				for src in self._pm.lookupSources:
					src.fillParameterInfo(None, lookupvars)
				tmp.update(lookupvars)
				report.append(tmp)
			utils.printTabular(head, report, 'cl')
			utils.vprint(level = -1)
Exemplo n.º 49
    def _resync(self):
        if self._data_provider:
            activity = Activity('Performing resync of datasource %r' %
                                self._name)
            # Get old and new dataset information
            ds_old = DataProvider.loadFromFile(
                self._getDataPath('cache.dat')).getBlocks(show_stats=False)
            self._data_provider.clearCache()
            ds_new = self._data_provider.getBlocks(show_stats=False)
            self._data_provider.saveToFile(self._getDataPath('cache-new.dat'),
                                           ds_new)

            # Use old splitting information to synchronize with new dataset infos
            old_maxN = self._data_splitter.getMaxJobs()
            jobChanges = self._data_splitter.resyncMapping(
                self._getDataPath('map-new.tar'), ds_old, ds_new)
            activity.finish()
            if jobChanges is not None:
                # Move current splitting to backup and use the new splitting from now on
                def backupRename(old, cur, new):
                    if self._keepOld:
                        os.rename(self._getDataPath(cur),
                                  self._getDataPath(old))
                    os.rename(self._getDataPath(new), self._getDataPath(cur))

                backupRename('map-old-%d.tar' % time.time(), 'map.tar',
                             'map-new.tar')
                backupRename('cache-old-%d.dat' % time.time(), 'cache.dat',
                             'cache-new.dat')
                self._data_splitter.importPartitions(
                    self._getDataPath('map.tar'))
                self._maxN = self._data_splitter.getMaxJobs()
                self._log.debug('Dataset resync finished: %d -> %d partitions',
                                old_maxN, self._maxN)
                return (set(jobChanges[0]), set(jobChanges[1]),
                        old_maxN != self._maxN)
Exemplo n.º 50
    def resync(self):
        (result_redo, result_disable,
         result_sizeChange) = ParameterSource.resync(self)
        if self.resyncEnabled() and self._dataProvider:
            # Get old and new dataset information
            old = DataProvider.loadFromFile(
                self.getDataPath('cache.dat')).getBlocks()
            self._dataProvider.clearCache()
            new = self._dataProvider.getBlocks()
            self._dataProvider.saveToFile(self.getDataPath('cache-new.dat'),
                                          new)

            # Use old splitting information to synchronize with new dataset infos
            jobChanges = self._dataSplitter.resyncMapping(
                self.getDataPath('map-new.tar'), old, new)
            if jobChanges:
                # Move current splitting to backup and use the new splitting from now on
                def backupRename(old, cur, new):
                    if self._keepOld:
                        os.rename(self.getDataPath(cur), self.getDataPath(old))
                    os.rename(self.getDataPath(new), self.getDataPath(cur))

                backupRename('map-old-%d.tar' % time.time(), 'map.tar',
                             'map-new.tar')
                backupRename('cache-old-%d.dat' % time.time(), 'cache.dat',
                             'cache-new.dat')
                old_maxN = self._dataSplitter.getMaxJobs()
                self._dataSplitter.importPartitions(
                    self.getDataPath('map.tar'))
                self._maxN = self._dataSplitter.getMaxJobs()
                result_redo.update(jobChanges[0])
                result_disable.update(jobChanges[1])
                result_sizeChange = result_sizeChange or (old_maxN !=
                                                          self._maxN)
            self.resyncFinished()
        return (result_redo, result_disable, result_sizeChange)
Exemplo n.º 51
    def _resync_psrc(self):
        activity = Activity('Performing resync of datasource %r' %
                            self.get_datasource_name())
        # Get old and new dataset information
        provider_old = DataProvider.load_from_file(
            self._get_data_path('cache.dat'))
        block_list_old = provider_old.get_block_list_cached(show_stats=False)
        self._provider.clear_cache()
        block_list_new = self._provider.get_block_list_cached(show_stats=False)
        self._provider.save_to_file(self._get_data_path('cache-new.dat'),
                                    block_list_new)

        # Use old splitting information to synchronize with new dataset infos
        partition_len_old = self.get_parameter_len()
        partition_changes = self._resync_partitions(
            self._get_data_path('map-new.tar'), block_list_old, block_list_new)
        activity.finish()
        if partition_changes is not None:
            # Move current splitting to backup and use the new splitting from now on
            def _rename_with_backup(new, cur, old):
                if self._keep_old:
                    os.rename(self._get_data_path(cur),
                              self._get_data_path(old))
                os.rename(self._get_data_path(new), self._get_data_path(cur))

            _rename_with_backup('map-new.tar', 'map.tar',
                                'map-old-%d.tar' % time.time())
            _rename_with_backup('cache-new.dat', 'cache.dat',
                                'cache-old-%d.dat' % time.time())
            self._set_reader(
                DataSplitter.load_partitions(self._get_data_path('map.tar')))
            self._log.debug('Dataset resync finished: %d -> %d partitions',
                            partition_len_old, self._len)
            (pnum_list_redo, pnum_list_disable) = partition_changes
            return (set(pnum_list_redo), set(pnum_list_disable),
                    partition_len_old != self._len)
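
The three resync variants above all rotate their state files the same way: the current 'map.tar' and 'cache.dat' are parked under a timestamped backup name (when old state is kept) before the freshly written '*-new' files are promoted. A standalone sketch of that rotation, with hypothetical paths and no grid-control dependencies:

    import os
    import time

    def rotate_state_file(path_new, path_cur, keep_old=True):
        # park the current file under a timestamped name, then promote the new one
        if keep_old and os.path.exists(path_cur):
            os.rename(path_cur, '%s.old-%d' % (path_cur, int(time.time())))
        os.rename(path_new, path_cur)
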
Exemplo n.º 52
def save_dataset(fn, block_list):
    DataProvider.save_to_file(fn, block_list)
    logging.getLogger('script').info('Dataset information saved to %r', fn)
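
A typical call site for the helper above, assuming a provider object created as in the earlier examples; the output file name is arbitrary:

    blocks = provider.get_block_list_cached(show_stats=False)
    save_dataset('datacache.dat', blocks)
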
Exemplo n.º 53
	selected = JobSelector.create(opts.job_selector, task = task)
	logging.info('Matching jobs: ' + str.join(' ', imap(str, jobDB.getJobsIter(selected))))
	if opts.job_reset_attempts:
		jobs_reset_attempts(jobDB, selected)
	if opts.job_force_state:
		jobs_force_state(opts, jobDB, selected)
	if opts.job_show_jdl:
		jobs_show_jdl(jobDB, selected)

########################################################
# DATASET INFOS

if opts.dataset_show_diff:
	if len(args) != 2:
		utils.exitWithUsage('%s <dataset source 1> <dataset source 2>' % sys.argv[0])
	a = DataProvider.createInstance('ListProvider', config, args[0], None)
	b = DataProvider.createInstance('ListProvider', config, args[1], None)
	(blocksAdded, blocksMissing, blocksChanged) = DataProvider.resyncSources(a.getBlocks(show_stats = False), b.getBlocks(show_stats = False))
	utils.printTabular([(DataProvider.Dataset, 'Dataset'), (DataProvider.BlockName, 'Block')], blocksMissing)

if opts.dataset_show_removed:
	if len(args) < 2:
		utils.exitWithUsage('%s <dataset source 1> <dataset source 2> ... <dataset source N> ' % sys.argv[0])
	removed = []
	oldDP = DataProvider.createInstance('ListProvider', config, args[0], None)
	for new in args[1:]:
		newDP = DataProvider.createInstance('ListProvider', config, new, None)
		(blocksAdded, blocksMissing, blocksChanged) = DataProvider.resyncSources(oldDP.getBlocks(show_stats = False), newDP.getBlocks(show_stats = False))
		for block in blocksMissing:
			tmp = dict(block)
			tmp[-1] = new
			removed.append(tmp)  # collect blocks missing from the newer source
Exemplo n.º 54
def save_dataset(fn, block_list):
	DataProvider.save_to_file(fn, block_list)
	logging.getLogger('script').info('Dataset information saved to %r', fn)
Exemplo n.º 55
    def __init__(self, config):
        InfoScanner.__init__(self, config)
        dsPath = config.get('source dataset path')
        self._source = DataProvider.createInstance('ListProvider', config, dsPath)
Exemplo n.º 56
    def _get_fi_class(self, fi, block):
        run_range = self._run_range.lookup(DataProvider.get_block_id(block))
        metadata_idx = block[DataProvider.Metadata].index('Runs')
        return tuple(imap(lambda r: int(r / run_range), fi[DataProvider.Metadata][metadata_idx]))
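
The integer division above buckets run numbers into fixed-width windows, so files whose runs fall into the same window share the same class tuple. A standalone illustration with made-up numbers:

    run_range = 1000
    runs = [273401, 273402, 274100]
    print(tuple(int(r / run_range) for r in runs))  # -> (273, 273, 274)
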
Exemplo n.º 57
	def __init__(self, config):
		dsPath = config.get('source dataset path')
		self.source = DataProvider.create(config, None, dsPath, 'ListProvider')
Exemplo n.º 58
	def processBlock(self, block):
		if self._lumi_filter.empty() and ((self._lumi_keep == LumiKeep.RunLumi) or (DataProvider.Metadata not in block)):
			return block
		def getMetadataIdx(key):
			if key in block.get(DataProvider.Metadata, []):
				return block[DataProvider.Metadata].index(key)
		idxRuns = getMetadataIdx('Runs')
		idxLumi = getMetadataIdx('Lumi')
		if not self._lumi_filter.empty():
			lumi_filter = self._lumi_filter.lookup(block[DataProvider.Nickname], is_selector = False)
			if lumi_filter and (self._lumi_strict == LumiMode.strict) and ((idxRuns is None) or (idxLumi is None)):
				raise DatasetError('Strict lumi filter active but dataset %s does not provide lumi information!' % DataProvider.bName(block))
			elif lumi_filter and (self._lumi_strict == LumiMode.weak) and (idxRuns is None):
				raise DatasetError('Weak lumi filter active but dataset %s does not provide run information!' % DataProvider.bName(block))

		block[DataProvider.FileList] = list(self._processFI(block, idxRuns, idxLumi))
		if not block[DataProvider.FileList]:
			return
		block[DataProvider.NEntries] = sum(imap(lambda fi: fi[DataProvider.NEntries], block[DataProvider.FileList]))
		# Prune metadata
		if self._lumi_keep == LumiKeep.RunLumi:
			return block
		elif self._lumi_keep == LumiKeep.Run:
			idxRuns = None
		removeRunLumi(block[DataProvider.Metadata], idxRuns, idxLumi)
		return block