Example #1
	def __init__(self, config, datasource_name, dataset_expr, dataset_nick, provider_list):
		for provider in provider_list:
			provider.disable_stream_singletons()
		DataProvider.__init__(self, config, datasource_name, dataset_expr, dataset_nick)
		self._stats = DataProcessor.create_instance('SimpleStatsDataProcessor', config,
			'dataset', self._log, 'Summary: Running over ')
		self._provider_list = provider_list
Example #2
	def __init__(self, config, datasetExpr, datasetNick = None, datasetID = 0):
		DataProvider.__init__(self, config, datasetExpr, datasetNick, datasetID)

		(self._path, self._events, selist) = utils.optSplit(datasetExpr, '|@')
		self._selist = parseList(selist, ',') or None
		if not (self._path and self._events):
			raise ConfigError('Invalid dataset expression!\nCorrect: /local/path/to/file|events[@SE1,SE2]')
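
Example #2 above (and the later variants of the same constructor) splits a dataset expression of the form /local/path/to/file|events[@SE1,SE2] into a path, an event count and an optional storage-element list via utils.optSplit / split_opt. A minimal standalone sketch of that kind of marker-based splitting, using a hypothetical split_on_markers helper rather than the library routine:

def split_on_markers(expr, markers):
	"""Split expr at the last occurrence of each marker, right to left; missing parts become ''."""
	parts = []
	for marker in reversed(markers):
		if marker in expr:
			(expr, tail) = expr.rsplit(marker, 1)
		else:
			tail = ''
		parts.insert(0, tail.strip())
	return [expr.strip()] + parts

# '/data/file.root|1000@SE1,SE2' -> path, events, SE list; missing parts stay empty
assert split_on_markers('/data/file.root|1000@SE1,SE2', '|@') == ['/data/file.root', '1000', 'SE1,SE2']
assert split_on_markers('/data/file.root|1000', '|@') == ['/data/file.root', '1000', '']
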
Example #3
 def __init__(self, config, datasetExpr, datasetNick, datasetID,
              providerList):
     DataProvider.__init__(self, config, datasetExpr, datasetNick,
                           datasetID)
     self._providerList = providerList
     for provider in self._providerList:
         provider.setPassthrough()
Example #4
	def __init__(self, config, datasource_name, dataset_expr,
			dataset_nick, dataset_proc, scanner_list_default):
		DataProvider.__init__(self, config, datasource_name, dataset_expr, dataset_nick, dataset_proc)
		# Configure scanners
		scanner_config = config.change_view(default_on_change=TriggerResync(['datasets', 'parameters']))
		self._interactive_assignment = config.is_interactive('dataset name assignment', True)

		def _create_scanner(scanner_name):
			return InfoScanner.create_instance(scanner_name, scanner_config, datasource_name)
		scanner_list = scanner_config.get_list('scanner', scanner_list_default) + ['NullScanner']
		self._scanner_list = lmap(_create_scanner, scanner_list)

		# Configure dataset / block naming and selection
		def _setup(prefix):
			selected_hash_list = scanner_config.get_list(join_config_locations(prefix, 'key select'), [])
			name = scanner_config.get(join_config_locations(prefix, 'name pattern'), '')
			return (selected_hash_list, name)
		(self._selected_hash_list_dataset, self._dataset_pattern) = _setup('dataset')
		(self._selected_hash_list_block, self._block_pattern) = _setup('block')

		# Configure hash input for separation of files into datasets / blocks
		def _get_active_hash_input(prefix, guard_entry_idx):
			hash_input_list_user = scanner_config.get_list(join_config_locations(prefix, 'hash keys'), [])
			hash_input_list_guard = scanner_config.get_list(join_config_locations(prefix, 'guard override'),
				lchain(imap(lambda scanner: scanner.get_guard_keysets()[guard_entry_idx], self._scanner_list)))
			return hash_input_list_user + hash_input_list_guard
		self._hash_input_set_dataset = _get_active_hash_input('dataset', 0)
		self._hash_input_set_block = _get_active_hash_input('block', 1)
Example #5
	def __init__(self, config, datasetExpr, datasetNick = None):
		DataProvider.__init__(self, config, datasetExpr, datasetNick)

		(self._path, self._events, selist) = utils.optSplit(datasetExpr, '|@')
		self._selist = parseList(selist, ',') or None
		if not (self._path and self._events):
			raise ConfigError('Invalid dataset expression!\nCorrect: /local/path/to/file|events[@SE1,SE2]')
Example #6
    def __init__(self,
                 config,
                 datasource_name,
                 dataset_expr,
                 dataset_nick=None,
                 dataset_proc=None):
        DataProvider.__init__(self, config, datasource_name, dataset_expr,
                              dataset_nick, dataset_proc)
        self._common_prefix = max(DataProvider.enum_value_list) + 1
        self._common_metadata = max(DataProvider.enum_value_list) + 2

        self._entry_handler_info = {
            'events': (DataProvider.NEntries, int, 'block entry counter'),
            'id': (None, None, 'dataset ID'),  # legacy key - skip
            'metadata':
            (DataProvider.Metadata, parse_json, 'metadata description'),
            'metadata common':
            (self._common_metadata, parse_json, 'common metadata'),
            'nickname': (DataProvider.Nickname, str, 'dataset nickname'),
            'prefix': (self._common_prefix, str, 'common prefix'),
            'se list':
            (DataProvider.Locations, lambda value: parse_list(value, ','),
             'block location'),
        }

        (path, self._forced_prefix,
         self._filter) = split_opt(dataset_expr, '@%')
        self._filename = config.resolve_path(
            path, True, 'Error resolving dataset file: %s' % path)
Example #7
    def __init__(self, config, datasetExpr, datasetNick, datasetID=0):
        DataProvider.__init__(self, config, datasetExpr, datasetNick, datasetID)

        (self._path, self._events, selist) = utils.optSplit(datasetExpr, "|@")
        self._selist = utils.parseList(selist, delimeter=",", onEmpty=None)
        if not (self._path and self._events):
            raise ConfigError("Invalid dataset expression!\nCorrect: /local/path/to/file|events[@SE1,SE2]")
Example #8
	def __init__(self, config, datasource_name, dataset_expr, dataset_nick=None, dataset_proc=None):
		DataProvider.__init__(self, config, datasource_name, dataset_expr, dataset_nick, dataset_proc)

		(self._path, self._events, selist) = split_opt(dataset_expr, '|@')
		self._selist = parse_list(selist, ',') or None
		if not (self._path and self._events):
			raise ConfigError('Invalid dataset expression!\nCorrect: /local/path/to/file|events[@SE1,SE2]')
Example #9
 def __init__(self, config, datasetExpr, datasetNick, sList):
     DataProvider.__init__(self, config, datasetExpr, datasetNick)
     (self._ds_select, self._ds_name, self._ds_keys_user,
      self._ds_keys_guard) = self._setup(config, 'dataset')
     (self._b_select, self._b_name, self._b_keys_user,
      self._b_keys_guard) = self._setup(config, 'block')
     scanList = config.getList('scanner', sList) + ['NullScanner']
     self._scanner = lmap(
         lambda cls: InfoScanner.createInstance(cls, config), scanList)
Example #10
	def processBlock(self, block):
		if block[DataProvider.Locations] is not None:
			sites = self._locationfilter.filterList(block[DataProvider.Locations])
			if (sites is not None) and (len(sites) == 0) and (len(block[DataProvider.FileList]) != 0):
				if not len(block[DataProvider.Locations]):
					self._log.warning('Block %s is not available at any site!', DataProvider.bName(block))
				elif not len(sites):
					self._log.warning('Block %s is not available at any selected site!', DataProvider.bName(block))
			block[DataProvider.Locations] = sites
		return block
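
The filter step above distinguishes two warning cases once the selected site list comes back empty: the block had no locations to begin with, or the filter removed all of them. A plain sketch of that decision, with a simple whitelist standing in for the configured location filter (function and key names here are illustrative):

def filter_locations(block_locations, allowed_sites):
	"""Return (selected sites, warning text); warning is None unless the selection is empty."""
	if block_locations is None:
		return (None, None)  # no location information at all - nothing to filter
	selected = [site for site in block_locations if site in allowed_sites]
	warning = None
	if not selected:
		if not block_locations:
			warning = 'block is not available at any site'
		else:
			warning = 'block is not available at any selected site'
	return (selected, warning)

assert filter_locations([], set(['SE1']))[1] == 'block is not available at any site'
assert filter_locations(['SE2'], set(['SE1']))[1] == 'block is not available at any selected site'
assert filter_locations(['SE1', 'SE2'], set(['SE1']))[1] is None
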
Example #11
	def __init__(self, config, datasetExpr, datasetNick = None, datasetID = 0):
		DataProvider.__init__(self, config, '', datasetNick, datasetID)
		def DSB(cFun, n, *args, **kargs):
			return (cFun('dataset %s' % n, *args, **kargs), cFun('block %s' % n, *args, **kargs))
		(self.nameDS, self.nameB) = DSB(config.get, 'name pattern', '')
		(self.kUserDS, self.kUserB) = DSB(config.getList, 'hash keys', [])
		(self.kGuardDS, self.kGuardB) = DSB(config.getList, 'guard override', [])
		self.kSelectDS = config.getList('dataset key select', [])
		scanList = config.getList('scanner', datasetExpr) + ['NullScanner']
		self.scanner = lmap(lambda cls: InfoScanner.createInstance(cls, config), scanList)
Example #12
	def __init__(self, config, datasource_name, dataset_expr, dataset_nick=None, dataset_proc=None):
		DataProvider.__init__(self, config, datasource_name, dataset_expr, dataset_nick, dataset_proc)

		ds_config = config.change_view(view_class='SimpleConfigView',
			set_sections=['datasource %s' % dataset_expr])
		self._block = self._read_block(ds_config, dataset_expr, dataset_nick)

		def _on_change(config, old_obj, cur_obj, cur_entry, obj2str):
			self._log.critical('Dataset %r changed', dataset_expr)
			return TriggerResync(['datasets', 'parameters'])(config, old_obj, cur_obj, cur_entry, obj2str)
		ds_config.get('dataset hash', self._get_dataset_hash(), persistent=True, on_change=_on_change)
Example #13
	def __init__(self, config, datasetExpr, datasetNick = None, datasetID = 0):
		DataProvider.__init__(self, config, datasetExpr, datasetNick, datasetID)

		config = config.changeView(viewClass = 'SimpleConfigView', setSections = ['datasource %s' % datasetExpr])
		self._block = self._readBlockFromConfig(config, datasetExpr, datasetNick, datasetID)

		dataset_hash_new = md5_hex(repr(self._block))
		dataset_hash_old = config.get('dataset hash', dataset_hash_new, persistent = True)
		self._request_resync = dataset_hash_new != dataset_hash_old
		if self._request_resync:
			self._log.critical('Dataset %r changed', datasetExpr)
			config.setState(True, 'resync', detail = 'dataset')
			config.setState(True, 'resync', detail = 'parameters')
			config.set('dataset hash', dataset_hash_new)
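
The core pattern in example #13 is change detection: hash the parsed block, compare against the hash persisted on the previous run, and request a resync when they differ. A self-contained sketch of the same idea using hashlib and a plain state file instead of the config layer (detect_change and state_path are illustrative names, not library API):

import hashlib
import os

def detect_change(state_path, block):
	"""Return True if the block's hash differs from the one recorded on the previous run."""
	# repr() is only a stable fingerprint if the block is built deterministically
	new_hash = hashlib.md5(repr(block).encode('utf-8')).hexdigest()
	old_hash = None
	if os.path.exists(state_path):
		with open(state_path) as state_file:
			old_hash = state_file.read().strip()
	with open(state_path, 'w') as state_file:
		state_file.write(new_hash)  # persist the current hash for the next run
	return (old_hash is not None) and (old_hash != new_hash)
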
Example #14
    def __init__(self,
                 config,
                 datasource_name,
                 dataset_expr,
                 dataset_nick=None,
                 dataset_proc=None):
        DataProvider.__init__(self, config, datasource_name, dataset_expr,
                              dataset_nick, dataset_proc)

        (self._path, self._events, selist) = split_opt(dataset_expr, '|@')
        self._selist = parse_list(selist, ',') or None
        if not (self._path and self._events):
            raise ConfigError(
                'Invalid dataset expression!\nCorrect: /local/path/to/file|events[@SE1,SE2]'
            )
Example #15
 def processBlock(self, block):
     if block[DataProvider.Locations] is not None:
         sites = self._locationfilter.filterList(
             block[DataProvider.Locations])
         if (sites is not None) and (len(sites) == 0) and (len(
                 block[DataProvider.FileList]) != 0):
             if not len(block[DataProvider.Locations]):
                 self._log.warning('Block %s is not available at any site!',
                                   DataProvider.bName(block))
             elif not len(sites):
                 self._log.warning(
                     'Block %s is not available at any selected site!',
                     DataProvider.bName(block))
         block[DataProvider.Locations] = sites
     return block
Example #16
	def processBlock(self, block):
		# Check entry consistency
		events = sum(imap(lambda x: x[DataProvider.NEntries], block[DataProvider.FileList]))
		if block.setdefault(DataProvider.NEntries, events) != events:
			self._handleError('Inconsistency in block %s: Number of events doesn\'t match (b:%d != f:%d)' % (
				DataProvider.bName(block), block[DataProvider.NEntries], events), self._mode)
		return block
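
This check (see also the later variants in examples #24, #36 and #37) compares the entry count stored on the block with the sum over its file list, adopting the sum when the block carries no count of its own. A minimal sketch on plain dictionaries, with 'entries' and 'files' as illustrative stand-ins for the DataProvider keys:

def check_entry_consistency(block):
	"""Return a mismatch description, or None if the block and file counts agree."""
	file_sum = sum(fi['entries'] for fi in block['files'])
	block_total = block.setdefault('entries', file_sum)  # adopt the sum if no count is set yet
	if block_total != file_sum:
		return 'block claims %d entries, files sum to %d' % (block_total, file_sum)
	return None

assert check_entry_consistency({'files': [{'entries': 10}, {'entries': 5}]}) is None
assert check_entry_consistency({'entries': 20, 'files': [{'entries': 10}]}) is not None
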
Example #17
    def resyncMapping(self, newSplitPath, oldBlocks, newBlocks):
        log = utils.ActivityLog('Performing resynchronization of dataset')
        (blocksAdded, blocksMissing,
         blocksMatching) = DataProvider.resyncSources(oldBlocks, newBlocks)
        for rmBlock in blocksMissing:  # Files in matching blocks are already sorted
            sort_inplace(rmBlock[DataProvider.FileList],
                         key=lambda x: x[DataProvider.URL])
        log.finish()

        # User overview and setup starts here
        resultRedo = []
        resultDisable = []
        newSplitPathTMP = newSplitPath + '.tmp'
        resyncIter = self._resyncIterator(resultRedo, resultDisable,
                                          blocksAdded, blocksMissing,
                                          blocksMatching)
        self.savePartitions(
            newSplitPathTMP,
            resyncIter,
            sourceLen=self.getMaxJobs(),
            message=
            'Performing resynchronization of dataset map (progress is estimated)'
        )

        if self._interactive:
            # TODO: print info and ask
            if not utils.getUserBool(
                    'Do you want to use the new dataset partition?', False):
                return None
        os.rename(newSplitPathTMP, newSplitPath)

        return (resultRedo, resultDisable)
Example #18
	def __init__(self, block_list_old, block_list_new):
		activity = Activity('Performing resynchronization of dataset')
		block_resync_tuple = DataProvider.resync_blocks(block_list_old, block_list_new)
		(self.block_list_added, self._block_list_missing, self._block_list_matching) = block_resync_tuple
		for block_missing in self._block_list_missing:  # Files in matching blocks are already sorted
			sort_inplace(block_missing[DataProvider.FileList], key=itemgetter(DataProvider.URL))
		activity.finish()
Example #19
	def _read_block(self, ds_config, dataset_expr, dataset_nick):
		metadata_name_list = parse_json(ds_config.get('metadata', '[]', on_change=None))
		common_metadata = parse_json(ds_config.get('metadata common', '[]', on_change=None))
		if len(common_metadata) > len(metadata_name_list):
			raise DatasetError('Unable to set %d common metadata items ' % len(common_metadata) +
				'with %d metadata keys' % len(metadata_name_list))
		common_prefix = ds_config.get('prefix', '', on_change=None)
		fn_list = []
		has_events = False
		has_se_list = False
		for url in ds_config.get_option_list():
			if url == 'se list':
				has_se_list = True
			elif url == 'events':
				has_events = True
			elif url not in ['dataset hash', 'metadata', 'metadata common', 'nickname', 'prefix']:
				fi = self._read_fi(ds_config, url, metadata_name_list, common_metadata, common_prefix)
				fn_list.append(fi)
		if not fn_list:
			raise DatasetError('There are no dataset files specified for dataset %r' % dataset_expr)

		result = {
			DataProvider.Nickname: ds_config.get('nickname', dataset_nick or '', on_change=None),
			DataProvider.FileList: sorted(fn_list, key=lambda fi: fi[DataProvider.URL])
		}
		result.update(DataProvider.parse_block_id(dataset_expr))
		if metadata_name_list:
			result[DataProvider.Metadata] = metadata_name_list
		if has_events:
			result[DataProvider.NEntries] = ds_config.get_int('events', -1, on_change=None)
		if has_se_list:
			result[DataProvider.Locations] = parse_list(ds_config.get('se list', '', on_change=None), ',')
		return result
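
_read_block treats every config option that is not one of a handful of reserved keys as a file entry. The classification step can be sketched on a plain dict of options (the dict-based "config" and the helper name are illustrative; the reserved key names are the ones used above):

RESERVED_KEYS = set(['dataset hash', 'events', 'metadata', 'metadata common', 'nickname', 'prefix', 'se list'])

def classify_options(option_dict):
	"""Split raw config options into reserved settings and per-file entries."""
	settings = dict((key, value) for (key, value) in option_dict.items() if key in RESERVED_KEYS)
	file_entries = dict((key, value) for (key, value) in option_dict.items() if key not in RESERVED_KEYS)
	return (settings, file_entries)

(settings, file_entries) = classify_options({'events': '100', '/store/f1.root': '60', '/store/f2.root': '40'})
assert sorted(file_entries) == ['/store/f1.root', '/store/f2.root']
assert settings == {'events': '100'}
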
Example #20
    def process_block(self, block):
        # Check uniqueness of URLs
        url_hash_list = []
        if self._check_url != DatasetUniqueMode.ignore:
            block[DataProvider.FileList] = list(
                self._process_fi_list(url_hash_list,
                                      block[DataProvider.FileList]))
            url_hash_list.sort()

        # Check uniqueness of blocks
        if self._check_block != DatasetUniqueMode.ignore:
            block_hash = md5_hex(
                repr((block.get(DataProvider.Dataset),
                      block[DataProvider.BlockName], url_hash_list,
                      block[DataProvider.NEntries],
                      block[DataProvider.Locations],
                      block.get(DataProvider.Metadata))))
            if block_hash in self._recorded_block:
                msg = 'Multiple occurrences of block: "%s"!' % DataProvider.get_block_id(
                    block)
                msg += ' (This check can be configured with %r)' % 'dataset check unique block'
                if self._check_block == DatasetUniqueMode.warn:
                    self._log.warning(msg)
                elif self._check_block == DatasetUniqueMode.abort:
                    raise DatasetError(msg)
                elif self._check_block == DatasetUniqueMode.skip:
                    return None
            self._recorded_block.add(block_hash)
        return block
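
The block uniqueness check reduces to hashing a tuple of identifying fields and remembering which hashes have been seen, with a configurable reaction to duplicates. A compact sketch of the same dedup-by-hash technique (UniqueMode and deduplicate are illustrative, not the library's DatasetUniqueMode / processor API):

import hashlib

class UniqueMode(object):
	(WARN, SKIP, ABORT) = ('warn', 'skip', 'abort')

def deduplicate(items, key, mode=UniqueMode.SKIP, seen=None):
	"""Yield items whose key was not seen before; warn about, skip or abort on duplicates."""
	seen = seen if (seen is not None) else set()
	for item in items:
		item_hash = hashlib.md5(repr(key(item)).encode('utf-8')).hexdigest()
		if item_hash in seen:
			if mode == UniqueMode.ABORT:
				raise ValueError('duplicate item: %r' % (key(item),))
			elif mode == UniqueMode.SKIP:
				continue  # drop the duplicate silently
			print('warning: duplicate item %r' % (key(item),))  # WARN keeps the duplicate
		seen.add(item_hash)
		yield item

blocks = [{'name': 'block_a'}, {'name': 'block_a'}, {'name': 'block_b'}]
assert [b['name'] for b in deduplicate(blocks, key=lambda b: b['name'])] == ['block_a', 'block_b']
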
Example #21
	def split_partitions(self, block_iter, entry_first=0):
		for block in block_iter:
			entries_per_job = self._entries_per_job.lookup(DataProvider.get_block_id(block))
			for proto_partition in self._partition_block(block[DataProvider.FileList],
					entries_per_job, entry_first):
				entry_first = 0
				yield self._finish_partition(block, proto_partition)
Example #22
	def __init__(self, config, datasource_name, dataset_expr, dataset_nick=None, dataset_proc=None):
		DataProvider.__init__(self, config, datasource_name, dataset_expr, dataset_nick, dataset_proc)
		self._common_prefix = max(DataProvider.enum_value_list) + 1
		self._common_metadata = max(DataProvider.enum_value_list) + 2

		self._entry_handler_info = {
			'events': (DataProvider.NEntries, int, 'block entry counter'),
			'id': (None, None, 'dataset ID'),  # legacy key - skip
			'metadata': (DataProvider.Metadata, parse_json, 'metadata description'),
			'metadata common': (self._common_metadata, parse_json, 'common metadata'),
			'nickname': (DataProvider.Nickname, str, 'dataset nickname'),
			'prefix': (self._common_prefix, str, 'common prefix'),
			'se list': (DataProvider.Locations, lambda value: parse_list(value, ','), 'block location'),
		}

		(path, self._forced_prefix, self._filter) = split_opt(dataset_expr, '@%')
		self._filename = config.resolve_path(path, True, 'Error resolving dataset file: %s' % path)
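
The two max(...) + 1 / + 2 lines reserve private dictionary keys just above the provider's public enum range, so block-local values (common prefix, common metadata) can live in the same block dict without colliding with the regular keys. A tiny sketch of the trick with illustrative integer enum values:

# Illustrative stand-in for DataProvider.enum_value_list
ENUM_VALUE_LIST = [0, 1, 2, 3, 4]  # e.g. NEntries, BlockName, Dataset, Locations, Metadata

# Reserve two private keys above the public range - they can never collide with enum members
COMMON_PREFIX = max(ENUM_VALUE_LIST) + 1
COMMON_METADATA = max(ENUM_VALUE_LIST) + 2

block = {0: 100, COMMON_PREFIX: '/store/common', COMMON_METADATA: ['CMSSW_VERSION']}
assert COMMON_PREFIX not in ENUM_VALUE_LIST
assert COMMON_METADATA not in ENUM_VALUE_LIST
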
Example #23
	def __init__(self, config, datasetExpr, datasetNick = None):
		DataProvider.__init__(self, config, datasetExpr, datasetNick)
		self._CommonPrefix = max(self.enumValues) + 1
		self._CommonMetadata = max(self.enumValues) + 2

		self._handleEntry = {
			'events': (DataProvider.NEntries, int, 'block entry counter'),
			'id': (None, None, 'dataset ID'), # legacy key - skip
			'metadata': (DataProvider.Metadata, parseJSON, 'metadata description'),
			'metadata common': (self._CommonMetadata, parseJSON, 'common metadata'),
			'nickname': (DataProvider.Nickname, str, 'dataset nickname'),
			'prefix': (self._CommonPrefix, str, 'common prefix'),
			'se list': (DataProvider.Locations, lambda value: parseList(value, ','), 'block location'),
		}

		(path, self._forcePrefix, self._filter) = utils.optSplit(datasetExpr, '@%')
		self._filename = config.resolvePath(path, True, 'Error resolving dataset file: %s' % path)
Example #24
	def process_block(self, block):
		# Check entry consistency
		events = sum(imap(itemgetter(DataProvider.NEntries), block[DataProvider.FileList]))
		if block.setdefault(DataProvider.NEntries, events) != events:
			error_msg = 'Inconsistency in block %s: Number of events doesn\'t match (b:%d != f:%d)'
			error_msg = error_msg % (DataProvider.get_block_id(block), block[DataProvider.NEntries], events)
			self._handle_error(error_msg, self._mode)
		return block
Example #25
	def __init__(self, config, datasetExpr, datasetNick = None, datasetID = 0):
		DataProvider.__init__(self, config, datasetExpr, datasetNick, datasetID)
		self._CommonPrefix = max(self.enumValues) + 1
		self._CommonMetadata = max(self.enumValues) + 2

		self._handleEntry = {
			'events': (DataProvider.NEntries, int, 'block entry counter'),
			'id': (DataProvider.DatasetID, int, 'dataset ID'),
			'metadata': (DataProvider.Metadata, parseJSON, 'metadata description'),
			'metadata common': (self._CommonMetadata, parseJSON, 'common metadata'),
			'nickname': (DataProvider.Nickname, str, 'dataset nickname'),
			'prefix': (self._CommonPrefix, str, 'common prefix'),
			'se list': (DataProvider.Locations, lambda value: parseList(value, ','), 'block location'),
		}

		(path, self._forcePrefix, self._filter) = utils.optSplit(datasetExpr, '@%')
		self._filename = config.resolvePath(path, True, 'Error resolving dataset file: %s' % path)
Example #26
		def getFilterEntries():
			for pat in value.split():
				if ':' not in pat.lstrip(':'):
					yield pat
				else:
					for block in DataProvider.getBlocksFromExpr(config, ':%s' % pat.lstrip(':')):
						for fi in block[DataProvider.FileList]:
							yield fi[DataProvider.URL]
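
This helper and its later variants (examples #29, #32 and #39) expand patterns that carry an embedded ':' into the URLs of a resolved dataset and pass every other pattern through unchanged. A standalone sketch of that expansion, with a caller-supplied resolver callable in place of DataProvider:

def iter_filter_entries(value, resolve_dataset):
	"""Yield plain patterns directly; expand ':<dataset expr>' patterns via resolve_dataset."""
	for pattern in value.split():
		if ':' not in pattern.lstrip(':'):
			yield pattern
		else:
			for url in resolve_dataset(':%s' % pattern.lstrip(':')):
				yield url

# A fake resolver mapping one dataset expression to its file URLs
fake_catalog = {':list:files.dbs': ['/store/a.root', '/store/b.root']}
entries = list(iter_filter_entries('*.root :list:files.dbs', fake_catalog.__getitem__))
assert entries == ['*.root', '/store/a.root', '/store/b.root']
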
Example #27
	def _create_block(self, block_name):
		result = {
			DataProvider.Locations: None,
			DataProvider.FileList: [],
			self._common_prefix: None,
			self._common_metadata: [],
		}
		result.update(DataProvider.parse_block_id(block_name.lstrip('[').rstrip(']')))
		return result
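
parse_block_id, used above and in several neighbouring examples, turns an expression such as '/some/dataset#block0' into its dataset and block components (DBS-style 'dataset#block' notation). A rough illustrative parser, assuming that convention rather than reproducing the library routine:

def parse_block_id_sketch(expr):
	"""Split a 'dataset#block' expression (the block part is optional) into a small dict."""
	(dataset, _, block_name) = expr.partition('#')
	result = {'dataset': dataset}
	if block_name:
		result['block'] = block_name
	return result

assert parse_block_id_sketch('/some/dataset#block0') == {'dataset': '/some/dataset', 'block': 'block0'}
assert parse_block_id_sketch('/some/dataset') == {'dataset': '/some/dataset'}
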
Example #28
 def split_partitions(self, block_iter, entry_first=0):
     for block in block_iter:
         entries_per_job = self._entries_per_job.lookup(
             DataProvider.get_block_id(block))
         for proto_partition in self._partition_block(
                 block[DataProvider.FileList], entries_per_job,
                 entry_first):
             entry_first = 0
             yield self._finish_partition(block, proto_partition)
Example #29
 def getFilterEntries():
     for pat in value.split():
         if ':' not in pat.lstrip(':'):
             yield pat
         else:
             for block in DataProvider.getBlocksFromExpr(
                     config, ':%s' % pat.lstrip(':')):
                 for fi in block[DataProvider.FileList]:
                     yield fi[DataProvider.URL]
Example #30
 def __init__(self, block_list_old, block_list_new):
     activity = Activity('Performing resynchronization of dataset')
     block_resync_tuple = DataProvider.resync_blocks(
         block_list_old, block_list_new)
     (self.block_list_added, self._block_list_missing,
      self._block_list_matching) = block_resync_tuple
     for block_missing in self._block_list_missing:  # Files in matching blocks are already sorted
         sort_inplace(block_missing[DataProvider.FileList],
                      key=itemgetter(DataProvider.URL))
     activity.finish()
Example #31
    def __init__(self, config, datasetExpr, datasetNick=None, datasetID=0):
        DataProvider.__init__(self, config, datasetExpr, datasetNick,
                              datasetID)

        config = config.changeView(viewClass='SimpleConfigView',
                                   setSections=['datasource %s' % datasetExpr])
        self._block = self._readBlockFromConfig(config, datasetExpr,
                                                datasetNick, datasetID)

        dataset_hash_new = md5_hex(repr(self._block))
        dataset_hash_old = config.get('dataset hash',
                                      dataset_hash_new,
                                      persistent=True)
        self._request_resync = dataset_hash_new != dataset_hash_old
        if self._request_resync:
            self._log.critical('Dataset %r changed', datasetExpr)
            config.setState(True, 'resync', detail='dataset')
            config.setState(True, 'resync', detail='parameters')
            config.set('dataset hash', dataset_hash_new)
Example #32
		def getFilterEntries():
			for pat in value.split():
				if ':' not in pat.lstrip(':'):
					yield pat
				else:
					for dfac in DataProvider.bind(':%s' % pat.lstrip(':'), config = config):
						dproc = dfac.getBoundInstance()
						for block in dproc.getBlocks():
							for fi in block[DataProvider.FileList]:
								yield fi[DataProvider.URL]
Example #33
    def __init__(self, config, datasetExpr, datasetNick=None):
        DataProvider.__init__(self, config, datasetExpr, datasetNick)

        ds_config = config.changeView(
            viewClass='SimpleConfigView',
            setSections=['datasource %s' % datasetExpr])
        self._block = self._readBlockFromConfig(ds_config, datasetExpr,
                                                datasetNick)

        def onChange(config, old_obj, cur_obj, cur_entry, obj2str):
            self._log.critical('Dataset %r changed', datasetExpr)
            return triggerResync(['datasets',
                                  'parameters'])(config, old_obj, cur_obj,
                                                 cur_entry, obj2str)

        ds_config.get('dataset hash',
                      self.getHash(),
                      persistent=True,
                      onChange=onChange)
Example #34
 def _create_block(self, block_name):
     result = {
         DataProvider.Locations: None,
         DataProvider.FileList: [],
         self._common_prefix: None,
         self._common_metadata: [],
     }
     result.update(
         DataProvider.parse_block_id(block_name.lstrip('[').rstrip(']')))
     return result
Example #35
	def divide_blocks(self, block_iter):
		for block in block_iter:
			fi_idx_start = 0
			files_per_job = self._files_per_job.lookup(DataProvider.get_block_id(block))
			if files_per_job <= 0:
				raise PartitionError('Invalid number of files per job: %d' % files_per_job)
			while fi_idx_start < len(block[DataProvider.FileList]):
				fi_list = block[DataProvider.FileList][fi_idx_start:fi_idx_start + files_per_job]
				fi_idx_start += files_per_job
				if fi_list:
					yield self._create_sub_block(block, fi_list)
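
divide_blocks here simply walks the block's file list in fixed-size chunks of files_per_job. The chunking on its own, independent of the block structure:

def chunk_files(file_list, files_per_job):
	"""Yield successive chunks of at most files_per_job files."""
	if files_per_job <= 0:
		raise ValueError('Invalid number of files per job: %d' % files_per_job)
	for start in range(0, len(file_list), files_per_job):
		yield file_list[start:start + files_per_job]

assert list(chunk_files(['a', 'b', 'c', 'd', 'e'], 2)) == [['a', 'b'], ['c', 'd'], ['e']]
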
Example #36
 def process_block(self, block):
     # Check entry consistency
     events = sum(
         imap(itemgetter(DataProvider.NEntries),
              block[DataProvider.FileList]))
     if block.setdefault(DataProvider.NEntries, events) != events:
         error_msg = 'Inconsistency in block %s: Number of events doesn\'t match (b:%d != f:%d)'
         error_msg = error_msg % (DataProvider.get_block_id(block),
                                  block[DataProvider.NEntries], events)
         self._handle_error(error_msg, self._mode)
     return block
Example #37
 def processBlock(self, block):
     # Check entry consistency
     events = sum(
         imap(lambda x: x[DataProvider.NEntries],
              block[DataProvider.FileList]))
     if block.setdefault(DataProvider.NEntries, events) != events:
         self._handleError(
             'Inconsistency in block %s: Number of events doesn\'t match (b:%d != f:%d)'
             % (DataProvider.bName(block), block[DataProvider.NEntries],
                events), self._mode)
     return block
Example #38
	def _get_fi_class(self, fi, block):
		metadata_name_list = block.get(DataProvider.Metadata, [])
		metadata_name_list_selected = self._metadata_user_list.lookup(DataProvider.get_block_id(block))
		metadata_idx_list = lmap(lambda metadata_name: safe_index(metadata_name_list, metadata_name),
			metadata_name_list_selected)

		def _query_metadata(idx):
			if (idx is not None) and (idx < len(fi[DataProvider.Metadata])):
				return fi[DataProvider.Metadata][idx]
			return ''
		return tuple(imap(_query_metadata, metadata_idx_list))
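
The partition key built here maps a list of selected metadata names to their column indices in the block and reads those columns from each file entry, falling back to '' when a name or column is missing. A plain-Python sketch of that lookup (argument layout is illustrative):

def metadata_key(selected_names, block_metadata_names, file_metadata_values):
	"""Return a tuple of the selected metadata values for one file, '' where unavailable."""
	def lookup(name):
		try:
			idx = block_metadata_names.index(name)
		except ValueError:
			return ''  # metadata name not present in this block
		if idx < len(file_metadata_values):
			return file_metadata_values[idx]
		return ''  # file entry carries fewer metadata columns than the block declares
	return tuple(lookup(name) for name in selected_names)

assert metadata_key(['RUN', 'ERA'], ['ERA', 'RUN'], ['2018A', '316000']) == ('316000', '2018A')
assert metadata_key(['RUN', 'MISSING'], ['RUN'], ['316000']) == ('316000', '')
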
Example #39
 def _get_filter_entries():
     for pat in value.split():
         if ':' not in pat.lstrip(':'):
             yield pat
         else:
             block_iter = DataProvider.iter_blocks_from_expr(
                 config,
                 ':%s' % pat.lstrip(':'),
                 dataset_proc=dataset_proc)
             for block in block_iter:
                 for fi in block[DataProvider.FileList]:
                     yield fi[DataProvider.URL]
Example #40
    def _get_fi_class(self, fi, block):
        metadata_name_list = block.get(DataProvider.Metadata, [])
        metadata_name_list_selected = self._metadata_user_list.lookup(
            DataProvider.get_block_id(block))
        metadata_idx_list = lmap(
            lambda metadata_name: safe_index(metadata_name_list, metadata_name
                                             ), metadata_name_list_selected)

        def _query_metadata(idx):
            if (idx is not None) and (idx < len(fi[DataProvider.Metadata])):
                return fi[DataProvider.Metadata][idx]
            return ''

        return tuple(imap(_query_metadata, metadata_idx_list))
Example #41
 def process_block(self, block):
     if block[DataProvider.Locations] is not None:
         sites = self._location_filter.filter_list(
             block[DataProvider.Locations])
         if (sites is not None) and (len(sites) == 0) and (len(
                 block[DataProvider.FileList]) != 0):
             error_msg = 'Block %s is not available ' % DataProvider.get_block_id(
                 block)
             if not len(block[DataProvider.Locations]):
                 self._log.warning(error_msg + 'at any site!')
             elif not len(sites):
                 self._log.warning(error_msg + 'at any selected site!')
         block[DataProvider.Locations] = sites
     return block
Example #42
def discover_blocks(options):
	# Get work directory, create dbs dump directory
	if os.path.isdir(options.args[0]):
		workDir = os.path.abspath(os.path.normpath(options.args[0]))
	else:
		workDir = getConfig(configFile = options.args[0]).getWorkPath()
	if not options.opts.tempdir:
		options.opts.tempdir = os.path.join(workDir, 'dbs')
	if not os.path.exists(options.opts.tempdir):
		os.mkdir(options.opts.tempdir)

	# get provider with dataset information
	if options.opts.input_file:
		provider = DataProvider.createInstance('ListProvider', getConfig(), options.opts.input_file, None)
	else:
		config = getConfig(configDict = {'dataset': options.config_dict})
		provider = DataProvider.createInstance('DBSInfoProvider', config, options.args[0], None)

	blocks = provider.getBlocks()
	DataProvider.saveToFile(os.path.join(options.opts.tempdir, 'dbs.dat'), blocks)
	if options.opts.discovery:
		sys.exit(os.EX_OK)
	return blocks
Example #43
	def divide_blocks(self, block_iter):
		for block in block_iter:
			(entries, fi_list) = (0, [])
			entries_per_job = self._entries_per_job.lookup(DataProvider.get_block_id(block))
			if entries_per_job <= 0:
				raise PartitionError('Invalid number of entries per job: %d' % entries_per_job)
			for fi in block[DataProvider.FileList]:
				if fi_list and (entries + fi[DataProvider.NEntries] > entries_per_job):
					yield self._create_sub_block(block, fi_list)
					(entries, fi_list) = (0, [])
				fi_list.append(fi)
				entries += fi[DataProvider.NEntries]
			if fi_list:
				yield self._create_sub_block(block, fi_list)
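
This entry-based variant packs whole files into a sub-block until adding the next file would exceed the per-job entry budget; a single oversized file still gets a sub-block of its own. The grouping logic in isolation, over (name, entries) pairs:

def group_by_entries(files, entries_per_job):
	"""Group (name, entries) pairs so that each group's entry sum stays within entries_per_job."""
	if entries_per_job <= 0:
		raise ValueError('Invalid number of entries per job: %d' % entries_per_job)
	(entries, group) = (0, [])
	for (name, file_entries) in files:
		if group and (entries + file_entries > entries_per_job):
			yield group
			(entries, group) = (0, [])
		group.append((name, file_entries))
		entries += file_entries
	if group:
		yield group

files = [('a', 40), ('b', 70), ('c', 20), ('d', 90)]
assert list(group_by_entries(files, 100)) == [[('a', 40)], [('b', 70), ('c', 20)], [('d', 90)]]
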
Example #44
 def divide_blocks(self, block_iter):
     for block in block_iter:
         fi_idx_start = 0
         files_per_job = self._files_per_job.lookup(
             DataProvider.get_block_id(block))
         if files_per_job <= 0:
             raise PartitionError('Invalid number of files per job: %d' %
                                  files_per_job)
         while fi_idx_start < len(block[DataProvider.FileList]):
             fi_list = block[
                 DataProvider.FileList][fi_idx_start:fi_idx_start +
                                        files_per_job]
             fi_idx_start += files_per_job
             if fi_list:
                 yield self._create_sub_block(block, fi_list)
Example #45
    def __init__(self,
                 config,
                 datasource_name,
                 dataset_expr,
                 dataset_nick=None,
                 dataset_proc=None):
        DataProvider.__init__(self, config, datasource_name, dataset_expr,
                              dataset_nick, dataset_proc)

        ds_config = config.change_view(
            view_class='SimpleConfigView',
            set_sections=['datasource %s' % dataset_expr])
        self._block = self._read_block(ds_config, dataset_expr, dataset_nick)

        def _on_change(config, old_obj, cur_obj, cur_entry, obj2str):
            self._log.critical('Dataset %r changed', dataset_expr)
            return TriggerResync(['datasets',
                                  'parameters'])(config, old_obj, cur_obj,
                                                 cur_entry, obj2str)

        ds_config.get('dataset hash',
                      self._get_dataset_hash(),
                      persistent=True,
                      on_change=_on_change)
Example #46
    def processBlock(self, block):
        # Check uniqueness of URLs
        recordedBlockURL = []
        if self._checkURL != DatasetUniqueMode.ignore:

            def processFI(fiList):
                for fi in fiList:
                    urlHash = md5_hex(
                        repr((fi[DataProvider.URL], fi[DataProvider.NEntries],
                              fi.get(DataProvider.Metadata))))
                    if urlHash in self._recordedURL:
                        msg = 'Multiple occurrences of URL: %r!' % fi[
                            DataProvider.URL]
                        msg += ' (This check can be configured with %r)' % 'dataset check unique url'
                        if self._checkURL == DatasetUniqueMode.warn:
                            self._log.warning(msg)
                        elif self._checkURL == DatasetUniqueMode.abort:
                            raise DatasetError(msg)
                        elif self._checkURL == DatasetUniqueMode.skip:
                            continue
                    self._recordedURL.add(urlHash)
                    recordedBlockURL.append(urlHash)
                    yield fi

            block[DataProvider.FileList] = list(
                processFI(block[DataProvider.FileList]))
            recordedBlockURL.sort()

        # Check uniqueness of blocks
        if self._checkBlock != DatasetUniqueMode.ignore:
            blockHash = md5_hex(
                repr((block.get(DataProvider.Dataset),
                      block[DataProvider.BlockName], recordedBlockURL,
                      block[DataProvider.NEntries],
                      block[DataProvider.Locations],
                      block.get(DataProvider.Metadata))))
            if blockHash in self._recordedBlock:
                msg = 'Multiple occurrences of block: "%s"!' % DataProvider.bName(
                    block)
                msg += ' (This check can be configured with %r)' % 'dataset check unique block'
                if self._checkBlock == DatasetUniqueMode.warn:
                    self._log.warning(msg)
                elif self._checkBlock == DatasetUniqueMode.abort:
                    raise DatasetError(msg)
                elif self._checkBlock == DatasetUniqueMode.skip:
                    return None
            self._recordedBlock.add(blockHash)
        return block
Example #47
    def _read_block(self, ds_config, dataset_expr, dataset_nick):
        metadata_name_list = parse_json(
            ds_config.get('metadata', '[]', on_change=None))
        common_metadata = parse_json(
            ds_config.get('metadata common', '[]', on_change=None))
        if len(common_metadata) > len(metadata_name_list):
            raise DatasetError('Unable to set %d common metadata items ' %
                               len(common_metadata) + 'with %d metadata keys' %
                               len(metadata_name_list))
        common_prefix = ds_config.get('prefix', '', on_change=None)
        fn_list = []
        has_events = False
        has_se_list = False
        for url in ds_config.get_option_list():
            if url == 'se list':
                has_se_list = True
            elif url == 'events':
                has_events = True
            elif url not in [
                    'dataset hash', 'metadata', 'metadata common', 'nickname',
                    'prefix'
            ]:
                fi = self._read_fi(ds_config, url, metadata_name_list,
                                   common_metadata, common_prefix)
                fn_list.append(fi)
        if not fn_list:
            raise DatasetError(
                'There are no dataset files specified for dataset %r' %
                dataset_expr)

        result = {
            DataProvider.Nickname:
            ds_config.get('nickname', dataset_nick or '', on_change=None),
            DataProvider.FileList:
            sorted(fn_list, key=lambda fi: fi[DataProvider.URL])
        }
        result.update(DataProvider.parse_block_id(dataset_expr))
        if metadata_name_list:
            result[DataProvider.Metadata] = metadata_name_list
        if has_events:
            result[DataProvider.NEntries] = ds_config.get_int('events',
                                                              -1,
                                                              on_change=None)
        if has_se_list:
            result[DataProvider.Locations] = parse_list(
                ds_config.get('se list', '', on_change=None), ',')
        return result
Example #48
 def divide_blocks(self, block_iter):
     for block in block_iter:
         (entries, fi_list) = (0, [])
         entries_per_job = self._entries_per_job.lookup(
             DataProvider.get_block_id(block))
         if entries_per_job <= 0:
             raise PartitionError('Invalid number of entries per job: %d' %
                                  entries_per_job)
         for fi in block[DataProvider.FileList]:
             if fi_list and (entries + fi[DataProvider.NEntries] >
                             entries_per_job):
                 yield self._create_sub_block(block, fi_list)
                 (entries, fi_list) = (0, [])
             fi_list.append(fi)
             entries += fi[DataProvider.NEntries]
         if fi_list:
             yield self._create_sub_block(block, fi_list)
Example #49
	def resyncMapping(self, newSplitPath, oldBlocks, newBlocks):
		activity = Activity('Performing resynchronization of dataset')
		(blocksAdded, blocksMissing, blocksMatching) = DataProvider.resyncSources(oldBlocks, newBlocks)
		for rmBlock in blocksMissing: # Files in matching blocks are already sorted
			sort_inplace(rmBlock[DataProvider.FileList], key = lambda x: x[DataProvider.URL])
		activity.finish()

		# User overview and setup starts here
		resultRedo = []
		resultDisable = []
		newSplitPathTMP = newSplitPath + '.tmp'
		resyncIter = self._resyncIterator(resultRedo, resultDisable, blocksAdded, blocksMissing, blocksMatching)
		self.savePartitions(newSplitPathTMP, resyncIter, sourceLenHint = self.getMaxJobs(),
			message = 'Performing resynchronization of dataset map (progress is estimated)')

		if self._interactive:
			# TODO: print info and ask
			if not utils.getUserBool('Do you want to use the new dataset partition?', False):
				return
		os.rename(newSplitPathTMP, newSplitPath)

		return (resultRedo, resultDisable)
Example #50
	def processBlock(self, block):
		# Check uniqueness of URLs
		recordedBlockURL = []
		if self._checkURL != DatasetUniqueMode.ignore:
			def processFI(fiList):
				for fi in fiList:
					urlHash = md5_hex(repr((fi[DataProvider.URL], fi[DataProvider.NEntries], fi.get(DataProvider.Metadata))))
					if urlHash in self._recordedURL:
						msg = 'Multiple occurrences of URL: %r!' % fi[DataProvider.URL]
						msg += ' (This check can be configured with %r)' % 'dataset check unique url'
						if self._checkURL == DatasetUniqueMode.warn:
							self._log.warning(msg)
						elif self._checkURL == DatasetUniqueMode.abort:
							raise DatasetError(msg)
						elif self._checkURL == DatasetUniqueMode.skip:
							continue
					self._recordedURL.add(urlHash)
					recordedBlockURL.append(urlHash)
					yield fi
			block[DataProvider.FileList] = list(processFI(block[DataProvider.FileList]))
			recordedBlockURL.sort()

		# Check uniqueness of blocks
		if self._checkBlock != DatasetUniqueMode.ignore:
			blockHash = md5_hex(repr((block.get(DataProvider.Dataset), block[DataProvider.BlockName],
				recordedBlockURL, block[DataProvider.NEntries],
				block[DataProvider.Locations], block.get(DataProvider.Metadata))))
			if blockHash in self._recordedBlock:
				msg = 'Multiple occurrences of block: "%s"!' % DataProvider.bName(block)
				msg += ' (This check can be configured with %r)' % 'dataset check unique block'
				if self._checkBlock == DatasetUniqueMode.warn:
					self._log.warning(msg)
				elif self._checkBlock == DatasetUniqueMode.abort:
					raise DatasetError(msg)
				elif self._checkBlock == DatasetUniqueMode.skip:
					return None
			self._recordedBlock.add(blockHash)
		return block
Example #51
	def process_block(self, block):
		# Check uniqueness of URLs
		url_hash_list = []
		if self._check_url != DatasetUniqueMode.ignore:
			block[DataProvider.FileList] = list(self._process_fi_list(url_hash_list,
				block[DataProvider.FileList]))
			url_hash_list.sort()

		# Check uniqueness of blocks
		if self._check_block != DatasetUniqueMode.ignore:
			block_hash = md5_hex(repr((block.get(DataProvider.Dataset), block[DataProvider.BlockName],
				url_hash_list, block[DataProvider.NEntries],
				block[DataProvider.Locations], block.get(DataProvider.Metadata))))
			if block_hash in self._recorded_block:
				msg = 'Multiple occurrences of block: "%s"!' % DataProvider.get_block_id(block)
				msg += ' (This check can be configured with %r)' % 'dataset check unique block'
				if self._check_block == DatasetUniqueMode.warn:
					self._log.warning(msg)
				elif self._check_block == DatasetUniqueMode.abort:
					raise DatasetError(msg)
				elif self._check_block == DatasetUniqueMode.skip:
					return None
			self._recorded_block.add(block_hash)
		return block
Example #52
def main():
    usage = '%s [OPTIONS] <config file / work directory>' % sys.argv[0]
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-G', '--globaltag', dest='globaltag', default='crab2_tag', help='Specify global tag')
    parser.add_option('-F', '--input', dest='inputFile', default=None,
                      help='Specify dbs input file to use instead of scanning job output')
#    parser.add_option('-k', '--key-select',      dest='dataset key select', default='',
#        help='Specify dataset keys to process')
    parser.add_option('-c', '--continue-migration', dest='continue_migration', default=False, action='store_true',
                      help='Continue an already started migration')

    ogDiscover = optparse.OptionGroup(parser, 'Discovery options - ignored in case dbs input file is specified', '')
    ogDiscover.add_option('-n', '--name',        dest='dataset name pattern', default='',
        help='Specify dbs path name - Example: DataSet_@NICK@_@VAR@')
    ogDiscover.add_option('-T', '--datatype',    dest='datatype',      default=None,
        help='Supply dataset type in case cmssw report did not specify it - valid values: "mc" or "data"')
    ogDiscover.add_option('-m', '--merge',       dest='merge parents', default=False,  action='store_true',
        help='Merge output files from different parent blocks into a single block [Default: Keep boundaries]')
    ogDiscover.add_option('-j', '--jobhash',     dest='useJobHash',    default=False,  action='store_true',
        help='Use hash of all config files in job for dataset key calculation')
    ogDiscover.add_option('-u', '--unique-cfg',  dest='uniqueCfg',     default=False,  action='store_true',
        help='Circumvent edmConfigHash collisions so each dataset is stored with unique config information')
    ogDiscover.add_option('-P', '--parent',      dest='parent source', default='',
        help='Override parent information source - to bootstrap a reprocessing on local files')
    ogDiscover.add_option('-H', '--hash-keys',   dest='dataset hash keys', default='',
        help='Include additional variables in dataset hash calculation')
    parser.add_option_group(ogDiscover)

    ogDiscover2 = optparse.OptionGroup(parser, 'Discovery options II - only available when config file is used', '')
    ogDiscover2.add_option('-J', '--job-selector',    dest='selected',      default=None,
        help='Specify dataset(s) to process')
    parser.add_option_group(ogDiscover2)

    ogMode = optparse.OptionGroup(parser, 'Processing mode', '')
    ogMode.add_option('-b', '--batch',           dest='batch',         default=False, action='store_true',
        help='Enable non-interactive batch mode [Default: Interactive mode]')
    ogMode.add_option('-d', '--discovery',       dest='discovery',     default=False, action='store_true',
        help='Enable discovery mode - just collect file information and exit')
    ogMode.add_option('',   '--tempdir',         dest='tmpDir',        default='',
        help='Override temp directory')
    ogMode.add_option('-i', '--no-import',       dest='doImport',      default=True,  action='store_false',
        help='Disable import of new datasets into target DBS instance - only temporary xml files are created, ' +
            'which can be added later via datasetDBSTool.py [Default: Import datasets]')
    parser.add_option_group(ogMode)

    ogInc = optparse.OptionGroup(parser, 'Incremental adding of files to DBS', '')
    ogInc.add_option('-I', '--incremental',     dest='incremental',   default=False,  action='store_true',
        help='Skip import of existing files - Warning: this destroys coherent block structure!')
#	ogInc.add_option('-o', '--open-blocks',     dest='closeBlock',    default=True,   action='store_false',
#		help='Keep blocks open for addition of further files [Default: Close blocks]')
    parser.add_option_group(ogInc)

    ogInst = optparse.OptionGroup(parser, 'DBS instance handling', '')
    ogInst.add_option('-t', '--target-instance', dest='dbsTarget',
                      default='https://cmsweb.cern.ch/dbs/prod/phys03',
                      help='Specify target dbs instance url')
    ogInst.add_option('-s', '--source-instance', dest='dbsSource',
                      default='https://cmsweb.cern.ch/dbs/prod/global',
                      help='Specify source dbs instance url(s), where parent datasets are taken from')
    parser.add_option_group(ogInst)

    ogDbg = optparse.OptionGroup(parser, 'Display options', '')
    ogDbg.add_option('-D', '--display-dataset', dest='display_data',  default=None,
        help='Display information associated with dataset key(s) (accepts "all")')
    ogDbg.add_option('-C', '--display-config',  dest='display_cfg',   default=None,
        help='Display information associated with config hash(es) (accepts "all")')
    ogDbg.add_option('-v', '--verbose',         dest='verbosity',     default=0, action='count',
        help='Increase verbosity')
    parser.add_option_group(ogDbg)

    (opts, args) = parser.parse_args()
    utils.verbosity(opts.verbosity)
    setattr(opts, 'include parent infos', True)
    setattr(opts, 'importLumi', True)
    setattr(opts, 'dataset hash keys', getattr(opts, 'dataset hash keys').replace(',', ' '))
    if opts.useJobHash:
        setattr(opts, 'dataset hash keys', getattr(opts, 'dataset hash keys') + ' CMSSW_CONFIG_JOBHASH')

    # 0) Get work directory, create dbs dump directory
    if len(args) != 1:
        utils.exitWithUsage(usage, 'Neither work directory nor config file specified!')
    if os.path.isdir(args[0]):
        opts.workDir = os.path.abspath(os.path.normpath(args[0]))
    else:
        opts.workDir = getConfig(configFile=args[0]).getWorkPath()
    if not opts.tmpDir:
        opts.tmpDir = os.path.join(opts.workDir, 'dbs')
    if not os.path.exists(opts.tmpDir):
        os.mkdir(opts.tmpDir)
    # Lock file in case several instances of this program are running
    mutex = FileMutex(os.path.join(opts.tmpDir, 'datasetDBSAdd.lock'))

    # 1) Get dataset information
    if opts.inputFile:
        provider = DataProvider.getInstance('ListProvider', getConfig(), opts.inputFile, None)
    else:
        config = getConfig(configDict = {'dataset': dict(parser.values.__dict__)})
        if opts.discovery:
            config.set('dataset name pattern', '@DS_KEY@')
        provider = DataProvider.getInstance('DBSInfoProvider', config, args[0], None)

    provider.saveState(os.path.join(opts.tmpDir, 'dbs.dat'))
    if opts.discovery:
        sys.exit(os.EX_OK)
    blocks = provider.getBlocks()

    # 2) Filter datasets
    if opts.incremental:
        # Query target DBS for all found datasets and perform dataset resync with "supposed" state
        dNames = set(map(lambda b: b[DataProvider.Dataset], blocks))
        dNames = filter(lambda ds: hasDataset(opts.dbsTarget, ds), dNames)
        config = getConfig(configDict = {None: {'dbs instance': opts.dbsTarget}})
        oldBlocks = reduce(operator.add, map(lambda ds: DBSApiv2(config, None, ds, None).getBlocks(), dNames), [])
        (blocksAdded, blocksMissing, blocksChanged) = DataProvider.resyncSources(oldBlocks, blocks)
        if len(blocksMissing) or len(blocksChanged):
            if not utils.getUserBool(' * WARNING: Block structure has changed! Continue?', False):
                sys.exit(os.EX_OK)
        # Search for blocks which were partially added and generate "pseudo"-blocks with left over files
        setOldBlocks = set(map(lambda x: x[DataProvider.BlockName], oldBlocks))
        setAddedBlocks = set(map(lambda x: x[DataProvider.BlockName], blocksAdded))
        blockCollision = set.intersection(setOldBlocks, setAddedBlocks)
        if blockCollision and opts.closeBlock: # Block are closed and contents have changed
            for block in blocksAdded:
                if block[DataProvider.BlockName] in blockCollision:
                    block[DataProvider.BlockName] = utils.strGuid(md5(str(time.time())).hexdigest())
        blocks = blocksAdded

    # 3) Display dataset properties
    if opts.display_data or opts.display_cfg:
        raise APIError('Not yet reimplemented')

    #set-up logging
    logging.basicConfig(format='%(levelname)s: %(message)s')
    logger = logging.getLogger('dbs3-migration')
    logger.addHandler(NullHandler())
    logger.setLevel(logging.DEBUG)

    #set-up dbs clients
    dbs3_target_client = DBS3LiteClient(url=opts.dbsTarget)
    dbs3_source_client = DBS3LiteClient(url=opts.dbsSource)

    dbs3_migration_queue = DBS3MigrationQueue()

    for blockDump in generateDBS3BlockDumps(opts, blocks):
        if not opts.continue_migration:
            ###initiate the dbs3 to dbs3 migration of parent blocks
            logger.debug('Checking parentage for block: %s' % blockDump['block']['block_name'])
            unique_parent_lfns = set((parent[u'parent_logical_file_name'] for parent in blockDump[u'file_parent_list']))
            unique_blocks = set((block['block_name'] for parent_lfn in unique_parent_lfns
                                 for block in dbs3_source_client.listBlocks(logical_file_name=parent_lfn)))
            for block_to_migrate in unique_blocks:
                if dbs3_target_client.listBlocks(block_name=block_to_migrate):
                    #block already at destination
                    logger.debug('Block %s is already at destination' % block_to_migrate)
                    continue
                migration_task = MigrationTask(block_name=block_to_migrate,
                                               migration_url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader',
                                               dbs_client=dbs3_target_client)
                try:
                    dbs3_migration_queue.add_migration_task(migration_task)
                except AlreadyQueued as aq:
                    logger.debug(aq.message)

            dbs3_migration_queue.save_to_disk(os.path.join(opts.tmpDir, 'dbs3_migration.pkl'))
        else:
            try:
                dbs3_migration_queue = DBS3MigrationQueue.read_from_disk(os.path.join(opts.tmpDir,
                                                                                      'dbs3_migration.pkl'))
            except IOError as io_err:
                msg = "Probably, there is no DBS 3 migration for this dataset ongoing, Dude!"
                logger.exception('%s\n%s' % (io_err.message, msg))
                raise

        #wait for all parent blocks migrated to dbs3
        do_migration(dbs3_migration_queue)

        #insert block into dbs3
        dbs3_target_client.insertBulkBlock(blockDump)
Example #53
	def __init__(self, config, datasetExpr, datasetNick, providerList):
		DataProvider.__init__(self, config, datasetExpr, datasetNick)
		self._stats = DataProcessor.createInstance('SimpleStatsDataProcessor', config, None, self._log, 'Summary: Running over ')
		self._providerList = providerList
Example #54
	def __init__(self, config, datasetExpr, datasetNick, datasetID, providerList):
		DataProvider.__init__(self, config, datasetExpr, datasetNick, datasetID)
		self._providerList = providerList
		for provider in self._providerList:
			provider.setPassthrough()
Example #55
	def resyncMapping(self, newSplitPath, oldBlocks, newBlocks):
		log = utils.ActivityLog('Performing resynchronization of dataset')
		(blocksAdded, blocksMissing, blocksMatching) = DataProvider.resyncSources(oldBlocks, newBlocks)
		for rmBlock in blocksMissing: # Files in matching blocks are already sorted
			rmBlock[DataProvider.FileList].sort(lambda a, b: cmp(a[DataProvider.URL], b[DataProvider.URL]))
		del log

		# Get block information (oldBlock, newBlock, filesMissing, filesMatched) which splitInfo is based on
		def getMatchingBlock(splitInfo):
			# Comparison operator between dataset block and splitting
			def cmpSplitBlock(dsBlock, splitInfo):
				if dsBlock[DataProvider.Dataset] == splitInfo[DataSplitter.Dataset]:
					return cmp(dsBlock[DataProvider.BlockName], splitInfo[DataSplitter.BlockName])
				return cmp(dsBlock[DataProvider.Dataset], splitInfo[DataSplitter.Dataset])
			# Search for block in missing and matched blocks
			result = fast_search(blocksMissing, lambda x: cmpSplitBlock(x, splitInfo))
			if result:
				return (result, None, result[DataProvider.FileList], [])
			return fast_search(blocksMatching, lambda x: cmpSplitBlock(x[0], splitInfo)) # compare with old block

		#######################################
		# Process modifications of event sizes
		#######################################

		# Apply modification list to old splitting
		# Input: oldSplit, modList = [(rmfile, addfile), ...], doExpandOutside
		# With doExpandOutside, gc tries to handle expanding files via the splitting function
		def resyncSplitting(oldSplit, doExpandOutside, jobNum):
			if oldSplit.get(DataSplitter.Invalid, False):
				return (oldSplit, ResyncMode.ignore, [])

			(oldBlock, newBlock, filesMissing, filesMatched) = getMatchingBlock(oldSplit)

			modSI = copy.deepcopy(oldSplit)
			if newBlock:
				modSI[DataSplitter.Locations] = newBlock.get(DataProvider.Locations)
			# Determine size infos and get started
			search_url = lambda url: fast_search(oldBlock[DataProvider.FileList], lambda x: cmp(x[DataProvider.URL], url))
			sizeInfo = map(lambda url: search_url(url)[DataProvider.NEntries], modSI[DataSplitter.FileList])
			extended = []
			metaIdxLookup = []
			for meta in self.metaOpts:
				(oldIdx, newIdx) = (None, None)
				if oldBlock and (meta in oldBlock.get(DataProvider.Metadata, [])):
					oldIdx = oldBlock[DataProvider.Metadata].index(meta)
				if newBlock and (meta in newBlock.get(DataProvider.Metadata, [])):
					newIdx = newBlock[DataProvider.Metadata].index(meta)
				if (oldIdx != None) or (newIdx != None):
					metaIdxLookup.append((oldIdx, newIdx, self.metaOpts[meta]))

			# Select processing mode for job (disable > complete > changed > ignore) [ie. disable overrides all] using min
			# Result: one of [disable, complete, ignore] (changed -> complete or igore)
			procMode = ResyncMode.ignore

			# Remove files from splitting
			def removeFile(idx, rmFI):
				modSI[DataSplitter.Comment] += '[rm] ' + rmFI[DataProvider.URL]
				modSI[DataSplitter.Comment] += '-%d ' % rmFI[DataProvider.NEntries]

				if idx == len(modSI[DataSplitter.FileList]) - 1:
					# Removal of last file from current splitting
					modSI[DataSplitter.NEntries] = sum(sizeInfo) - modSI.get(DataSplitter.Skipped, 0)
					modSI[DataSplitter.Comment] += '[rm_last] '
				elif idx == 0:
					# Removal of first file from current splitting
					modSI[DataSplitter.NEntries] += max(0, sizeInfo[idx] - modSI.get(DataSplitter.Skipped, 0) - modSI[DataSplitter.NEntries])
					modSI[DataSplitter.NEntries] += modSI.get(DataSplitter.Skipped, 0)
					modSI[DataSplitter.Skipped] = 0
					modSI[DataSplitter.Comment] += '[rm_first] '
				else:
					# File in the middle is affected - solution very simple :)
					modSI[DataSplitter.Comment] += '[rm_middle] '

				modSI[DataSplitter.NEntries] -= rmFI[DataProvider.NEntries]
				modSI[DataSplitter.FileList].pop(idx)
				sizeInfo.pop(idx)


			# Process changed files in splitting - returns True if file index should be increased
			def changeFile(idx, oldFI, newFI):
				modSI[DataSplitter.Comment] += '[changed] ' + oldFI[DataProvider.URL]
				modSI[DataSplitter.Comment] += (' -%d ' % oldFI[DataProvider.NEntries])
				modSI[DataSplitter.Comment] += (' +%d ' % newFI[DataProvider.NEntries])

				def removeCompleteFile():
					modSI[DataSplitter.NEntries] -= oldFI[DataProvider.NEntries]
					modSI[DataSplitter.FileList].pop(idx)
					sizeInfo.pop(idx)

				def replaceCompleteFile():
					modSI[DataSplitter.NEntries] += newFI[DataProvider.NEntries]
					modSI[DataSplitter.NEntries] -= oldFI[DataProvider.NEntries]
					sizeInfo[idx] = newFI[DataProvider.NEntries]

				def expandOutside():
					fileList = newBlock.pop(DataProvider.FileList)
					newBlock[DataProvider.FileList] = [newFI]
					for extSplit in self.splitDatasetInternal([newBlock], oldFI[DataProvider.NEntries]):
						extSplit[DataSplitter.Comment] = oldSplit[DataSplitter.Comment] + '[ext_1] '
						extended.append(extSplit)
					newBlock[DataProvider.FileList] = fileList
					sizeInfo[idx] = newFI[DataProvider.NEntries]

				if idx == len(modSI[DataSplitter.FileList]) - 1:
					coverLast = modSI.get(DataSplitter.Skipped, 0) + modSI[DataSplitter.NEntries] - sum(sizeInfo[:-1])
					if coverLast == oldFI[DataProvider.NEntries]:
						# Change of last file, which ends in current splitting
						if doExpandOutside and (oldFI[DataProvider.NEntries] < newFI[DataProvider.NEntries]):
							expandOutside()
							modSI[DataSplitter.Comment] += '[last_add_1] '
						else:
							replaceCompleteFile()
							modSI[DataSplitter.Comment] += '[last_add_2] '
					elif coverLast > newFI[DataProvider.NEntries]:
						# Change of last file, which changes current coverage
						modSI[DataSplitter.NEntries] -= coverLast
						modSI[DataSplitter.NEntries] += oldFI[DataProvider.NEntries]
						replaceCompleteFile()
						modSI[DataSplitter.Comment] += '[last_add_3] '
					else:
						# Change of last file outside of current splitting
						sizeInfo[idx] = newFI[DataProvider.NEntries]
						modSI[DataSplitter.Comment] += '[last_add_4] '

				elif idx == 0:
					# First file is affected
					if (newFI[DataProvider.NEntries] > modSI.get(DataSplitter.Skipped, 0)):
						# First file changes and still lives in new splitting
						following = sizeInfo[0] - modSI.get(DataSplitter.Skipped, 0) - modSI[DataSplitter.NEntries]
						shrinkage = oldFI[DataProvider.NEntries] - newFI[DataProvider.NEntries]
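						# following: entries of the first file beyond this partition; shrinkage: decrease of the file's entry count (negative if the file grew)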
						if following > 0:
							# First file not completely covered by current splitting
							if following < shrinkage:
								# Covered area of first file shrinks
								modSI[DataSplitter.NEntries] += following
								replaceCompleteFile()
								modSI[DataSplitter.Comment] += '[first_add_1] '
							else:
								# First file changes outside of current splitting
								sizeInfo[idx] = newFI[DataProvider.NEntries]
								modSI[DataSplitter.Comment] += '[first_add_2] '
						else:
							# Change of first file ending in current splitting - One could try to
							# 'reverse fix' expanding files to allow expansion via adding only the expanding part
							replaceCompleteFile()
							modSI[DataSplitter.Comment] += '[first_add_3] '
					else:
						# Removal of first file from current splitting
						modSI[DataSplitter.NEntries] += max(0, sizeInfo[idx] - modSI.get(DataSplitter.Skipped, 0) - modSI[DataSplitter.NEntries])
						modSI[DataSplitter.NEntries] += modSI.get(DataSplitter.Skipped, 0)
						modSI[DataSplitter.Skipped] = 0
						removeCompleteFile()
						return False

				else:
					# A file in the middle of the partition changed - simply replace it
					# (expanding files could be swapped to the fully contained end
					# to allow expansion via adding only the expanding part)
					replaceCompleteFile()
					modSI[DataSplitter.Comment] += '[middle_add_1] '
				return True

			idx = 0
			newMetadata = []
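			# Walk over the partition's file list and apply the removals and changes reported by getMatchingBlock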
			while idx < len(modSI[DataSplitter.FileList]):
				url = modSI[DataSplitter.FileList][idx]

				rmFI = fast_search(filesMissing, lambda x: cmp(x[DataProvider.URL], url))
				if rmFI:
					removeFile(idx, rmFI)
					procMode = min(procMode, self.mode_removed)
					for meta in modSI.get(DataSplitter.MetadataHeader, []):
						procMode = min(procMode, self.metaOpts.get(meta, ResyncMode.ignore))
					continue  # don't increase the file list index!

				(oldFI, newFI) = fast_search(filesMatched, lambda x: cmp(x[0][DataProvider.URL], url))
				if DataProvider.Metadata in newFI:
					newMetadata.append(newFI[DataProvider.Metadata])
					for (oldMI, newMI, metaProc) in metaIdxLookup:
						if (oldMI is None) or (newMI is None):
							procMode = min(procMode, metaProc) # Metadata was removed
						elif (oldFI[DataProvider.Metadata][oldMI] != newFI[DataProvider.Metadata][newMI]):
							procMode = min(procMode, metaProc) # Metadata was changed
				if oldFI[DataProvider.NEntries] == newFI[DataProvider.NEntries]:
					idx += 1
					continue
				oldEvts = modSI[DataSplitter.NEntries]
				oldSkip = modSI[DataSplitter.Skipped]

				if changeFile(idx, oldFI, newFI):
					idx += 1

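				# Translate the size change into the configured resync mode for expanded / shrunken files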
				mode = utils.QM(oldFI[DataProvider.NEntries] < newFI[DataProvider.NEntries], self.mode_expanded, self.mode_shrunken)
				if mode == ResyncMode.changed:
					changed = (oldEvts != modSI[DataSplitter.NEntries]) or (oldSkip != modSI[DataSplitter.Skipped])
					mode = utils.QM(changed, ResyncMode.complete, ResyncMode.ignore)
				procMode = min(procMode, mode)

			# Disable invalid / invalidated splittings
			if (len(modSI[DataSplitter.FileList]) == 0) or (modSI[DataSplitter.NEntries] <= 0):
				procMode = ResyncMode.disable

			if procMode == ResyncMode.disable:
				modSI[DataSplitter.Invalid] = True
				return (modSI, ResyncMode.disable, []) # Discard extensions

			# Update metadata
			if DataSplitter.Metadata in modSI:
				modSI.pop(DataSplitter.MetadataHeader)
				modSI.pop(DataSplitter.Metadata)
			if newMetadata:
				modSI[DataSplitter.MetadataHeader] = newBlock.get(DataProvider.Metadata)
				modSI[DataSplitter.Metadata] = newMetadata

			return (modSI, procMode, extended)

		# Process splittings
		def resyncIterator_raw():
			extList = []
			# Perform resync of existing splittings
			for jobNum in range(self.getMaxJobs()):
				splitInfo = self.getSplitInfo(jobNum)
				if DataSplitter.Comment not in splitInfo:
					splitInfo[DataSplitter.Comment] = 'src: %d ' % jobNum
				(modSplitInfo, procMode, extended) = resyncSplitting(splitInfo, True, jobNum)
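				# In append mode, a redone partition is collected as a new partition and the original is disabled instead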
				if (self.resyncOrder == ResyncOrder.append) and (procMode == ResyncMode.complete):
					extList.append(modSplitInfo)
					modSplitInfo = copy.copy(splitInfo)
					modSplitInfo[DataSplitter.Invalid] = True
					procMode = ResyncMode.disable
				extList.extend(extended)
				yield (jobNum, modSplitInfo, procMode)
			# Yield collected extensions of existing splittings
			for extSplitInfo in extList:
				yield (None, extSplitInfo, ResyncMode.ignore)
			# Yield completely new splittings
			if self.mode_new == ResyncMode.complete:
				for newSplitInfo in self.splitDatasetInternal(blocksAdded):
					yield (None, newSplitInfo, ResyncMode.ignore)

		def getSplitContainer():
			(rawInfo, extInfo) = ([], [])
			for (jobNum, splitInfo, procMode) in resyncIterator_raw():
				if jobNum is not None: # Separate existing and new splittings
					rawInfo.append((jobNum, splitInfo, procMode))
				else:
					extInfo.append((None, splitInfo, None))
			return (rawInfo, extInfo)

		def getReorderIterator(mainIter, altIter): # alt source is used if main source contains invalid entries
			for (jobNum, splitInfo, procMode) in mainIter:
				if splitInfo.get(DataSplitter.Invalid, False) or (procMode == ResyncMode.disable):
					extInfo = next(altIter, None)
					while extInfo and extInfo[1].get(DataSplitter.Invalid, False):
						extInfo = next(altIter, None)
					if extInfo:
						yield (jobNum, extInfo[1], ResyncMode.complete) # Overwrite invalid splittings
						continue
				yield (jobNum, splitInfo, procMode)
			for extInfo in altIter:
				yield (None, extInfo[1], ResyncMode.ignore)

		# Use reordering if configured - log interventions (disable, redo) according to procMode
		resultRedo = []
		resultDisable = []
		def resyncIterator():
			if self.resyncOrder == ResyncOrder.fillgap:
				rawInfo, extInfo = getSplitContainer()
				resyncIter = getReorderIterator(rawInfo, iter(extInfo))
			elif self.resyncOrder == ResyncOrder.reorder:
				rawInfo, extInfo = getSplitContainer()
				tsi = utils.TwoSidedIterator(rawInfo + extInfo)
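				# reorder: iterate partitions from the front and fill disabled slots with partitions taken from the back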
				resyncIter = getReorderIterator(tsi.forward(), tsi.backward())
			else:
				resyncIter = resyncIterator_raw()

			for (jobNum, splitInfo, procMode) in resyncIter:
				if jobNum is not None: # job number 0 is a valid existing partition
					if procMode == ResyncMode.complete:
						resultRedo.append(jobNum)
					if procMode == ResyncMode.disable:
						resultDisable.append(jobNum)
				yield splitInfo

		# User overview and setup starts here
		newSplitPathTMP = newSplitPath + '.tmp'
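		# Write the resynced partition map to a temporary file; it is only renamed into place after the optional user confirmation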
		self.saveState(newSplitPathTMP, resyncIterator(), sourceLen = self.getMaxJobs(),
			message = 'Performing resynchronization of dataset map (progress is estimated)')

		if self.interactive:
			# TODO: print info and ask
			if not getUserBool('Do you want to use the new dataset splitting?', False):
				return None
		os.rename(newSplitPathTMP, newSplitPath)

		return (resultRedo, resultDisable)
Example #56
	def __init__(self, config, datasetExpr, datasetNick, sList):
		DataProvider.__init__(self, config, datasetExpr, datasetNick)
		(self._ds_select, self._ds_name, self._ds_keys_user, self._ds_keys_guard) = self._setup(config, 'dataset')
		(self._b_select, self._b_name, self._b_keys_user, self._b_keys_guard) = self._setup(config, 'block')
		scanList = config.getList('scanner', sList) + ['NullScanner']
		self._scanner = lmap(lambda cls: InfoScanner.createInstance(cls, config), scanList)
Example #57
		def _filter_block(block):
			if self._filter:
				return self._filter in '/%s#' % DataProvider.get_block_id(block)
			return True