Example #1
	def __init__(self, config, datasource_name, dataset_expr,
			dataset_nick, dataset_proc, scanner_list_default):
		DataProvider.__init__(self, config, datasource_name, dataset_expr, dataset_nick, dataset_proc)
		# Configure scanners
		scanner_config = config.change_view(default_on_change=TriggerResync(['datasets', 'parameters']))
		self._interactive_assignment = config.is_interactive('dataset name assignment', True)

		def _create_scanner(scanner_name):
			return InfoScanner.create_instance(scanner_name, scanner_config, datasource_name)
		scanner_list = scanner_config.get_list('scanner', scanner_list_default) + ['NullScanner']
		self._scanner_list = lmap(_create_scanner, scanner_list)

		# Configure dataset / block naming and selection
		def _setup(prefix):
			selected_hash_list = scanner_config.get_list(join_config_locations(prefix, 'key select'), [])
			name = scanner_config.get(join_config_locations(prefix, 'name pattern'), '')
			return (selected_hash_list, name)
		(self._selected_hash_list_dataset, self._dataset_pattern) = _setup('dataset')
		(self._selected_hash_list_block, self._block_pattern) = _setup('block')

		# Configure hash input for separation of files into datasets / blocks
		def _get_active_hash_input(prefix, guard_entry_idx):
			hash_input_list_user = scanner_config.get_list(join_config_locations(prefix, 'hash keys'), [])
			hash_input_list_guard = scanner_config.get_list(join_config_locations(prefix, 'guard override'),
				lchain(imap(lambda scanner: scanner.get_guard_keysets()[guard_entry_idx], self._scanner_list)))
			return hash_input_list_user + hash_input_list_guard
		self._hash_input_set_dataset = _get_active_hash_input('dataset', 0)
		self._hash_input_set_block = _get_active_hash_input('block', 1)
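The hash inputs collected here decide how scanned files are separated into datasets and blocks: files whose selected metadata keys agree end up in the same group. A minimal, self-contained sketch of that grouping idea (the key names and the group_files_by_keys helper are illustrative, not part of grid-control):

import hashlib
from collections import defaultdict

def group_files_by_keys(file_metadata, hash_keys):
    # Group file names by a hash over the values of the selected metadata keys
    groups = defaultdict(list)
    for file_name, metadata in sorted(file_metadata.items()):
        key_values = tuple(str(metadata.get(key)) for key in hash_keys)
        group_hash = hashlib.md5(repr(key_values).encode()).hexdigest()
        groups[group_hash].append(file_name)
    return dict(groups)

# Hypothetical per-file metadata as an InfoScanner might report it
file_metadata = {
    '/store/a.root': {'CMSSW_VERSION': 'CMSSW_10_6_4', 'CONFIG_HASH': 'abc'},
    '/store/b.root': {'CMSSW_VERSION': 'CMSSW_10_6_4', 'CONFIG_HASH': 'abc'},
    '/store/c.root': {'CMSSW_VERSION': 'CMSSW_11_0_0', 'CONFIG_HASH': 'def'},
}
print(group_files_by_keys(file_metadata, ['CMSSW_VERSION', 'CONFIG_HASH']))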
Example #2
    def __init__(self,
                 config,
                 datasource_name,
                 dataset_expr,
                 dataset_nick=None,
                 dataset_proc=None):
        dataset_config = config.change_view(
            default_on_change=TriggerResync(['datasets', 'parameters']))
        self._lumi_filter = dataset_config.get_lookup(
            ['lumi filter', '%s lumi filter' % datasource_name],
            default={},
            parser=parse_lumi_filter,
            strfun=str_lumi)
        if not self._lumi_filter.empty():
            config.set('%s processor' % datasource_name, 'LumiDataProcessor',
                       '+=')
        DataProvider.__init__(self, config, datasource_name, dataset_expr,
                              dataset_nick, dataset_proc)
        # LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
        self._lumi_query = dataset_config.get_bool(
            ['lumi metadata',
             '%s lumi metadata' % datasource_name],
            default=not self._lumi_filter.empty())
        config.set('phedex sites matcher mode', 'ShellStyleMatcher', '?=')
        # PhEDex blacklist: T1_*_Disk nodes allow user jobs - other T1 sites don't!
        self._phedex_filter = dataset_config.get_filter(
            'phedex sites',
            '-* T1_*_Disk T2_* T3_*',
            default_matcher='BlackWhiteMatcher',
            default_filter='StrictListFilter')
        self._only_complete = dataset_config.get_bool('only complete sites',
                                                      True)
        self._only_valid = dataset_config.get_bool('only valid', True)
        self._allow_phedex = dataset_config.get_bool('allow phedex', True)
        self._location_format = dataset_config.get_enum(
            'location format', CMSLocationFormat, CMSLocationFormat.hostname)
        self._pjrc = JSONRestClient(
            url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas'
        )
        self._sitedb = SiteDB()

        dataset_expr_parts = split_opt(dataset_expr, '@#')
        (self._dataset_path, self._dataset_instance,
         self._dataset_block_selector) = dataset_expr_parts
        instance_default = dataset_config.get('dbs instance', '')
        self._dataset_instance = self._dataset_instance or instance_default
        if not self._dataset_instance:
            self._dataset_instance = 'prod/global'
        elif '/' not in self._dataset_instance:
            self._dataset_instance = 'prod/%s' % self._dataset_instance
        self._dataset_block_selector = self._dataset_block_selector or 'all'
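The dataset expression parsed above follows the pattern <dataset path>@<dbs instance>#<block selector>, where the instance and selector parts are optional and fall back to 'prod/global' and 'all'. A rough stand-in for that split (this is not the actual split_opt implementation from grid-control):

def split_dataset_expr(expr, delimiters='@#'):
    # Split '<path>@<instance>#<selector>' into three parts; missing parts become ''
    parts = {'': expr}
    for delim in reversed(delimiters):
        head, _, tail = parts[''].partition(delim)
        parts[''] = head
        parts[delim] = tail
    return (parts[''], parts['@'], parts['#'])

print(split_dataset_expr('/SingleMuon/Run2016B-v1/MINIAOD@prod/global#all'))
# ('/SingleMuon/Run2016B-v1/MINIAOD', 'prod/global', 'all')
print(split_dataset_expr('/SingleMuon/Run2016B-v1/MINIAOD'))
# ('/SingleMuon/Run2016B-v1/MINIAOD', '', '')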
Example #3
    def __init__(self,
                 config,
                 datasource_name,
                 dataset_expr,
                 dataset_nick=None,
                 dataset_proc=None):
        CMSBaseProvider.__init__(self, config, datasource_name, dataset_expr,
                                 dataset_nick, dataset_proc)
        self._url = config.get('das instance',
                               'https://cmsweb.cern.ch/das/cache',
                               on_change=TriggerResync(
                                   ['datasets', 'parameters']))
        if self._dataset_instance.startswith('http'):
            self._url = self._dataset_instance
            self._dataset_instance = ''
        self._gjrc = DASRestClient(get_cms_cert(config), self._url,
                                   'VOMS proxy needed to query DAS!',
                                   UserError)
Example #4
    def __new__(cls, config, datasource_name, repository, keep_old=True):
        provider_name_default = config.get(
            ['default provider',
             '%s provider' % datasource_name], 'ListProvider')
        provider = config.get_composited_plugin(
            datasource_name,
            '',
            ':ThreadedMultiDatasetProvider:',
            cls=DataProvider,
            require_plugin=False,
            on_change=TriggerResync(['datasets', 'parameters']),
            bind_kwargs={
                'datasource_name': datasource_name,
                'provider_name_default': provider_name_default
            })
        if not provider:
            return NullParameterSource()
        instance = BaseDataParameterSource.__new__(cls)
        instance.provider = provider
        return instance
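The __new__ override above is a factory trick: when no provider is configured, the constructor hands back a completely different object (a NullParameterSource) instead of an instance of the class. A generic sketch of that pattern with purely illustrative class names:

class NullSource:
    # Do-nothing placeholder returned when nothing is configured
    pass

class DataSource:
    def __new__(cls, provider=None):
        if provider is None:
            # Returning a non-DataSource object also skips DataSource.__init__
            return NullSource()
        instance = super().__new__(cls)
        instance.provider = provider
        return instance

print(type(DataSource()).__name__)                # NullSource
print(type(DataSource(provider='db')).__name__)   # DataSource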
Example #5
    def __init__(self, config, datasource_name, repository, reader=None):
        LimitedResyncParameterSource.__init__(self)
        # needed for backwards compatible file names: datacache/datamap
        self._name = datasource_name.replace('dataset', 'data')
        (self._reader, self._len) = (None, None)
        self._set_reader(reader)
        self._part_proc = config.get_composited_plugin(
            [
                'partition processor',
                '%s partition processor' % datasource_name
            ],
            'TFCPartitionProcessor LocationPartitionProcessor ' +
            'MetaPartitionProcessor BasicPartitionProcessor',
            'MultiPartitionProcessor',
            cls=PartitionProcessor,
            on_change=TriggerResync(['parameters']),
            pargs=(datasource_name, ))
        self._log.debug('%s: Using partition processor %s', datasource_name,
                        repr(self._part_proc))
        repository['dataset:%s' % self._name] = self
Example #6
    def _setup_repository(self, config, psrc_repository):
        TaskModule._setup_repository(self, config, psrc_repository)

        psrc_list = []
        for datasource_name in config.get_list(
                'datasource names', ['dataset'],
                on_change=TriggerResync(['datasets', 'parameters'])):
            data_config = config.change_view(view_class='TaggedConfigView',
                                             add_sections=[datasource_name])
            self._create_datasource(data_config, datasource_name,
                                    psrc_repository, psrc_list)
        self._has_dataset = (psrc_list != [])

        # Register signal handler for manual dataset refresh
        def _external_refresh(sig, frame):
            for psrc in psrc_list:
                self._log.info(
                    'External signal triggered resync of datasource %r',
                    psrc.get_datasource_name())
                psrc.setup_resync(force=True)

        signal.signal(signal.SIGUSR2, _external_refresh)

        config.set_state(False, 'resync', detail='datasets')
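The SIGUSR2 handler registered above lets a running process be told externally (e.g. via `kill -USR2 <pid>`) to schedule a dataset resync. A stripped-down sketch of that mechanism with illustrative names; note that SIGUSR2 is only available on POSIX systems:

import os
import signal

class RefreshableSource:
    def __init__(self, name):
        self.name = name
        self.resync_requested = False

    def setup_resync(self, force=False):
        # Only mark the request here; the main loop performs the actual resync
        self.resync_requested = self.resync_requested or force

sources = [RefreshableSource('dataset')]

def _external_refresh(signum, frame):
    for src in sources:
        print('External signal triggered resync of datasource %r' % src.name)
        src.setup_resync(force=True)

signal.signal(signal.SIGUSR2, _external_refresh)
print('Send SIGUSR2 to pid %d to request a dataset refresh' % os.getpid())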
Example #7
    def __init__(self,
                 config,
                 datasource_name,
                 dataset_expr,
                 dataset_nick=None,
                 dataset_proc=None):
        ConfigurablePlugin.__init__(self, config)
        self._log = logging.getLogger('%s.provider' % datasource_name)
        (self._datasource_name, self._dataset_expr) = (datasource_name,
                                                       dataset_expr)
        self._dataset_nick_override = dataset_nick
        (self._cache_block, self._cache_dataset) = (None, None)
        self._dataset_query_interval = config.get_time(
            '%s default query interval' % datasource_name, 60, on_change=None)

        self._stats = dataset_proc or DataProcessor.create_instance(
            'SimpleStatsDataProcessor', config, datasource_name, self._log,
            ' * Dataset %s:\n\tcontains ' % repr(dataset_nick or dataset_expr))

        dataset_config = config.change_view(
            default_on_change=TriggerResync(['datasets', 'parameters']))
        self._nick_producer = dataset_config.get_plugin(
            ['nickname source',
             '%s nickname source' % datasource_name],
            'SimpleNickNameProducer',
            cls=DataProcessor,
            pargs=(datasource_name, ))
        self._dataset_processor = dataset_proc or dataset_config.get_composited_plugin(
            '%s processor' % datasource_name,
            'NickNameConsistencyProcessor EntriesConsistencyDataProcessor URLDataProcessor ' +
            'URLCountDataProcessor EntriesCountDataProcessor EmptyDataProcessor UniqueDataProcessor ' +
            'LocationDataProcessor',
            'MultiDataProcessor',
            cls=DataProcessor,
            pargs=(datasource_name, ))
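The composited plugin built here is effectively a chain of data processors, each taking the block list and returning a filtered or enriched version of it. A minimal stand-in for that chaining idea, using made-up processor classes rather than the real grid-control ones:

class EmptyBlockFilter:
    def process_blocks(self, blocks):
        return [block for block in blocks if block['entries'] > 0]

class LocationFilter:
    def __init__(self, allowed_sites):
        self.allowed_sites = set(allowed_sites)

    def process_blocks(self, blocks):
        return [block for block in blocks if self.allowed_sites & set(block['locations'])]

class MultiProcessor:
    # Applies the configured processors one after another - the composition used above
    def __init__(self, processors):
        self.processors = processors

    def process_blocks(self, blocks):
        for processor in self.processors:
            blocks = processor.process_blocks(blocks)
        return blocks

blocks = [
    {'entries': 1000, 'locations': ['T2_DE_DESY']},
    {'entries': 0, 'locations': ['T2_US_MIT']},
]
chain = MultiProcessor([EmptyBlockFilter(), LocationFilter(['T2_DE_DESY'])])
print(chain.process_blocks(blocks))  # only the first block survives both filters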
Example #8
    def _on_change(config, old_obj, cur_obj, cur_entry, obj2str):
        # Nested callback: 'self' and 'dataset_expr' are taken from the enclosing method's scope
        self._log.critical('Dataset %r changed', dataset_expr)
        return TriggerResync(['datasets',
                              'parameters'])(config, old_obj, cur_obj,
                                             cur_entry, obj2str)
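_on_change is handed to the config system as an on-change hook: when the stored dataset expression differs from the value of the previous run, it logs the change and then delegates to TriggerResync(...), which is itself a callable with the same signature. A self-contained sketch of such a hook mechanism (the tiny Config class below is illustrative, not the grid-control API):

class Config:
    # Tiny stand-in config that remembers old values and fires on_change hooks
    def __init__(self, stored=None):
        self._stored = dict(stored or {})

    def get(self, key, value, on_change=None):
        old_value = self._stored.get(key)
        if (old_value is not None) and (old_value != value) and on_change:
            value = on_change(self, old_value, value)
        self._stored[key] = value
        return value

def _on_change(config, old_value, new_value):
    print('Dataset %r changed to %r - scheduling resync' % (old_value, new_value))
    return new_value

config = Config(stored={'dataset': '/OldDataset/RunA/AOD'})
config.get('dataset', '/NewDataset/RunB/AOD', on_change=_on_change)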