def dataset_show_diff(options):
	if len(options.args) != 2:
		options.parser.exit_with_usage(options.parser.usage('data'))
	provider_a = DataProvider.load_from_file(options.args[0])
	provider_b = DataProvider.load_from_file(options.args[1])
	# Three-way diff of the two cached block lists: (added, missing, matching)
	block_resync_tuple = DataProvider.resync_blocks(
		provider_a.get_block_list_cached(show_stats=False),
		provider_b.get_block_list_cached(show_stats=False))
	(block_list_added, block_list_missing, block_list_matching) = block_resync_tuple

	def _format_change(old, new):
		# render unchanged values as-is and changed values as 'old -> new'
		if old != new:
			return '%s -> %s' % (old, new)
		return old

	def _dataset_iter_matching_blocks():
		for (block_old, block_new, _, _) in block_list_matching:
			block_old[DataProvider.NFiles] = _format_change(
				len(block_old.get(DataProvider.FileList, [])),
				len(block_new.get(DataProvider.FileList, [])))
			block_old[DataProvider.NEntries] = _format_change(
				block_old[DataProvider.NEntries], block_new[DataProvider.NEntries])
			yield block_old

	header_list = [(DataProvider.Dataset, 'Dataset'), (DataProvider.BlockName, 'Block'),
		(DataProvider.NFiles, '#Files'), (DataProvider.NEntries, '#Entries')]
	if block_list_added:
		ConsoleTable.create(header_list, dataset_iter_blocks(block_list_added), title='Added blocks')
	if block_list_missing:
		ConsoleTable.create(header_list, dataset_iter_blocks(block_list_missing), title='Removed blocks')
	if block_list_matching:
		ConsoleTable.create(header_list, _dataset_iter_matching_blocks(), title='Matching blocks')
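# Illustrative sketch (not part of the original module): the code above assumes
# DataProvider.resync_blocks returns three lists - blocks only present in the new
# source, blocks only present in the old source, and matching blocks as tuples of
# (old_block, new_block, ...). A minimal stand-in with that shape, operating on
# plain dicts keyed by block name, could look like this:
def _sketch_resync_blocks(blocks_old, blocks_new, key='BlockName'):
	old_map = dict((block[key], block) for block in blocks_old)
	new_map = dict((block[key], block) for block in blocks_new)
	added = [block for name, block in new_map.items() if name not in old_map]
	missing = [block for name, block in old_map.items() if name not in new_map]
	matching = [(old_map[name], new_map[name], None, None)
		for name in old_map if name in new_map]
	return (added, missing, matching)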
def _display_setup(self, dataset_fn, head):
	if os.path.exists(dataset_fn):
		# collect all nicknames defined in the cached dataset file
		nick_name_set = set()
		for block in DataProvider.load_from_file(dataset_fn).get_block_list_cached(show_stats=False):
			nick_name_set.add(block[DataProvider.Nickname])
		self._log.info('Mapping between nickname and other settings:')
		report = []

		def _get_dataset_lookup_psrc(psrc):
			# select parameter sources that perform lookups depending on DATASETNICK
			is_lookup_cls = isinstance(psrc, ParameterSource.get_class('LookupBaseParameterSource'))
			return is_lookup_cls and ('DATASETNICK' in psrc.get_parameter_deps())
		ps_lookup = lfilter(_get_dataset_lookup_psrc, self._source.get_used_psrc_list())
		for nick in sorted(nick_name_set):
			tmp = {'DATASETNICK': nick}
			for src in ps_lookup:
				src.fill_parameter_content(None, tmp)
			# keys 1 and 2 match the second and third column definitions in 'head'
			tmp[1] = str.join(', ', imap(os.path.basename,
				self._nm_cfg.lookup(nick, '', is_selector=False)))
			tmp[2] = str_lumi_nice(self._nm_lumi.lookup(nick, '', is_selector=False))
			report.append(tmp)
		ConsoleTable.create(head, report, 'cl')
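# Illustrative note (assumption, not confirmed by the original source): ConsoleTable.create
# is used above with a list of (key, title) column definitions and row dicts keyed by the
# same keys, so the mixed keys 'DATASETNICK', 1 and 2 in each row would line up with a
# head such as:
#   [('DATASETNICK', 'Nickname'), (1, 'Config files'), (2, 'Lumi filter')]
# yielding rows like {'DATASETNICK': 'nick1', 1: 'a.conf, b.conf', 2: '1-10'}.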
def _resync_psrc(self):
	activity = Activity('Performing resync of datasource %r' % self.get_datasource_name())
	# Get old and new dataset information
	provider_old = DataProvider.load_from_file(self._get_data_path('cache.dat'))
	block_list_old = provider_old.get_block_list_cached(show_stats=False)
	self._provider.clear_cache()
	block_list_new = self._provider.get_block_list_cached(show_stats=False)
	self._provider.save_to_file(self._get_data_path('cache-new.dat'), block_list_new)

	# Use old splitting information to synchronize with new dataset infos
	partition_len_old = self.get_parameter_len()
	partition_changes = self._resync_partitions(
		self._get_data_path('map-new.tar'), block_list_old, block_list_new)
	activity.finish()
	if partition_changes is not None:
		# Move current splitting to backup and use the new splitting from now on
		def _rename_with_backup(new, cur, old):
			if self._keep_old:
				os.rename(self._get_data_path(cur), self._get_data_path(old))
			os.rename(self._get_data_path(new), self._get_data_path(cur))
		_rename_with_backup('map-new.tar', 'map.tar', 'map-old-%d.tar' % time.time())
		_rename_with_backup('cache-new.dat', 'cache.dat', 'cache-old-%d.dat' % time.time())
		self._set_reader(DataSplitter.load_partitions(self._get_data_path('map.tar')))
		self._log.debug('Dataset resync finished: %d -> %d partitions',
			partition_len_old, self._len)
		(pnum_list_redo, pnum_list_disable) = partition_changes
		return (set(pnum_list_redo), set(pnum_list_disable), partition_len_old != self._len)
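# Illustrative sketch of the swap-with-backup pattern used in _resync_psrc, shown in
# isolation (function name and keep_old flag are hypothetical, not the original API):
# the freshly written file replaces the current one via os.rename, optionally keeping
# the previous version under a timestamped name.
import os
import time

def _sketch_swap_with_backup(new_fn, cur_fn, keep_old=True):
	if keep_old and os.path.exists(cur_fn):
		os.rename(cur_fn, '%s.old-%d' % (cur_fn, time.time()))
	os.rename(new_fn, cur_fn)  # atomic on POSIX if both paths share one filesystem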
def dataset_show_removed(options):
	if len(options.args) < 2:
		options.parser.exit_with_usage(options.parser.usage('data'))
	block_list_missing = []
	provider_old = DataProvider.load_from_file(options.args[0])
	# walk the dataset files pairwise and collect blocks that vanish between steps
	for dataset_fn in options.args[1:]:
		provider_new = DataProvider.load_from_file(dataset_fn)
		block_resync_tuple = DataProvider.resync_blocks(
			provider_old.get_block_list_cached(show_stats=False),
			provider_new.get_block_list_cached(show_stats=False))
		for block in block_resync_tuple[1]:  # iterate missing block list
			tmp = dict(block)
			tmp[DataProvider.RemovedIn] = dataset_fn
			block_list_missing.append(tmp)
		provider_old = provider_new
	if block_list_missing:
		ConsoleTable.create([(DataProvider.Dataset, 'Dataset'), (DataProvider.BlockName, 'Block'),
			(DataProvider.NFiles, '#Files'), (DataProvider.NEntries, '#Entries'),
			(DataProvider.RemovedIn, 'Removed in file')],
			dataset_iter_blocks(block_list_missing), title='Removed blocks')
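# Illustrative sketch (assumption) of the chained comparison in dataset_show_removed:
# each snapshot is diffed against its predecessor, so a removed block is reported
# exactly once, tagged with the first snapshot file it is missing from. Shown here
# with plain sets of block names instead of DataProvider blocks:
def _sketch_show_removed(snapshots):  # snapshots: list of (filename, block_name_set)
	removed = []
	prev_blocks = snapshots[0][1]
	for (dataset_fn, blocks) in snapshots[1:]:
		for name in sorted(prev_blocks - blocks):
			removed.append((name, dataset_fn))  # block name, first file it is missing from
		prev_blocks = blocks
	return removed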
def _init_reader(self):
	# look for aborted inits / resyncs - and try to restore old state if possible
	if self._exists_data_path('map.tar.resync') and self._exists_data_path('cache.dat.resync'):
		rename_file(self._get_data_path('cache.dat.resync'), self._get_data_path('cache.dat'))
		rename_file(self._get_data_path('map.tar.resync'), self._get_data_path('map.tar'))
	elif self._exists_data_path('map.tar.resync') or self._exists_data_path('cache.dat.resync'):
		raise DatasetError('Found broken dataset partition resync state in work directory')

	if self._exists_data_path('map.tar') and not self._exists_data_path('cache.dat'):
		raise DatasetError('Found broken dataset partition in work directory')
	elif not self._exists_data_path('map.tar'):
		# create initial partition map file
		if not self._exists_data_path('cache.dat'):
			provider = self._provider
		else:
			provider = DataProvider.load_from_file(self._get_data_path('cache.dat'))
		block_iter = DataProvider.save_to_file_iter(self._get_data_path('cache.dat.init'),
			provider.get_block_list_cached(show_stats=True))
		partition_iter = self._splitter.split_partitions(block_iter)
		DataSplitter.save_partitions(self._get_data_path('map.tar.init'), partition_iter)
		rename_file(self._get_data_path('cache.dat.init'), self._get_data_path('cache.dat'))
		rename_file(self._get_data_path('map.tar.init'), self._get_data_path('map.tar'))
	return DataSplitter.load_partitions(self._get_data_path('map.tar'))
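# Illustrative sketch of the crash-safe update pattern in _init_reader (helper name is
# hypothetical): new state is first written under a temporary '.init' suffix and only
# renamed into place once complete, so an interrupted run leaves either the previous
# state or detectable leftovers - never a half-written 'cache.dat' or 'map.tar'.
import os

def _sketch_write_then_rename(target_fn, write_fn):
	tmp_fn = target_fn + '.init'
	write_fn(tmp_fn)  # may fail halfway - target_fn stays untouched in that case
	os.rename(tmp_fn, target_fn)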