def split_partitions(self, block_iter, entry_first=0):
    for block in block_iter:
        entries_per_job = self._entries_per_job.lookup(DataProvider.get_block_id(block))
        for proto_partition in self._partition_block(block[DataProvider.FileList],
                entries_per_job, entry_first):
            entry_first = 0
            yield self._finish_partition(block, proto_partition)
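# Illustrative sketch (standalone, not the grid-control API): the point of the loop
# above is that the 'entry_first' offset only shifts the start of the very first
# partition; it is reset to 0 once a proto-partition has been produced. Hypothetical
# analogue where each "block" is just its total entry count:
def _sketch_split(block_entry_counts, entries_per_job, entry_first=0):
    for block_entries in block_entry_counts:
        start = entry_first
        while start < block_entries:
            entry_first = 0  # later partitions and blocks start without an offset
            yield (start, min(start + entries_per_job, block_entries))
            start += entries_per_job

# list(_sketch_split([10, 5], entries_per_job=4, entry_first=3))
# -> [(3, 7), (7, 10), (0, 4), (4, 5)]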
def process_block(self, block):
    # Check uniqueness of URLs
    url_hash_list = []
    if self._check_url != DatasetUniqueMode.ignore:
        block[DataProvider.FileList] = list(
            self._process_fi_list(url_hash_list, block[DataProvider.FileList]))
        url_hash_list.sort()
    # Check uniqueness of blocks
    if self._check_block != DatasetUniqueMode.ignore:
        block_hash = md5_hex(repr((block.get(DataProvider.Dataset), block[DataProvider.BlockName],
            url_hash_list, block[DataProvider.NEntries],
            block[DataProvider.Locations], block.get(DataProvider.Metadata))))
        if block_hash in self._recorded_block:
            msg = 'Multiple occurrences of block: "%s"!' % DataProvider.get_block_id(block)
            msg += ' (This check can be configured with %r)' % 'dataset check unique block'
            if self._check_block == DatasetUniqueMode.warn:
                self._log.warning(msg)
            elif self._check_block == DatasetUniqueMode.abort:
                raise DatasetError(msg)
            elif self._check_block == DatasetUniqueMode.skip:
                return None
        self._recorded_block.add(block_hash)
    return block
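# Illustrative sketch (standalone, hypothetical block dicts - not the md5_hex helper or
# the DataProvider block layout): duplicate blocks are detected by hashing a canonical
# representation of the block and remembering all hashes seen so far; this analogue
# simply drops duplicates, which corresponds to DatasetUniqueMode.skip.
import hashlib

def _sketch_drop_duplicate_blocks(blocks):
    seen = set()
    for block in blocks:  # block: {'dataset': ..., 'name': ..., 'urls': [...]}
        block_hash = hashlib.md5(repr((block['dataset'], block['name'],
            sorted(block['urls']))).encode('utf-8')).hexdigest()
        if block_hash in seen:
            continue
        seen.add(block_hash)
        yield block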
def process_block(self, block):
    # Check entry consistency
    events = sum(imap(itemgetter(DataProvider.NEntries), block[DataProvider.FileList]))
    if block.setdefault(DataProvider.NEntries, events) != events:
        error_msg = 'Inconsistency in block %s: Number of events doesn\'t match (b:%d != f:%d)'
        error_msg = error_msg % (DataProvider.get_block_id(block),
            block[DataProvider.NEntries], events)
        self._handle_error(error_msg, self._mode)
    return block
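# Illustrative sketch (hypothetical dict layout instead of the DataProvider constants):
# the consistency check compares the block-level entry count with the sum over the
# files, using setdefault so a missing block-level count is filled in from the files.
def _sketch_check_entries(block):
    events = sum(fi['entries'] for fi in block['files'])
    if block.setdefault('entries', events) != events:
        raise ValueError('Inconsistency in block %s: b:%d != f:%d' % (
            block['name'], block['entries'], events))
    return block

# _sketch_check_entries({'name': 'A', 'files': [{'entries': 3}, {'entries': 4}]})
# fills in 'entries': 7; a block declaring 'entries': 10 would raise ValueError instead.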
def _get_fi_class(self, fi, block):
    metadata_name_list = block.get(DataProvider.Metadata, [])
    metadata_name_list_selected = self._metadata_user_list.lookup(DataProvider.get_block_id(block))
    metadata_idx_list = lmap(lambda metadata_name: safe_index(metadata_name_list, metadata_name),
        metadata_name_list_selected)

    def _query_metadata(idx):
        if (idx is not None) and (idx < len(fi[DataProvider.Metadata])):
            return fi[DataProvider.Metadata][idx]
        return ''
    return tuple(imap(_query_metadata, metadata_idx_list))
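# Illustrative sketch (plain lists and hypothetical names - safe_index/lmap/imap are not
# used here): the "file class" is the tuple of metadata values the user selected,
# obtained by translating metadata names into column indices of the block's metadata
# list and falling back to '' for unknown names or short per-file metadata.
def _sketch_fi_class(file_metadata, block_metadata_names, selected_names):
    def _index(name):
        return block_metadata_names.index(name) if name in block_metadata_names else None
    def _value(idx):
        if (idx is not None) and (idx < len(file_metadata)):
            return file_metadata[idx]
        return ''
    return tuple(_value(_index(name)) for name in selected_names)

# _sketch_fi_class(['2017', 'RAW'], ['ERA', 'TIER'], ['TIER', 'RUN']) -> ('RAW', '')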
def divide_blocks(self, block_iter):
    for block in block_iter:
        fi_idx_start = 0
        files_per_job = self._files_per_job.lookup(DataProvider.get_block_id(block))
        if files_per_job <= 0:
            raise PartitionError('Invalid number of files per job: %d' % files_per_job)
        while fi_idx_start < len(block[DataProvider.FileList]):
            fi_list = block[DataProvider.FileList][fi_idx_start:fi_idx_start + files_per_job]
            fi_idx_start += files_per_job
            if fi_list:
                yield self._create_sub_block(block, fi_list)
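# Illustrative sketch (plain file list instead of a DataProvider block): splitting by a
# fixed number of files per job is list slicing in fixed-size steps; the last sub-block
# may contain fewer files.
def _sketch_divide_by_files(file_list, files_per_job):
    if files_per_job <= 0:
        raise ValueError('Invalid number of files per job: %d' % files_per_job)
    for idx in range(0, len(file_list), files_per_job):
        yield file_list[idx:idx + files_per_job]

# list(_sketch_divide_by_files(['a', 'b', 'c', 'd', 'e'], 2))
# -> [['a', 'b'], ['c', 'd'], ['e']]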
def divide_blocks(self, block_iter):
    for block in block_iter:
        (entries, fi_list) = (0, [])
        entries_per_job = self._entries_per_job.lookup(DataProvider.get_block_id(block))
        if entries_per_job <= 0:
            raise PartitionError('Invalid number of entries per job: %d' % entries_per_job)
        for fi in block[DataProvider.FileList]:
            if fi_list and (entries + fi[DataProvider.NEntries] > entries_per_job):
                yield self._create_sub_block(block, fi_list)
                (entries, fi_list) = (0, [])
            fi_list.append(fi)
            entries += fi[DataProvider.NEntries]
        if fi_list:
            yield self._create_sub_block(block, fi_list)
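# Illustrative sketch (hypothetical (name, entries) tuples instead of file dictionaries):
# splitting by entries per job is a greedy packing - files are appended until adding the
# next one would exceed the limit, so a single file larger than the limit still ends up
# in a sub-block of its own rather than being dropped.
def _sketch_divide_by_entries(file_list, entries_per_job):
    if entries_per_job <= 0:
        raise ValueError('Invalid number of entries per job: %d' % entries_per_job)
    (entries, current) = (0, [])
    for (name, n_entries) in file_list:
        if current and (entries + n_entries > entries_per_job):
            yield current
            (entries, current) = (0, [])
        current.append(name)
        entries += n_entries
    if current:
        yield current

# list(_sketch_divide_by_entries([('a', 2), ('b', 2), ('c', 10)], 5))
# -> [['a', 'b'], ['c']]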
def process_block(self, block):
    if block[DataProvider.Locations] is not None:
        sites = self._location_filter.filter_list(block[DataProvider.Locations])
        if (sites is not None) and (len(sites) == 0) and (len(block[DataProvider.FileList]) != 0):
            error_msg = 'Block %s is not available ' % DataProvider.get_block_id(block)
            if not len(block[DataProvider.Locations]):
                self._log.warning(error_msg + 'at any site!')
            elif not len(sites):
                self._log.warning(error_msg + 'at any selected site!')
        block[DataProvider.Locations] = sites
    return block
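# Illustrative sketch (hypothetical block dict and a plain site whitelist instead of the
# configured location filter): the block's location list is replaced by the filtered
# list, and an empty result only triggers a warning - the block itself is kept.
import logging

def _sketch_filter_locations(block, allowed_sites):
    if block['locations'] is not None:
        sites = [site for site in block['locations'] if site in allowed_sites]
        if (not sites) and block['files']:
            if not block['locations']:
                logging.warning('Block %s is not available at any site!', block['name'])
            else:
                logging.warning('Block %s is not available at any selected site!', block['name'])
        block['locations'] = sites
    return block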
def _filter_block(block):
    if self._filter:
        return self._filter in '/%s#' % DataProvider.get_block_id(block)
    return True
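# Illustrative sketch (assuming get_block_id yields a 'dataset#block' style identifier):
# the filter is a plain substring match against the padded '/<dataset>#<block>#' string,
# so it can match the dataset name, the block name, or any fragment of the combined id.
def _sketch_filter_block(block_id, filter_expr):
    if filter_expr:
        return filter_expr in '/%s#' % block_id
    return True

# _sketch_filter_block('ds1#block_A', '#block_A') -> True
# _sketch_filter_block('ds1#block_A', 'ds2')      -> False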