Example #1
	def split_partitions(self, block_iter, entry_first=0):
		for block in block_iter:
			entries_per_job = self._entries_per_job.lookup(DataProvider.get_block_id(block))
			for proto_partition in self._partition_block(block[DataProvider.FileList],
					entries_per_job, entry_first):
				entry_first = 0
				yield self._finish_partition(block, proto_partition)
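The subtle detail here is that entry_first is zeroed after the first partition is produced: only the first block honors the initial entry offset, and every later block starts at entry 0. A minimal, self-contained sketch of that reset pattern, using a stand-in partition() helper instead of _partition_block (block sizes and offsets are hypothetical):

def partition(entries, per_job, entry_first=0):
	# Stand-in for _partition_block: yield (start, end) entry ranges
	for start in range(entry_first, entries, per_job):
		yield (start, min(start + per_job, entries))

def split_all(block_sizes, per_job, entry_first=0):
	for entries in block_sizes:
		for proto in partition(entries, per_job, entry_first):
			entry_first = 0  # all blocks after the first start at entry 0
			yield proto

print(list(split_all([10, 5], per_job=4, entry_first=2)))
# [(2, 6), (6, 10), (0, 4), (4, 5)]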
Example #2
	def process_block(self, block):
		# Check uniqueness of URLs
		url_hash_list = []
		if self._check_url != DatasetUniqueMode.ignore:
			block[DataProvider.FileList] = list(self._process_fi_list(url_hash_list,
				block[DataProvider.FileList]))
			url_hash_list.sort()

		# Check uniqueness of blocks
		if self._check_block != DatasetUniqueMode.ignore:
			block_hash = md5_hex(repr((block.get(DataProvider.Dataset), block[DataProvider.BlockName],
				url_hash_list, block[DataProvider.NEntries],
				block[DataProvider.Locations], block.get(DataProvider.Metadata))))
			if block_hash in self._recorded_block:
				msg = 'Multiple occurrences of block: "%s"!' % DataProvider.get_block_id(block)
				msg += ' (This check can be configured with %r)' % 'dataset check unique block'
				if self._check_block == DatasetUniqueMode.warn:
					self._log.warning(msg)
				elif self._check_block == DatasetUniqueMode.abort:
					raise DatasetError(msg)
				elif self._check_block == DatasetUniqueMode.skip:
					return None
			self._recorded_block.add(block_hash)
		return block
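A self-contained sketch of the duplicate-detection idea: hash a canonical repr() of the identifying fields and remember the hashes. The md5_hex stand-in and the two-field block tuples are assumptions, not the grid-control originals:

import hashlib

def md5_hex(text):
	# Assumed behavior of the md5_hex helper: hex digest of a string
	return hashlib.md5(text.encode()).hexdigest()

recorded_block = set()
for block in [('dataset_a', 'block1'), ('dataset_a', 'block1')]:
	block_hash = md5_hex(repr(block))
	if block_hash in recorded_block:
		print('Multiple occurrences of block: "%s#%s"!' % block)
	recorded_block.add(block_hash)
# prints: Multiple occurrences of block: "dataset_a#block1"!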
Example #3
	def process_block(self, block):
		# Check entry consistency
		events = sum(imap(itemgetter(DataProvider.NEntries), block[DataProvider.FileList]))
		if block.setdefault(DataProvider.NEntries, events) != events:
			error_msg = 'Inconsistency in block %s: Number of events doesn\'t match (b:%d != f:%d)'
			error_msg = error_msg % (DataProvider.get_block_id(block), block[DataProvider.NEntries], events)
			self._handle_error(error_msg, self._mode)
		return block
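The setdefault() call does double duty: it fills in a missing block-level entry count from the file sum and otherwise returns the stored count for comparison. A minimal sketch with plain dictionary keys (hypothetical data):

block = {'NEntries': 10, 'FileList': [{'NEntries': 4}, {'NEntries': 5}]}
events = sum(fi['NEntries'] for fi in block['FileList'])
if block.setdefault('NEntries', events) != events:
	print("Inconsistency: Number of events doesn't match (b:%d != f:%d)"
		% (block['NEntries'], events))
# prints: Inconsistency: Number of events doesn't match (b:10 != f:9)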
Example #4
	def _get_fi_class(self, fi, block):
		# Map user-selected metadata names to indices in the block metadata name list
		metadata_name_list = block.get(DataProvider.Metadata, [])
		metadata_name_list_selected = self._metadata_user_list.lookup(DataProvider.get_block_id(block))
		metadata_idx_list = lmap(lambda metadata_name: safe_index(metadata_name_list, metadata_name),
			metadata_name_list_selected)

		def _query_metadata(idx):
			if (idx is not None) and (idx < len(fi[DataProvider.Metadata])):
				return fi[DataProvider.Metadata][idx]
			return ''
		return tuple(imap(_query_metadata, metadata_idx_list))
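A sketch of the lookup performed above: resolve the selected metadata names to positions in the block's metadata name list once, then read those positions from a file's metadata values; names that are missing (or out of range for this file) yield ''. The data and the safe_index stand-in are assumptions:

def safe_index(seq, value):
	# Assumed behavior of safe_index: None instead of raising ValueError
	try:
		return seq.index(value)
	except ValueError:
		return None

metadata_name_list = ['CMSSW_VERSION', 'RUN', 'LUMI']
selected = ['RUN', 'MISSING']
idx_list = [safe_index(metadata_name_list, name) for name in selected]
fi_metadata = ['9_4_0', 273158, 42]
print(tuple(fi_metadata[idx] if (idx is not None) and (idx < len(fi_metadata)) else ''
	for idx in idx_list))
# (273158, '')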
Example #5
	def divide_blocks(self, block_iter):
		for block in block_iter:
			fi_idx_start = 0
			files_per_job = self._files_per_job.lookup(DataProvider.get_block_id(block))
			if files_per_job <= 0:
				raise PartitionError('Invalid number of files per job: %d' % files_per_job)
			while fi_idx_start < len(block[DataProvider.FileList]):
				fi_list = block[DataProvider.FileList][fi_idx_start:fi_idx_start + files_per_job]
				fi_idx_start += files_per_job
				if fi_list:
					yield self._create_sub_block(block, fi_list)
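The while loop above is plain fixed-size chunking of the file list; a self-contained sketch with hypothetical file names:

def chunk_files(file_list, files_per_job):
	if files_per_job <= 0:
		raise ValueError('Invalid number of files per job: %d' % files_per_job)
	for start in range(0, len(file_list), files_per_job):
		yield file_list[start:start + files_per_job]

print(list(chunk_files(['a.root', 'b.root', 'c.root', 'd.root', 'e.root'], 2)))
# [['a.root', 'b.root'], ['c.root', 'd.root'], ['e.root']]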
Example #6
	def divide_blocks(self, block_iter):
		for block in block_iter:
			(entries, fi_list) = (0, [])
			entries_per_job = self._entries_per_job.lookup(DataProvider.get_block_id(block))
			if entries_per_job <= 0:
				raise PartitionError('Invalid number of entries per job: %d' % entries_per_job)
			for fi in block[DataProvider.FileList]:
				if fi_list and (entries + fi[DataProvider.NEntries] > entries_per_job):
					yield self._create_sub_block(block, fi_list)
					(entries, fi_list) = (0, [])
				fi_list.append(fi)
				entries += fi[DataProvider.NEntries]
			if fi_list:
				yield self._create_sub_block(block, fi_list)
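The accumulation loop is a greedy bin-fill: each file joins the current group, and a new group is opened just before one would push the total past the limit, so a single file larger than entries_per_job still gets its own partition. A minimal sketch with (name, entries) tuples (hypothetical data):

def group_by_entries(fi_list, entries_per_job):
	if entries_per_job <= 0:
		raise ValueError('Invalid number of entries per job: %d' % entries_per_job)
	(entries, group) = (0, [])
	for (name, n_entries) in fi_list:
		if group and (entries + n_entries > entries_per_job):
			yield group
			(entries, group) = (0, [])
		group.append(name)
		entries += n_entries
	if group:
		yield group

print(list(group_by_entries([('a', 3), ('b', 2), ('c', 4), ('d', 7)], 5)))
# [['a', 'b'], ['c'], ['d']]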
Example #7
	def process_block(self, block):
		if block[DataProvider.Locations] is not None:
			sites = self._location_filter.filter_list(block[DataProvider.Locations])
			if (sites is not None) and (len(sites) == 0) and (len(block[DataProvider.FileList]) != 0):
				error_msg = 'Block %s is not available ' % DataProvider.get_block_id(block)
				if not len(block[DataProvider.Locations]):
					self._log.warning(error_msg + 'at any site!')
				elif not len(sites):
					self._log.warning(error_msg + 'at any selected site!')
			block[DataProvider.Locations] = sites
		return block
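The distinction between None and an empty list matters here: None means the block carries no location information at all, an empty original list means the block is not available at any site, and an empty filtered list means no selected site remains. A sketch of a filter with those semantics (a hypothetical stand-in for _location_filter.filter_list):

def filter_list(selected_sites, locations):
	if locations is None:
		return None  # no location information available
	return [site for site in locations if site in selected_sites]

print(filter_list(['T2_DE_DESY'], ['T1_US_FNAL', 'T2_DE_DESY']))  # ['T2_DE_DESY']
print(filter_list(['T2_DE_DESY'], []))               # [] - not available at any site
print(filter_list(['T2_DE_DESY'], ['T1_US_FNAL']))   # [] - at no selected site
print(filter_list(['T2_DE_DESY'], None))             # None - no location info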
Example #8
		def _filter_block(block):
			if self._filter:
				return self._filter in '/%s#' % DataProvider.get_block_id(block)
			return True
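Wrapping the block id as '/<dataset>#<block>#' lets the substring filter match dataset or block names anchored at both ends. A small sketch with hypothetical ids:

def make_block_filter(filter_str):
	def _filter_block(block_id):
		if filter_str:
			return filter_str in '/%s#' % block_id
		return True
	return _filter_block

accept = make_block_filter('#block1#')
print(accept('dataset_a#block1'))   # True - '#block1#' occurs in '/dataset_a#block1#'
print(accept('dataset_a#block12'))  # False - '#block1#' does not occur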