def roundrobin_datadirs(datadirs):
    """
    Generator that walks the given data dirs round-robin, so each device
    on the system is hit evenly, yielding any .db files found (in their
    proper places).  Within a single data dir the partitions are visited
    in random order.

    :param datadirs: a list of tuples of (path, context, partition_filter)
                     to walk.  The context may be any object; it is not
                     used by this function but is included with each
                     yielded tuple.
    :returns: A generator of (partition, path_to_db_file, context)
    """
    def walk_one(datadir, context, part_filter):
        # Only descend into names that look like partitions and pass the
        # caller-supplied filter; randomize the visit order.
        candidates = [entry for entry in os.listdir(datadir)
                      if looks_like_partition(entry) and part_filter(entry)]
        random.shuffle(candidates)
        for part in candidates:
            part_path = os.path.join(datadir, part)
            if not os.path.isdir(part_path):
                continue
            suffix_names = os.listdir(part_path)
            if not suffix_names:
                # empty partition dir: reclaim it
                os.rmdir(part_path)
                continue
            for suffix in suffix_names:
                suffix_path = os.path.join(part_path, suffix)
                if not os.path.isdir(suffix_path):
                    continue
                hash_names = os.listdir(suffix_path)
                if not hash_names:
                    # empty suffix dir: reclaim it
                    os.rmdir(suffix_path)
                    continue
                for hash_name in hash_names:
                    hash_path = os.path.join(suffix_path, hash_name)
                    if not os.path.isdir(hash_path):
                        continue
                    db_path = os.path.join(hash_path, hash_name + '.db')
                    if os.path.exists(db_path):
                        # common case
                        yield (part, db_path, context)
                        continue
                    # look for any alternate db filenames
                    alternates = get_db_files(db_path)
                    if alternates:
                        yield (part, alternates[-1], context)
                        continue
                    # nothing usable here; drop the dir, tolerating a
                    # racing writer having just re-populated it
                    try:
                        os.rmdir(hash_path)
                    except OSError as err:
                        if err.errno != errno.ENOTEMPTY:
                            raise

    walkers = [walk_one(path, context, part_filter)
               for path, context, part_filter in datadirs]
    for found in round_robin_iter(walkers):
        yield found
def roundrobin_datadirs(datadirs):
    """
    Generator to walk the data dirs in a round robin manner, evenly
    hitting each device on the system, and yielding any .db files found
    (in their proper places).  The partitions within each data dir are
    walked randomly, however.

    :param datadirs: a list of (path, node_id, partition_filter) to walk
    :returns: A generator of (partition, path_to_db_file, node_id)
    """
    def one_datadir(base, node_id, keep_partition):
        # Pick only the entries that look like partitions and that the
        # caller's filter accepts, then shuffle the visit order.
        selected = [name for name in os.listdir(base)
                    if looks_like_partition(name) and keep_partition(name)]
        random.shuffle(selected)
        for partition in selected:
            partition_dir = os.path.join(base, partition)
            if not os.path.isdir(partition_dir):
                continue
            suffix_list = os.listdir(partition_dir)
            if not suffix_list:
                os.rmdir(partition_dir)  # prune empty partition dirs
                continue
            for sfx in suffix_list:
                sfx_dir = os.path.join(partition_dir, sfx)
                if not os.path.isdir(sfx_dir):
                    continue
                hash_list = os.listdir(sfx_dir)
                if not hash_list:
                    os.rmdir(sfx_dir)  # prune empty suffix dirs
                    continue
                for hash_name in hash_list:
                    hdir = os.path.join(sfx_dir, hash_name)
                    if not os.path.isdir(hdir):
                        continue
                    db_file = os.path.join(hdir, hash_name + '.db')
                    if not os.path.exists(db_file):
                        # No db file: remove the hash dir, but tolerate
                        # a racing writer having just re-populated it.
                        try:
                            os.rmdir(hdir)
                        except OSError as err:
                            if err.errno != errno.ENOTEMPTY:
                                raise
                        continue
                    yield (partition, db_file, node_id)

    iters = [one_datadir(path, node_id, part_filter)
             for path, node_id, part_filter in datadirs]
    for result in round_robin_iter(iters):
        yield result
def roundrobin_datadirs(datadirs):
    """
    Generator to walk the data dirs in a round robin manner, evenly
    hitting each device on the system, and yielding any .db files found
    (in their proper places). The partitions within each data dir are
    walked randomly, however.

    :param datadirs: a list of (path, node_id, partition_filter) to walk
    :returns: A generator of (partition, path_to_db_file, node_id)
    """
    def walk_datadir(datadir, node_id, part_filter):
        # Candidate partitions: directory names that look like partitions
        # and pass the caller-supplied filter; visited in random order.
        partitions = [pd for pd in os.listdir(datadir)
                      if looks_like_partition(pd) and part_filter(pd)]
        random.shuffle(partitions)
        for partition in partitions:
            part_dir = os.path.join(datadir, partition)
            if not os.path.isdir(part_dir):
                continue
            suffixes = os.listdir(part_dir)
            if not suffixes:
                # Empty partition dir: reclaim it.
                os.rmdir(part_dir)
                continue
            for suffix in suffixes:
                suff_dir = os.path.join(part_dir, suffix)
                if not os.path.isdir(suff_dir):
                    continue
                hashes = os.listdir(suff_dir)
                if not hashes:
                    # Empty suffix dir: reclaim it.
                    os.rmdir(suff_dir)
                    continue
                for hsh in hashes:
                    hash_dir = os.path.join(suff_dir, hsh)
                    if not os.path.isdir(hash_dir):
                        continue
                    object_file = os.path.join(hash_dir, hsh + '.db')
                    if os.path.exists(object_file):
                        yield (partition, object_file, node_id)
                    else:
                        # No db file here: remove the hash dir, but
                        # tolerate a racing writer having just
                        # re-populated it (ENOTEMPTY is expected).
                        try:
                            os.rmdir(hash_dir)
                        except OSError as e:
                            if e.errno != errno.ENOTEMPTY:
                                raise

    # One lazy walker per data dir; interleave them so every device is
    # serviced evenly rather than draining one dir at a time.
    its = [walk_datadir(datadir, node_id, filt)
           for datadir, node_id, filt in datadirs]
    rr_its = round_robin_iter(its)
    for datadir in rr_its:
        yield datadir
def audit_all_objects(self, mode='once', device_dirs=None):
    """
    Walk every audit location across all storage policies and audit each
    object found, logging progress and publishing recon stats
    periodically, then log a final summary.

    :param mode: label included in the begin/completed log lines (only
                 used for logging here)
    :param device_dirs: optional list of device directory names to
                        restrict the audit to; falsy means all devices
    """
    description = ''
    if device_dirs:
        device_dir_str = ','.join(sorted(device_dirs))
        if self.auditor_type == 'ALL':
            description = _(' - parallel, %s') % device_dir_str
        else:
            description = _(' - %s') % device_dir_str
    self.logger.info(
        _('Begin object audit "%(mode)s" mode (%(audi_type)s'
          '%(description)s)') % {
            'mode': mode,
            'audi_type': self.auditor_type,
            'description': description
        })
    begin = reported = time.time()
    self.total_bytes_processed = 0
    self.total_files_processed = 0
    total_quarantines = 0
    total_errors = 0
    time_auditing = 0
    # get AuditLocations for each policy
    loc_generators = []
    for policy in POLICIES:
        loc_generators.append(
            self.diskfile_router[policy].object_audit_location_generator(
                policy, device_dirs=device_dirs,
                auditor_type=self.auditor_type))
    # Interleave the per-policy generators so no policy starves.
    all_locs = round_robin_iter(loc_generators)
    for location in all_locs:
        loop_time = time.time()
        self.failsafe_object_audit(location)
        self.logger.timing_since('timing', loop_time)
        # Throttle to the configured files/sec rate.
        self.files_running_time = ratelimit_sleep(
            self.files_running_time, self.max_files_per_second)
        self.total_files_processed += 1
        now = time.time()
        if now - self.last_logged >= self.log_time:
            # Periodic progress report: rates are computed over the
            # window since the last report ('reported').
            self.logger.info(
                _('Object audit (%(type)s). '
                  'Since %(start_time)s: Locally: %(passes)d passed, '
                  '%(quars)d quarantined, %(errors)d errors, '
                  'files/sec: %(frate).2f, bytes/sec: %(brate).2f, '
                  'Total time: %(total).2f, Auditing time: %(audit).2f, '
                  'Rate: %(audit_rate).2f') % {
                    'type': '%s%s' % (self.auditor_type, description),
                    'start_time': time.ctime(reported),
                    'passes': self.passes,
                    'quars': self.quarantines,
                    'errors': self.errors,
                    'frate': self.passes / (now - reported),
                    'brate': self.bytes_processed / (now - reported),
                    'total': (now - begin),
                    'audit': time_auditing,
                    'audit_rate': time_auditing / (now - begin)
                })
            # Publish the same window's stats to the recon cache.
            cache_entry = self.create_recon_nested_dict(
                'object_auditor_stats_%s' % (self.auditor_type),
                device_dirs, {
                    'errors': self.errors,
                    'passes': self.passes,
                    'quarantined': self.quarantines,
                    'bytes_processed': self.bytes_processed,
                    'start_time': reported,
                    'audit_time': time_auditing
                })
            dump_recon_cache(cache_entry, self.rcache, self.logger)
            # Roll the per-window counters into the run totals and
            # reset them for the next reporting window.
            reported = now
            total_quarantines += self.quarantines
            total_errors += self.errors
            self.passes = 0
            self.quarantines = 0
            self.errors = 0
            self.bytes_processed = 0
            self.last_logged = now
        time_auditing += (now - loop_time)
    # Avoid divide by zero during very short runs
    elapsed = (time.time() - begin) or 0.000001
    self.logger.info(
        _('Object audit (%(type)s) "%(mode)s" mode '
          'completed: %(elapsed).02fs. Total quarantined: %(quars)d, '
          'Total errors: %(errors)d, Total files/sec: %(frate).2f, '
          'Total bytes/sec: %(brate).2f, Auditing time: %(audit).2f, '
          'Rate: %(audit_rate).2f') % {
            'type': '%s%s' % (self.auditor_type, description),
            'mode': mode,
            'elapsed': elapsed,
            'quars': total_quarantines + self.quarantines,
            'errors': total_errors + self.errors,
            'frate': self.total_files_processed / elapsed,
            'brate': self.total_bytes_processed / elapsed,
            'audit': time_auditing,
            'audit_rate': time_auditing / elapsed
        })
    if self.stats_sizes:
        self.logger.info(
            _('Object audit stats: %s') % json.dumps(self.stats_buckets))
    for policy in POLICIES:
        # Unset remaining partitions to not skip them in the next run
        self.diskfile_router[policy].clear_auditor_status(
            policy, self.auditor_type)
def audit_all_objects(self, mode='once', device_dirs=None):
    """
    Audit every object location found across all storage policies,
    emitting periodic progress logs and recon stats, followed by a
    final summary log line.

    :param mode: label for the begin/completed log lines (logging only)
    :param device_dirs: optional list of device directory names to
                        limit the audit to; falsy audits all devices
    """
    description = ''
    if device_dirs:
        device_dir_str = ','.join(sorted(device_dirs))
        if self.auditor_type == 'ALL':
            description = _(' - parallel, %s') % device_dir_str
        else:
            description = _(' - %s') % device_dir_str
    self.logger.info(_('Begin object audit "%(mode)s" mode (%(audi_type)s'
                       '%(description)s)') %
                     {'mode': mode, 'audi_type': self.auditor_type,
                      'description': description})
    begin = reported = time.time()
    self.total_bytes_processed = 0
    self.total_files_processed = 0
    total_quarantines = 0
    total_errors = 0
    time_auditing = 0
    # get AuditLocations for each policy
    loc_generators = []
    for policy in POLICIES:
        loc_generators.append(
            self.diskfile_router[policy]
            .object_audit_location_generator(
                policy, device_dirs=device_dirs,
                auditor_type=self.auditor_type))
    # Round-robin across policies so each one progresses evenly.
    all_locs = round_robin_iter(loc_generators)
    for location in all_locs:
        loop_time = time.time()
        self.failsafe_object_audit(location)
        self.logger.timing_since('timing', loop_time)
        # Rate-limit to max_files_per_second.
        self.files_running_time = ratelimit_sleep(
            self.files_running_time, self.max_files_per_second)
        self.total_files_processed += 1
        now = time.time()
        if now - self.last_logged >= self.log_time:
            # Window report: rates are relative to the last report time.
            self.logger.info(_(
                'Object audit (%(type)s). '
                'Since %(start_time)s: Locally: %(passes)d passed, '
                '%(quars)d quarantined, %(errors)d errors, '
                'files/sec: %(frate).2f, bytes/sec: %(brate).2f, '
                'Total time: %(total).2f, Auditing time: %(audit).2f, '
                'Rate: %(audit_rate).2f') % {
                    'type': '%s%s' % (self.auditor_type, description),
                    'start_time': time.ctime(reported),
                    'passes': self.passes,
                    'quars': self.quarantines,
                    'errors': self.errors,
                    'frate': self.passes / (now - reported),
                    'brate': self.bytes_processed / (now - reported),
                    'total': (now - begin),
                    'audit': time_auditing,
                    'audit_rate': time_auditing / (now - begin)})
            # Mirror the window's counters into the recon cache.
            cache_entry = self.create_recon_nested_dict(
                'object_auditor_stats_%s' % (self.auditor_type),
                device_dirs,
                {'errors': self.errors, 'passes': self.passes,
                 'quarantined': self.quarantines,
                 'bytes_processed': self.bytes_processed,
                 'start_time': reported, 'audit_time': time_auditing})
            dump_recon_cache(cache_entry, self.rcache, self.logger)
            # Accumulate window counters into run totals, then reset
            # the window for the next reporting interval.
            reported = now
            total_quarantines += self.quarantines
            total_errors += self.errors
            self.passes = 0
            self.quarantines = 0
            self.errors = 0
            self.bytes_processed = 0
            self.last_logged = now
        time_auditing += (now - loop_time)
    # Avoid divide by zero during very short runs
    elapsed = (time.time() - begin) or 0.000001
    self.logger.info(_(
        'Object audit (%(type)s) "%(mode)s" mode '
        'completed: %(elapsed).02fs. Total quarantined: %(quars)d, '
        'Total errors: %(errors)d, Total files/sec: %(frate).2f, '
        'Total bytes/sec: %(brate).2f, Auditing time: %(audit).2f, '
        'Rate: %(audit_rate).2f') % {
            'type': '%s%s' % (self.auditor_type, description),
            'mode': mode,
            'elapsed': elapsed,
            'quars': total_quarantines + self.quarantines,
            'errors': total_errors + self.errors,
            'frate': self.total_files_processed / elapsed,
            'brate': self.total_bytes_processed / elapsed,
            'audit': time_auditing,
            'audit_rate': time_auditing / elapsed})
    if self.stats_sizes:
        self.logger.info(
            _('Object audit stats: %s') % json.dumps(self.stats_buckets))
    for policy in POLICIES:
        # Unset remaining partitions to not skip them in the next run
        self.diskfile_router[policy].clear_auditor_status(
            policy, self.auditor_type)