def recover(ctx):
    """Recover all keys listed in the merged iterator results file.

    Iterates keys from ctx.merged_filename, starts a KeyRecover per key in
    batches of ctx.batch_size, waits for each batch to finish and folds the
    per-key stats into the 'recover' stats section.

    :param ctx: recovery context (addresses, timeouts, batch size, stats).
    :returns: True only if every key was recovered successfully.
    """
    ret = True
    stats = ctx.monitor.stats['recover']
    stats.timer('recover', 'started')
    it = iterate_key(ctx.merged_filename, ctx.groups)
    node = elliptics_create_node(address=ctx.address,
                                 elog=ctx.elog,
                                 wait_timeout=ctx.wait_timeout,
                                 net_thread_num=4,
                                 io_thread_num=1,
                                 remotes=ctx.remotes)
    # Use floor division for the batch key: true division (`/` on Python 3)
    # yields a distinct float per index, which would put every key into its
    # own groupby bucket and silently reduce batches to a single key.
    for batch_id, batch in groupby(enumerate(it), key=lambda x: x[0] // ctx.batch_size):
        recovers = []
        rs = RecoverStat()
        # Kick off the whole batch first so the keys recover concurrently...
        for _, val in batch:
            rec = KeyRecover(ctx, *val, node=node)
            recovers.append(rec)
        # ...then wait for each one and accumulate its outcome and stats.
        for r in recovers:
            r.wait()
            ret &= r.succeeded()
            rs += r.stats
        rs.apply(stats)
    stats.timer('recover', 'finished')
    return ret
def __init__(self, ctx, key, key_infos, missed_groups, node, callback):
    """Set up per-key recovery state and start the recovery immediately.

    ctx           -- recovery context (settings, stats, trace id)
    key           -- elliptics key being recovered
    key_infos     -- replica metadata for groups that hold the key
    missed_groups -- groups that have no replica of the key
    node          -- elliptics node used to build the i/o sessions
    callback      -- invoked as callback(result, stats) when recovery ends
    """
    self.ctx = ctx
    self.complete = threading.Event()
    self.callback = callback
    self.stats = RecoverStat()
    self.stats_cmd = ctx.stats['commands']
    self.key = key
    self.key_flags = 0
    self.key_infos = key_infos
    self.diff_groups = []
    self.missed_groups = list(missed_groups)

    def make_session():
        # Every session shares the node and the context's trace id.
        session = elliptics.newapi.Session(node)
        session.trace_id = ctx.trace_id
        return session

    self.read_session = make_session()
    self.read_session.set_filter(elliptics.filters.all)

    self.write_session = make_session()
    self.write_session.set_checker(elliptics.checkers.all)
    self.write_session.ioflags |= elliptics.io_flags.cas_timestamp

    self.remove_session = make_session()
    self.remove_session.set_filter(elliptics.filters.all_final)
    self.remove_session.ioflags |= elliptics.io_flags.cas_timestamp
    self.remove_session.timestamp = ctx.prepare_timeout

    self.result = False
    self.attempt = 0
    # One pending operation for the main read/write chain; corrupted-replica
    # removal adds more under the lock.
    self.pending_operations = 1
    self.pending_operations_lock = threading.Lock()
    log.debug("Recovering key: {0} from nonempty groups: {1} and missed groups: {2}"
              .format(repr(self.key), [k.group_id for k in self.key_infos], self.missed_groups))
    self.run()
def __init__(self, ctx, key, key_infos, missed_groups, node, callback):
    """Set up per-key recovery state and start the recovery immediately.

    ctx           -- recovery context (settings, stats, trace id)
    key           -- elliptics key being recovered
    key_infos     -- replica metadata for groups that hold the key
    missed_groups -- groups that have no replica of the key
    node          -- elliptics node used to build the i/o sessions
    callback      -- invoked as callback(result, stats) when recovery ends
    """
    self.ctx = ctx
    self.complete = threading.Event()
    self.callback = callback
    self.stats = RecoverStat()
    self.key = key
    self.key_flags = 0
    self.key_infos = key_infos
    self.diff_groups = []
    self.missed_groups = list(missed_groups)

    def make_session():
        # Every session shares the node and the context's trace id.
        session = elliptics.Session(node)
        session.trace_id = ctx.trace_id
        return session

    self.read_session = make_session()
    self.read_session.set_filter(elliptics.filters.all)

    self.write_session = make_session()
    self.write_session.set_checker(elliptics.checkers.all)

    self.remove_session = make_session()
    self.remove_session.set_checker(elliptics.checkers.all)

    self.result = False
    self.attempt = 0
    log.debug(
        "Recovering key: {0} from nonempty groups: {1} and missed groups: {2}"
        .format(repr(self.key), [k.group_id for k in self.key_infos], self.missed_groups))
    self.run()
def recover(ctx):
    """Recover all keys listed in the merged iterator results file.

    Pulls keys in batches of ctx.batch_size, runs a KeyRecover per key,
    accumulates per-batch stats and maintains running 'recovered_keys'
    counters (positive for successes, negative for failures) plus a
    'recovery_speed' gauge in keys per second.

    :param ctx: recovery context (addresses, timeouts, batch size, stats).
    :returns: True only if every key was recovered successfully.
    """
    from itertools import islice
    import time
    all_succeeded = True
    stats = ctx.stats['recover']
    stats.timer('recover', 'started')
    key_iter = iterate_key(ctx.merged_filename, ctx.groups)
    elog = elliptics.Logger(ctx.log_file, int(ctx.log_level))
    node = elliptics_create_node(address=ctx.address,
                                 elog=elog,
                                 wait_timeout=ctx.wait_timeout,
                                 net_thread_num=4,
                                 io_thread_num=1,
                                 remotes=ctx.remotes)
    processed_keys = 0
    start = time.time()
    # iter(callable, sentinel): keep slicing batches until islice returns
    # the empty tuple, i.e. the key iterator is exhausted.
    next_batch = lambda: tuple(islice(key_iter, ctx.batch_size))
    for batch in iter(next_batch, ()):
        batch_stats = RecoverStat()
        # Start the whole batch up front so the keys recover concurrently.
        recovers = [KeyRecover(ctx, *val, node=node) for val in batch]
        successes = 0
        failures = 0
        for recovery in recovers:
            recovery.wait()
            ok = recovery.succeeded()
            all_succeeded &= ok
            batch_stats += recovery.stats
            if ok:
                successes += 1
            else:
                failures += 1
        processed_keys += successes + failures
        batch_stats.apply(stats)
        stats.counter('recovered_keys', successes)
        ctx.stats.counter('recovered_keys', successes)
        stats.counter('recovered_keys', -failures)
        ctx.stats.counter('recovered_keys', -failures)
        stats.set_counter('recovery_speed', processed_keys / (time.time() - start))
    stats.timer('recover', 'finished')
    return all_succeeded
class KeyRecover(object):
    """Callback-driven recovery of one key across elliptics groups.

    Reads the replica chosen by run() (chunk by chunk when the record is
    larger than ctx.chunk_size) and copies it into every group whose replica
    is missing or differs.  The flow is asynchronous: __init__() kicks off
    run(); read/write completions re-enter through onread()/onwrite();
    stop() records the final result.  Callers block on wait()/succeeded().
    """

    def __init__(self, ctx, key, key_infos, missed_groups, node, callback):
        # ctx           - recovery context (settings, stats, group lists)
        # key           - elliptics key being recovered
        # key_infos     - replica metadata for groups holding the key
        #                 (presumably ordered newest-first: run() treats [0]
        #                 as authoritative -- TODO confirm against caller)
        # missed_groups - groups with no replica of the key
        # node          - elliptics node the i/o sessions are created from
        # callback      - called as callback(result, stats) on completion
        self.ctx = ctx
        self.complete = threading.Event()
        self.callback = callback
        self.stats = RecoverStat()
        self.stats_cmd = ctx.stats['commands']
        self.key = key
        self.key_flags = 0
        self.key_infos = key_infos
        self.diff_groups = []
        self.missed_groups = list(missed_groups)
        # Read session: accept replies from all groups.
        self.read_session = elliptics.newapi.Session(node)
        self.read_session.trace_id = ctx.trace_id
        self.read_session.set_filter(elliptics.filters.all)
        # Write session: cas_timestamp prevents overwriting newer data.
        self.write_session = elliptics.newapi.Session(node)
        self.write_session.trace_id = ctx.trace_id
        self.write_session.set_checker(elliptics.checkers.all)
        self.write_session.ioflags |= elliptics.io_flags.cas_timestamp
        # Remove session: used to drop corrupted replicas found during reads.
        self.remove_session = elliptics.newapi.Session(node)
        self.remove_session.trace_id = ctx.trace_id
        self.remove_session.set_filter(elliptics.filters.all_final)
        self.remove_session.ioflags |= elliptics.io_flags.cas_timestamp
        # NOTE(review): a timeout value is assigned to `timestamp` here --
        # looks intentional (cas_timestamp-guarded removal threshold), but
        # confirm against the session API.
        self.remove_session.timestamp = ctx.prepare_timeout
        self.result = False
        self.attempt = 0
        # One pending operation for the main read/write chain; corrupted-key
        # removal increments it under the lock (see onread/on_complete).
        self.pending_operations = 1
        self.pending_operations_lock = threading.Lock()
        log.debug("Recovering key: {0} from nonempty groups: {1} and missed groups: {2}"
                  .format(repr(self.key), [k.group_id for k in self.key_infos], self.missed_groups))
        self.run()

    def run(self):
        """Classify groups into same/diff/missed and start the first read."""
        self.total_size = self.key_infos[0].size
        self.chunked = self.total_size > self.ctx.chunk_size
        self.recovered_size = 0
        # Replicas are "the same" when timestamp, size and user_flags match
        # the reference replica key_infos[0].
        same_meta = lambda lhs, rhs: (lhs.timestamp, lhs.size, lhs.user_flags) == (rhs.timestamp, rhs.size, rhs.user_flags)
        same_infos = [info for info in self.key_infos if same_meta(info, self.key_infos[0])]
        self.key_flags = same_infos[0].flags
        self.same_groups = [info.group_id for info in same_infos]
        # Keep only differing replicas in key_infos so a failed read can
        # retry run() against the next-best set (see onread()).
        self.key_infos = [info for info in self.key_infos if info.group_id not in self.same_groups]
        self.diff_groups += [info.group_id for info in self.key_infos]
        self.diff_groups = list(set(self.diff_groups).difference(self.same_groups))
        if not self.diff_groups and not self.missed_groups:
            # Nothing to copy anywhere -- note: completes with result False.
            log.debug("Key: {0} already up-to-date in all groups: {1}".format(self.key, self.same_groups))
            self.stop(False)
            return
        log.debug("Try to recover key: {0} from groups: {1} to groups: {2}: diff groups: {3}, missed groups: {4}"
                  .format(self.key, self.same_groups, self.diff_groups + self.missed_groups,
                          self.diff_groups, self.missed_groups))
        self.read_session.groups = self.same_groups
        # Never write into read-only groups; just count the skips.
        groups_for_write = []
        for group in self.diff_groups + self.missed_groups:
            if group in self.ctx.ro_groups:
                self.stats.counter('skip_write_to_ro_group', 1)
                continue
            groups_for_write.append(group)
        self.write_session.groups = groups_for_write
        self.read()

    def stop(self, result):
        """Record the final result and release one pending operation."""
        self.result = result
        log.debug("Finished recovering key: {0} with result: {1}".format(self.key, self.result))
        self.on_complete()

    def on_complete(self):
        """Finish when the last pending operation (recovery or removal) ends."""
        with self.pending_operations_lock:
            self.pending_operations -= 1
            if self.pending_operations:
                return
        # Remove all sessions to release links to elliptics node. Main thread must hold last reference to
        # elliptics node after recovery completion. Otherwise, node may be destroyed in i/o thread.
        del self.read_session, self.write_session, self.remove_session
        self.complete.set()
        self.callback(self.result, self.stats)

    def read(self):
        """Issue an async read of the next piece (or the whole record)."""
        try:
            size = self.total_size
            log.debug("Reading key: {0} from groups: {1}, chunked: {2}"
                      .format(self.key, self.read_session.groups, self.chunked))
            if self.chunked:
                size = min(self.total_size - self.recovered_size, self.ctx.chunk_size)
                # do not check checksum for all but the first chunk
                if self.recovered_size != 0:
                    if self.key_flags & elliptics.record_flags.chunked_csum:
                        # if record was checksummed by chunks there is no need to disable checksum verification
                        self.read_session.ioflags &= ~elliptics.io_flags.nocsum
                    else:
                        self.read_session.ioflags |= elliptics.io_flags.nocsum
            else:
                size = min(self.total_size, size)
                self.read_session.ioflags = 0
            log.debug("Reading key: {0} from groups: {1}, chunked: {2}, offset: {3}, size: {4}, total_size: {5}"
                      .format(self.key, self.read_session.groups, self.chunked, self.recovered_size,
                              size, self.total_size))
            if self.recovered_size == 0:
                # read first chunk of data with json
                read_result = self.read_session.read(key=self.key, offset=0, size=size)
            else:
                # read none-first chunks of data without json
                read_result = self.read_session.read_data(key=self.key, offset=self.recovered_size, size=size)
            read_result.connect(self.onread)
        except Exception as e:
            log.error("Read key: {0} by offset: {1} and size: {2} raised exception: {3}, traceback: {4}"
                      .format(self.key, self.recovered_size, size, repr(e), traceback.format_exc()))
            self.stop(False)

    def write(self):
        """Write the piece read by onread(); picks the matching write call.

        Chunked records go through write_prepare (first chunk) /
        write_plain (middle chunks) / write_commit (last chunk); small
        records use a single write().
        """
        try:
            log.debug('Writing key: %s to groups: %s', repr(self.key), self.diff_groups + self.missed_groups)
            params = {'key': self.key,
                      'json': self.json,
                      'data': self.write_data,
                      'data_offset': self.recovered_size}
            if self.chunked:
                if self.recovered_size == 0:
                    # First chunk: reserve full capacity on the destination.
                    params['json_capacity'] = self.json_capacity
                    params['data_capacity'] = self.total_size
                    log.debug('Write_prepare key: %s to groups: %s, json: %d, json_capacity: %d, '
                              'data: %d, data_offset: %d, data_capacity: %d',
                              params['key'], self.write_session.groups, len(params['json']),
                              params['json_capacity'], len(params['data']), params['data_offset'],
                              params['data_capacity'])
                    write_result = self.write_session.write_prepare(**params)
                elif self.recovered_size + len(params['data']) < self.total_size:
                    # Middle chunk: plain append into the prepared record.
                    log.debug('Write_plain key: %s to groups: %s, json: %d, data: %d, data_offset: %d',
                              params['key'], self.write_session.groups, len(params['json']),
                              len(params['data']), params['data_offset'])
                    write_result = self.write_session.write_plain(**params)
                else:
                    # Last chunk: commit with the final record size.
                    params['data_commit_size'] = self.total_size
                    log.debug('Write_commit key: %s to groups: %s, json: %d, data: %d, data_offset: %d'
                              'data_commit_size: %d',
                              params['key'], self.write_session.groups, len(params['json']),
                              len(params['data']), params['data_offset'], params['data_commit_size'])
                    write_result = self.write_session.write_commit(**params)
            else:
                # Whole record fits in one write; no offset needed.
                del params['data_offset']
                params['json_capacity'] = self.json_capacity
                params['data_capacity'] = self.total_size
                log.debug('Write key: %s to groups: %s, json: %d, json_capacity: %d, '
                          'data: %d, data_capacity: %d',
                          params['key'], self.write_session.groups, len(params['json']),
                          params['json_capacity'], len(params['data']), params['data_capacity'])
                write_result = self.write_session.write(**params)
            write_result.connect(self.onwrite)
        except Exception as e:
            log.error('Writing key: %s raised exception: %s, traceback: %s',
                      self.key, repr(e), traceback.format_exc())
            self.stop(False)

    def onread(self, results, error):
        """Read completion: schedule corrupted-replica removal, handle
        errors/retries, then capture the data and start the write."""
        try:
            # Replicas that failed with EILSEQ are corrupted; remove them
            # (except in read-only groups) as a side operation.
            corrupted_groups = []
            for result in results:
                if result.status != -errno.EILSEQ:
                    continue
                if result.group_id in self.ctx.ro_groups:
                    self.stats.counter('skip_remove_corrupted_key_from_ro_group', 1)
                    continue
                corrupted_groups.append(result.group_id)
            if corrupted_groups:
                with self.pending_operations_lock:
                    self.pending_operations += 1
                KeyRemover(self.key, self.total_size, self.remove_session, corrupted_groups,
                           self.ctx, self.stats, self.on_complete).remove()
            if error.code:
                log.error("Failed to read key: {0} from groups: {1}: {2}".format(self.key, self.same_groups, error))
                self.stats_cmd.counter('read.{0}'.format(error.code), 1)
                self.stats.read_failed += len(results)
                if error.code == -errno.ETIMEDOUT:
                    # Timeout: retry with a doubled timeout up to ctx.attempts.
                    if self.attempt < self.ctx.attempts:
                        self.attempt += 1
                        old_timeout = self.read_session.timeout
                        self.read_session.timeout *= 2
                        log.error("Read has been timed out. Try to reread key: {0} from groups: {1}, attempt: {2}/{3} "
                                  "with increased timeout: {4}/{5}"
                                  .format(self.key, self.same_groups, self.attempt, self.ctx.attempts,
                                          self.read_session.timeout, old_timeout))
                        self.read()
                    else:
                        # NOTE(review): format placeholders look off here --
                        # {1} is reused and only two args are given; message
                        # bug left as-is (doc-only change).
                        log.error("Read has been timed out {0} times, all {1} attempts are used. "
                                  "The key: {1} can't be recovery now. Skip it"
                                  .format(self.attempt, self.key))
                        self.stats.skipped += 1
                        self.stop(False)
                elif len(self.key_infos) > 1:
                    # Other (differing) replicas remain: re-run classification
                    # and try to recover from them instead.
                    log.error("Key: {0} has available replicas in other groups. Try to recover the key from them"
                              .format(self.key))
                    self.diff_groups += self.read_session.groups
                    self.run()
                else:
                    log.error("Failed to read key: {0} from any available group. "
                              "This key can't be recovered now. Skip it"
                              .format(self.key))
                    self.stats.skipped += 1
                    self.stop(False)
                return
            # Success path: the last result carries the data actually read.
            self.stats.read_failed += len(results) - 1
            self.stats.read += 1
            self.stats.read_bytes += results[-1].size
            if self.recovered_size == 0:
                # First chunk: pin write metadata to the source record and
                # lock further reads to the group that answered.
                self.write_session.user_flags = results[-1].record_info.user_flags
                self.write_session.timestamp = results[-1].record_info.data_timestamp
                self.write_session.json_timestamp = results[-1].record_info.json_timestamp
                self.read_session.ioflags |= elliptics.io_flags.nocsum
                self.read_session.groups = [results[-1].group_id]
                self.key_flags = results[-1].record_info.record_flags
                self.json_capacity = results[-1].record_info.json_capacity
                # The real data size may differ from the iterator's estimate.
                if self.total_size != results[-1].record_info.data_size:
                    self.total_size = results[-1].record_info.data_size
                    self.chunked = self.total_size > self.ctx.chunk_size
            self.attempt = 0
            if self.chunked and len(results) > 1:
                # Groups that failed this chunked read also need the copy.
                self.missed_groups += [r.group_id for r in results if r.error.code]
            log.debug("Copy key: %s from groups: %s to groups: %s",
                      repr(self.key), self.same_groups, self.write_session.groups)
            self.write_data = results[-1].data
            self.json = results[-1].json
            self.write()
        except Exception as e:
            log.error("Failed to handle origin key: {0}, exception: {1}, traceback: {2}"
                      .format(self.key, repr(e), traceback.format_exc()))
            self.stop(False)

    def onwrite(self, results, error):
        """Write completion: retry on timeout/ENXIO, else advance the offset
        and either read the next chunk or finish successfully."""
        try:
            if error.code:
                self.stats_cmd.counter('write.{0}'.format(error.code), 1)
                self.stats.write_failed += 1
                log.error("Failed to write key: {0}: to groups: {1}: {2}"
                          .format(self.key, self.write_session.groups, error))
                # Retry transient failures with a doubled timeout.
                if self.attempt < self.ctx.attempts and error.code in (-errno.ETIMEDOUT, -errno.ENXIO):
                    old_timeout = self.write_session.timeout
                    self.write_session.timeout *= 2
                    self.attempt += 1
                    log.info("Retry to write key: {0} attempts: {1}/{2} increased timeout: {3}/{4}"
                             .format(repr(self.key), self.attempt, self.ctx.attempts,
                                     self.write_session.timeout, old_timeout))
                    self.stats.write_retries += 1
                    self.write()
                    return
                self.stop(False)
                return
            self.stats.write += len(results)
            self.recovered_size += len(self.write_data)
            self.stats.written_bytes += len(self.write_data) * len(results)
            self.attempt = 0
            if self.recovered_size < self.total_size:
                # More chunks to go: loop back to read().
                self.read()
            else:
                log.debug("Key: {0} has been successfully copied to groups: {1}"
                          .format(repr(self.key), [r.group_id for r in results]))
                self.stop(True)
        except Exception as e:
            log.error("Failed to handle write result key: {0}: {1}, traceback: {2}"
                      .format(self.key, repr(e), traceback.format_exc()))
            self.stop(False)

    def wait(self):
        """Block until recovery of this key has completed."""
        if not self.complete.is_set():
            self.complete.wait()

    def succeeded(self):
        """Wait for completion and return the final result flag."""
        self.wait()
        return self.result