def task(ctx, config):
    """
    Check the monitor health report for a MON_CLOCK_SKEW warning.

    Waits until the monitor quorum has as many members as
    teuthology.get_mon_names() reports, sleeps for ``interval`` seconds,
    then fetches the monitor health and compares the presence of the
    ``MON_CLOCK_SKEW`` check with what the configuration expects.

    Recognised ``config`` keys:

    - ``interval``: seconds to sleep before reading health (default 30.0)
    - ``expect-skew``: whether the warning is expected (default False)

    :param ctx: context object handed to the teuthology helpers
    :param config: task configuration dict, or None for the defaults
    :raises RuntimeError: if the warning is present when not expected, or
                          absent when expected
    """
    config = {} if config is None else config
    assert isinstance(config, dict), \
        'mon_clock_skew_check task only accepts a dict for configuration'

    sleep_secs = float(config.get('interval', 30.0))
    skew_expected = config.get('expect-skew', False)

    log.info('Beginning mon_clock_skew_check...')
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
    manager = ceph_manager.CephManager(
        mon_remote,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    # block until the quorum has one member per configured monitor
    manager.wait_for_mon_quorum_size(len(teuthology.get_mon_names(ctx)))

    # wait a bit
    log.info('sleeping for {s} seconds'.format(s=sleep_secs))
    time.sleep(sleep_secs)

    health = manager.get_mon_health(True)
    log.info('got health %s' % health)

    skew_reported = 'MON_CLOCK_SKEW' in health['checks']
    if skew_expected and not skew_reported:
        raise RuntimeError('expected MON_CLOCK_SKEW but got none')
    if not skew_expected and skew_reported:
        raise RuntimeError('got MON_CLOCK_SKEW but expected none')
def test_get_mon_names():
    """
    get_mon_names() returns only the monitor roles of the requested cluster.

    Each case is (roles per remote, cluster name, expected monitor names).
    """
    cases = [
        ([['mon.a', 'osd.0', 'mon.c']],
         'ceph',
         ['mon.a', 'mon.c']),
        ([['ceph.mon.a', 'osd.0', 'ceph.mon.c']],
         'ceph',
         ['ceph.mon.a', 'ceph.mon.c']),
        ([['mon.a', 'osd.0', 'mon.c'], ['ceph.mon.b']],
         'ceph',
         ['mon.a', 'mon.c', 'ceph.mon.b']),
        ([['mon.a', 'osd.0', 'mon.c'], ['foo.mon.a']],
         'ceph',
         ['mon.a', 'mon.c']),
        ([['mon.a', 'osd.0', 'mon.c'], ['foo.mon.a']],
         'foo',
         ['foo.mon.a']),
    ]
    for roles_by_remote, cluster, wanted in cases:
        ctx = argparse.Namespace()
        ctx.cluster = Mock()
        # remotes are keyed by their position in the case's role list
        ctx.cluster.remotes = dict(enumerate(roles_by_remote))
        found = misc.get_mon_names(ctx, cluster)
        assert wanted == found
def _get_next_port(ctx, ip, cluster):
    """
    Return the lowest port, starting at 6789, that no monitor uses on ``ip``.

    The address of every monitor named by teuthology.get_mon_names() is
    read from ``ctx.ceph[cluster].conf[name]['mon addr']`` and split into a
    host part and a port. Only monitors whose host part equals ``ip`` are
    considered.

    :param ctx: context object providing ``ctx.ceph[cluster].conf``
    :param ip: host part to match, compared as a string
    :param cluster: cluster name used for both lookups
    :returns: first free port number (int) at or above 6789
    """
    # assuming we have only one cluster here.
    used = set()
    for name in teuthology.get_mon_names(ctx, cluster):
        addr = ctx.ceph[cluster].conf[name]['mon addr']
        # split on the last colon only, so a host part that itself contains
        # colons does not break the two-way unpacking
        mon_ip, mon_port = addr.rsplit(':', 1)
        if mon_ip == ip:
            used.add(int(mon_port))

    # Probe upwards from the default port. Stopping at the first used port
    # that differs from the candidate would hand back 6789 while it is taken
    # whenever some monitor listens on a lower port.
    port = 6789
    while port in used:
        port += 1
    return port
def _get_mons(ctx):
    """
    Return the monitor ids derived from the context's monitor names.

    Each name reported by teuthology.get_mon_names() has its first
    len('mon.') characters dropped; the names are assumed to start with
    that prefix, no check is made.
    """
    prefix_len = len('mon.')
    mon_ids = []
    for mon_name in teuthology.get_mon_names(ctx):
        mon_ids.append(mon_name[prefix_len:])
    return mon_ids
def _get_mons(ctx):
    """
    List monitor ids for the monitors named in ``ctx``.

    The leading len('mon.') characters are sliced off every name returned
    by teuthology.get_mon_names(); the prefix itself is not verified.
    """
    skip = len('mon.')
    return [role[skip:] for role in teuthology.get_mon_names(ctx)]
def task(ctx, config):
    """
    Test monitor recovery.

    Phases, each followed by a wait for the expected quorum size:

    1. verify every monitor reports state 'leader' or 'peon' and a full
       quorum;
    2. kill and revive each monitor in turn;
    3. in forward and then reverse order: kill all monitors, revive them
       one by one, waiting once a majority is back;
    4. for the first two monitors: kill one, generate cluster log traffic,
       then revive it.

    :param ctx: context object handed to the teuthology helpers
    :param config: task configuration dict, or None for an empty one
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    # monitor id is the second dot-separated field of the role name
    mons = [f.split('.')[1] for f in teuthology.get_mon_names(ctx)]
    log.info("mon ids = %s" % mons)

    manager.wait_for_mon_quorum_size(len(mons))

    log.info('verifying all monitors are in the quorum')
    for m in mons:
        s = manager.get_mon_status(m)
        assert s['state'] in ('leader', 'peon')
        assert len(s['quorum']) == len(mons)

    log.info('restarting each monitor in turn')
    for m in mons:
        # stop a monitor
        manager.kill_mon(m)
        manager.wait_for_mon_quorum_size(len(mons) - 1)
        # restart
        manager.revive_mon(m)
        manager.wait_for_mon_quorum_size(len(mons))

    # in forward and reverse order,
    # (the reversed order must be a copy: reversing ``mons`` in place would
    # make both passes run in the same, reversed, order)
    for order in (mons, list(reversed(mons))):
        log.info('stopping all monitors')
        for m in order:
            manager.kill_mon(m)

        log.info('forming a minimal quorum for %s, then adding monitors' % order)
        qnum = (len(order) // 2) + 1
        num = 0
        for m in order:
            manager.revive_mon(m)
            num += 1
            # nothing to wait for until a majority has been revived
            if num >= qnum:
                manager.wait_for_mon_quorum_size(num)

    # on both leader and non-leader ranks...
    # NOTE(review): presumably mons[0] is the leader and mons[1] a peon;
    # confirm against how monitor ranks are assigned.
    for rank in [0, 1]:
        # take one out
        log.info('removing mon %s' % mons[rank])
        manager.kill_mon(mons[rank])
        manager.wait_for_mon_quorum_size(len(mons) - 1)

        log.info('causing some monitor log activity')
        total_msgs = 30
        for n in range(1, total_msgs):
            manager.raw_cluster_cmd('log', '%d of %d' % (n, total_msgs))

        log.info('adding mon %s back in' % mons[rank])
        manager.revive_mon(mons[rank])
        manager.wait_for_mon_quorum_size(len(mons))
def do_check(self):
    """
    Poll the monitors' timechecks and record every clock skew found.

    Loops while ``self.stopping`` is false. When ``self.at_least_once`` is
    set, it keeps looping after a stop request until one usable round has
    been processed, bounded by ``self.at_least_once_timeout`` seconds when
    that value is positive.

    A monitor whose absolute skew exceeds ``self.max_skew`` is stored in
    the skew table under its name. After the loop the table is compared
    with ``self.expect_skew``.

    :raises AssertionError: on an inconsistent timecheck report, on the
        at-least-once timeout, or when the skews found do not match
        ``self.expect_skew`` and ``self.never_fail`` is false
    """
    self.info('start checking for clock skews')
    skews = dict()
    ran_once = False

    started_on = None

    while not self.stopping or (self.at_least_once and not ran_once):

        if self.at_least_once and not ran_once and self.stopping:
            # Stop was requested before any usable round was seen: start
            # the clock on the first such pass, enforce the limit after.
            if started_on is None:
                self.info('kicking-off timeout (if any)')
                started_on = time.time()
            elif self.at_least_once_timeout > 0.0:
                assert time.time() - started_on < self.at_least_once_timeout, \
                    'failed to obtain a timecheck before timeout expired'

        quorum_size = len(teuthology.get_mon_names(self.ctx))
        self.manager.wait_for_mon_quorum_size(quorum_size)

        health = self.manager.get_mon_health(True)
        timechecks = health['timechecks']

        # True only when the report comes from a finished round
        clean_check = False

        if timechecks['round_status'] == 'finished':
            assert (timechecks['round'] % 2) == 0, \
                'timecheck marked as finished but round ' \
                'disagrees (r {r})'.format(
                    r=timechecks['round'])
            clean_check = True
        else:
            assert timechecks['round_status'] == 'on-going', \
                'timecheck status expected \'on-going\' ' \
                'but found \'{s}\' instead'.format(
                    s=timechecks['round_status'])
            if 'mons' in timechecks and len(timechecks['mons']) > 1:
                self.info('round still on-going, but there are available reports')
            else:
                self.info('no timechecks available just yet')
                self.sleep_interval()
                continue

        assert len(timechecks['mons']) > 1, \
            'there are not enough reported timechecks; ' \
            'expected > 1 found {n}'.format(n=len(timechecks['mons']))

        for check in timechecks['mons']:
            mon_skew = float(check['skew'])
            mon_health = check['health']
            mon_id = check['name']
            if abs(mon_skew) > self.max_skew:
                assert mon_health == 'HEALTH_WARN', \
                    'mon.{id} health is \'{health}\' but skew {s} > max {ms}'.format(
                        id=mon_id, health=mon_health, s=abs(mon_skew), ms=self.max_skew)

                log_str = 'mon.{id} with skew {s} > max {ms}'.format(
                    id=mon_id, s=abs(mon_skew), ms=self.max_skew)

                # add to skew list (the signed value is kept)
                details = check['details']
                skews[mon_id] = {'skew': mon_skew, 'details': details}

                if self.expect_skew:
                    self.info('expected skew: {str}'.format(str=log_str))
                else:
                    self.warn('unexpected skew: {str}'.format(str=log_str))

        # An on-going round only counts as a completed pass when a skew was
        # expected and at least one has been recorded.
        if clean_check or (self.expect_skew and len(skews) > 0):
            ran_once = True
            self.print_skews(skews)
        self.sleep_interval()

    total = len(skews)
    self.print_skews(skews)

    error_str = ''
    found_error = False

    if self.expect_skew:
        if total == 0:
            error_str = 'We were expecting a skew, but none was found!'
            found_error = True
    else:
        if total > 0:
            error_str = 'We were not expecting a skew, but we did find it!'
            found_error = True

    if found_error:
        self.info(error_str)
        if not self.never_fail:
            # raised explicitly: ``assert False`` is stripped under -O and
            # would let a failed check pass silently
            raise AssertionError(error_str)
def _get_mons(ctx):
    """
    Return monitor ids for every monitor name known to ``ctx``.

    The names come from teuthology.get_mon_names(); the first len('mon.')
    characters of each are removed without checking what they are.
    """
    full_names = teuthology.get_mon_names(ctx)
    start = len('mon.')
    mon_ids = [full_name[start:] for full_name in full_names]
    return mon_ids
def task(ctx, config):
    """
    Test monitor recovery.

    Phases, each followed by a wait for the expected quorum size:

    1. verify every monitor reports state 'leader' or 'peon' and a full
       quorum;
    2. kill and revive each monitor in turn;
    3. in forward and then reverse order: kill all monitors, revive them
       one by one, waiting once a majority is back;
    4. for the first two monitors: kill one, generate cluster log traffic,
       then revive it.

    :param ctx: context object handed to the teuthology helpers
    :param config: task configuration dict, or None for an empty one
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    # keys() rather than iterkeys(): the latter exists on Python 2 only
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    # monitor id is the second dot-separated field of the role name
    mons = [f.split('.')[1] for f in teuthology.get_mon_names(ctx)]
    log.info("mon ids = %s" % mons)

    mon_count = len(mons)
    manager.wait_for_mon_quorum_size(mon_count)

    log.info('verifying all monitors are in the quorum')
    for m in mons:
        s = manager.get_mon_status(m)
        assert s['state'] in ('leader', 'peon')
        assert len(s['quorum']) == mon_count

    log.info('restarting each monitor in turn')
    for m in mons:
        # stop a monitor
        manager.kill_mon(m)
        manager.wait_for_mon_quorum_size(mon_count - 1)
        # restart
        manager.revive_mon(m)
        manager.wait_for_mon_quorum_size(mon_count)

    # in forward and reverse order,
    # (slicing makes a reversed copy; reversing ``mons`` in place would run
    # both passes in the same, reversed, order)
    for order in (mons, mons[::-1]):
        log.info('stopping all monitors')
        for m in order:
            manager.kill_mon(m)

        log.info('forming a minimal quorum for %s, then adding monitors' % order)
        # floor division keeps the majority size an int on Python 3 as well
        qnum = (len(order) // 2) + 1
        num = 0
        for m in order:
            manager.revive_mon(m)
            num += 1
            # nothing to wait for until a majority has been revived
            if num >= qnum:
                manager.wait_for_mon_quorum_size(num)

    # on both leader and non-leader ranks...
    # NOTE(review): presumably mons[0] is the leader and mons[1] a peon;
    # confirm against how monitor ranks are assigned.
    for rank in [0, 1]:
        # take one out
        log.info('removing mon %s' % mons[rank])
        manager.kill_mon(mons[rank])
        manager.wait_for_mon_quorum_size(mon_count - 1)

        log.info('causing some monitor log activity')
        total_msgs = 30
        for n in range(1, total_msgs):
            manager.raw_cluster_cmd('log', '%d of %d' % (n, total_msgs))

        log.info('adding mon %s back in' % mons[rank])
        manager.revive_mon(mons[rank])
        manager.wait_for_mon_quorum_size(mon_count)
def do_check(self):
    """
    Poll the monitors' timechecks and record every clock skew found.

    Loops while ``self.stopping`` is false. When ``self.at_least_once`` is
    set, it keeps looping after a stop request until a non-empty timecheck
    list has been processed. If ``self.at_least_once_timeout`` is positive,
    that extra looping is bounded by it, measured from the start of this
    call.

    A monitor whose absolute skew exceeds ``self.max_skew`` is stored in
    the skew table under its name. After the loop the table is logged and
    compared with ``self.expect_skew``.

    :raises AssertionError: when a skewed monitor is not in HEALTH_WARN, on
        the at-least-once timeout, or when the skews found do not match
        ``self.expect_skew`` and ``self.never_fail`` is false
    """
    self.info("start checking for clock skews")
    skews = dict()
    ran_once = False
    started_on = time.time()

    while not self.stopping or (self.at_least_once and not ran_once):

        if self.at_least_once and not ran_once and self.stopping:
            if self.at_least_once_timeout > 0.0:
                assert (
                    time.time() - started_on < self.at_least_once_timeout
                ), "failed to obtain a timecheck before timeout expired"

        quorum_size = len(teuthology.get_mon_names(self.ctx))
        self.manager.wait_for_mon_quorum_size(quorum_size)

        health = self.manager.get_mon_health(True)
        timechecks = health["timechecks"]

        for timecheck in timechecks:
            mon_skew = float(timecheck["skew"])
            mon_health = timecheck["health"]
            mon_id = timecheck["name"]
            # compare the magnitude so a negative skew is caught as well
            if abs(mon_skew) > self.max_skew:
                # ``health`` must be supplied: the template references it,
                # and a missing field raises KeyError instead of the
                # intended AssertionError
                assert mon_health == "HEALTH_WARN", \
                    "mon.{id} health is '{health}' but skew {s} > max {ms}".format(
                        id=mon_id, health=mon_health,
                        s=abs(mon_skew), ms=self.max_skew)

                log_str = "mon.{id} with skew {s} > max {ms}".format(
                    id=mon_id, s=abs(mon_skew), ms=self.max_skew)

                # add to skew list (the signed value is kept)
                details = timecheck["details"]
                skews[mon_id] = {"skew": mon_skew, "details": details}

                if self.expect_skew:
                    self.info("expected skew: {str}".format(str=log_str))
                else:
                    self.warn("unexpected skew: {str}".format(str=log_str))

        if len(timechecks) == 0:
            self.info("no timechecks available just yet")
        else:
            ran_once = True

        if self.check_interval > 0.0:
            time.sleep(self.check_interval)

    total = len(skews)
    if total > 0:
        self.info("---------- found {n} skews ----------".format(n=total))
        # items() works on Python 2 and 3; iteritems() is Python 2 only
        for mon_id, values in skews.items():
            self.info("mon.{id}: {v}".format(id=mon_id, v=values))
        self.info("-------------------------------------")
    else:
        self.info("---------- no skews were found ----------")

    error_str = ""
    found_error = False

    if self.expect_skew:
        if total == 0:
            error_str = "We were expecting a skew, but none was found!"
            found_error = True
    else:
        if total > 0:
            error_str = "We were not expecting a skew, but we did find it!"
            found_error = True

    if found_error:
        self.info(error_str)
        if not self.never_fail:
            # raised explicitly: ``assert False`` is stripped under -O and
            # would let a failed check pass silently
            raise AssertionError(error_str)
def do_check(self):
    """
    Clock skew checker. Loops until finish() is called.

    Each pass waits for a quorum sized by teuthology.get_mon_names(),
    fetches the monitor health and inspects its ``timechecks`` report:

    - a 'finished' round must have an even round number and counts as a
      clean check;
    - an 'on-going' round is inspected only if more than one monitor has
      reported, otherwise the pass sleeps and retries.

    A monitor whose absolute skew exceeds ``self.max_skew`` must be in
    HEALTH_WARN and is stored in the skew table under its name. When
    ``self.at_least_once`` is set, the loop outlives a stop request until
    one pass has completed, bounded by ``self.at_least_once_timeout``
    seconds when that value is positive.

    :raises AssertionError: on an inconsistent timecheck report, on the
        at-least-once timeout, or when the skews found do not match
        ``self.expect_skew`` and ``self.never_fail`` is false
    """
    self.info('start checking for clock skews')
    skews = dict()
    ran_once = False

    started_on = None

    while not self.stopping or (self.at_least_once and not ran_once):

        if self.at_least_once and not ran_once and self.stopping:
            # The timeout clock starts on the first pass made after the
            # stop request, not when checking began.
            if started_on is None:
                self.info('kicking-off timeout (if any)')
                started_on = time.time()
            elif self.at_least_once_timeout > 0.0:
                assert time.time() - started_on < self.at_least_once_timeout, \
                    'failed to obtain a timecheck before timeout expired'

        quorum_size = len(teuthology.get_mon_names(self.ctx))
        self.manager.wait_for_mon_quorum_size(quorum_size)

        health = self.manager.get_mon_health(True)
        timechecks = health['timechecks']

        clean_check = False

        if timechecks['round_status'] == 'finished':
            assert (timechecks['round'] % 2) == 0, \
                'timecheck marked as finished but round ' \
                'disagrees (r {r})'.format(
                    r=timechecks['round'])
            clean_check = True
        else:
            assert timechecks['round_status'] == 'on-going', \
                'timecheck status expected \'on-going\' ' \
                'but found \'{s}\' instead'.format(
                    s=timechecks['round_status'])
            # plain membership test; no need to build a keys() view
            if 'mons' in timechecks and len(timechecks['mons']) > 1:
                self.info('round still on-going, but there are available reports')
            else:
                self.info('no timechecks available just yet')
                self.sleep_interval()
                continue

        assert len(timechecks['mons']) > 1, \
            'there are not enough reported timechecks; ' \
            'expected > 1 found {n}'.format(n=len(timechecks['mons']))

        for check in timechecks['mons']:
            mon_skew = float(check['skew'])
            mon_health = check['health']
            mon_id = check['name']
            if abs(mon_skew) > self.max_skew:
                assert mon_health == 'HEALTH_WARN', \
                    'mon.{id} health is \'{health}\' but skew {s} > max {ms}'.format(
                        id=mon_id, health=mon_health, s=abs(mon_skew), ms=self.max_skew)

                log_str = 'mon.{id} with skew {s} > max {ms}'.format(
                    id=mon_id, s=abs(mon_skew), ms=self.max_skew)

                # add to skew list
                details = check['details']
                skews[mon_id] = {'skew': mon_skew, 'details': details}

                if self.expect_skew:
                    self.info('expected skew: {str}'.format(str=log_str))
                else:
                    self.warn('unexpected skew: {str}'.format(str=log_str))

        # A pass is complete after a finished round, or as soon as an
        # expected skew has been recorded.
        if clean_check or (self.expect_skew and len(skews) > 0):
            ran_once = True
            self.print_skews(skews)
        self.sleep_interval()

    total = len(skews)
    self.print_skews(skews)

    error_str = ''
    found_error = False

    if self.expect_skew:
        if total == 0:
            error_str = 'We were expecting a skew, but none was found!'
            found_error = True
    else:
        if total > 0:
            error_str = 'We were not expecting a skew, but we did find it!'
            found_error = True

    if found_error:
        self.info(error_str)
        if not self.never_fail:
            # explicit raise: ``assert False`` disappears under -O, which
            # would turn a failed check into a silent pass
            raise AssertionError(error_str)