示例#1
0
def task(ctx, config):
    """
    Check the monitor health report for a MON_CLOCK_SKEW warning.

    Waits until the monitor quorum has as many members as
    teuthology.get_mon_names(ctx) returns, sleeps for the configured
    interval, then inspects the 'checks' section of the health report.

    Recognised ``config`` keys (``None`` is treated as an empty dict):

    - ``interval``: seconds to sleep before reading health (default 30.0)
    - ``expect-skew``: whether MON_CLOCK_SKEW must be present (default False)

    Raises RuntimeError when the presence of MON_CLOCK_SKEW does not match
    ``expect-skew``.
    """
    config = {} if config is None else config
    assert isinstance(config, dict), \
        'mon_clock_skew_check task only accepts a dict for configuration'
    interval = float(config.get('interval', 30.0))
    expect_skew = config.get('expect-skew', False)

    log.info('Beginning mon_clock_skew_check...')
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon, ) = ctx.cluster.only(first_mon).remotes.keys()
    manager = ceph_manager.CephManager(
        mon, ctx=ctx, logger=log.getChild('ceph_manager'))

    expected_quorum = len(teuthology.get_mon_names(ctx))
    manager.wait_for_mon_quorum_size(expected_quorum)

    # give the monitors some time before looking at their health
    log.info('sleeping for {s} seconds'.format(s=interval))
    time.sleep(interval)

    health = manager.get_mon_health(True)
    log.info('got health %s' % health)

    skew_reported = 'MON_CLOCK_SKEW' in health['checks']
    if expect_skew and not skew_reported:
        raise RuntimeError('expected MON_CLOCK_SKEW but got none')
    if skew_reported and not expect_skew:
        raise RuntimeError('got MON_CLOCK_SKEW but expected none')
示例#2
0
def test_get_mon_names():
    """
    misc.get_mon_names() returns only the mon roles of the requested
    cluster, in remote order; roles without a cluster prefix are matched
    when the cluster is 'ceph'.
    """
    # Each case: (roles per remote, cluster to query, mon roles expected).
    cases = (
        ([['mon.a', 'osd.0', 'mon.c']],
         'ceph',
         ['mon.a', 'mon.c']),
        ([['ceph.mon.a', 'osd.0', 'ceph.mon.c']],
         'ceph',
         ['ceph.mon.a', 'ceph.mon.c']),
        ([['mon.a', 'osd.0', 'mon.c'], ['ceph.mon.b']],
         'ceph',
         ['mon.a', 'mon.c', 'ceph.mon.b']),
        ([['mon.a', 'osd.0', 'mon.c'], ['foo.mon.a']],
         'ceph',
         ['mon.a', 'mon.c']),
        ([['mon.a', 'osd.0', 'mon.c'], ['foo.mon.a']],
         'foo',
         ['foo.mon.a']),
    )
    for roles_per_remote, cluster, wanted in cases:
        ctx = argparse.Namespace()
        ctx.cluster = Mock()
        # The remotes are stood in for by their position in the case.
        ctx.cluster.remotes = dict(enumerate(roles_per_remote))
        found = misc.get_mon_names(ctx, cluster)
        assert wanted == found
示例#3
0
def _get_next_port(ctx, ip, cluster):
    """
    Return the lowest port >= 6789 that no monitor on ``ip`` is using.

    The 'mon addr' entry ('<ip>:<port>') of every monitor of ``cluster`` is
    read from ``ctx.ceph[cluster].conf``; monitors whose address differs
    from ``ip`` are ignored.
    """
    # assuming we have only one cluster here.
    used = set()
    for name in teuthology.get_mon_names(ctx, cluster):
        addr = ctx.ceph[cluster].conf[name]['mon addr']
        # Split on the last colon only, so a host part that itself contains
        # colons does not break the two-way unpacking.
        mon_ip, mon_port = addr.rsplit(':', 1)
        if mon_ip == ip:
            used.add(int(mon_port))

    # Probe by membership rather than walking the sorted ports: the walk
    # stopped at the first entry that was not 6789, so a used port below
    # 6789 (or a duplicated entry) made it hand back a port already taken.
    port = 6789
    while port in used:
        port += 1
    return port
示例#4
0
def _get_next_port(ctx, ip, cluster):
    """
    Pick the first free monitor port on ``ip``, starting at 6789.

    Ports in use are collected from the 'mon addr' setting
    ('<ip>:<port>') of each monitor that teuthology.get_mon_names() lists
    for ``cluster``; only monitors whose address equals ``ip`` count.
    """
    # assuming we have only one cluster here.
    taken = set()
    for name in teuthology.get_mon_names(ctx, cluster):
        addr = ctx.ceph[cluster].conf[name]['mon addr']
        # rsplit keeps any extra colons in the host part instead of
        # raising on the unpacking.
        mon_ip, mon_port = addr.rsplit(':', 1)
        if mon_ip != ip:
            continue
        taken.add(int(mon_port))

    # A set lookup is used on purpose: comparing against a sorted list and
    # bailing out on the first mismatch returned 6789 even when it was in
    # use, as soon as any smaller port (or a duplicate) was present.
    port = 6789
    while port in taken:
        port += 1
    return port
示例#5
0
def _get_mons(ctx):
    """
    Get monitor ids from the context value.

    Role names reported by teuthology.get_mon_names(ctx) are reduced to
    their id: 'mon.a' becomes 'a'.  A cluster-qualified role such as
    'ceph.mon.a' has its '<cluster>.' prefix dropped first, so it also
    yields 'a' rather than the '.mon.a' a fixed-width slice would give.
    """
    prefix = 'mon.'
    mons = []
    for role in teuthology.get_mon_names(ctx):
        if not role.startswith(prefix):
            # NOTE(review): assumes any role not starting with 'mon.' has
            # the form '<cluster>.mon.<id>' -- confirm against
            # get_mon_names().
            role = role.partition('.')[2]
        mons.append(role[len(prefix):])
    return mons
示例#6
0
def _get_mons(ctx):
  """
  Return the ids of the monitors listed by teuthology.get_mon_names(ctx).

  'mon.a' gives 'a'.  A cluster-qualified role ('ceph.mon.a') is stripped
  of its '<cluster>.' part first; slicing off a fixed four characters
  would otherwise leave '.mon.a'.
  """
  prefix = 'mon.'
  mons = []
  for role in teuthology.get_mon_names(ctx):
    if not role.startswith(prefix):
      # NOTE(review): assumes '<cluster>.mon.<id>' for roles that do not
      # start with 'mon.' -- confirm against get_mon_names().
      role = role.partition('.')[2]
    mons.append(role[len(prefix):])
  return mons
示例#7
0
def task(ctx, config):
    """
    Test monitor recovery.

    Phases, each followed by a wait for the expected quorum size:

    1. check that every monitor reports 'leader' or 'peon' and sees a
       quorum as large as the monitor list
    2. kill and revive each monitor in turn
    3. kill all monitors and revive them one by one, first in forward and
       then in reverse order
    4. take one monitor out, generate some cluster log traffic and bring it
       back; done for the first two monitors of the reverse ordering

    ``config`` must be a dict or None (treated as an empty dict).
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon, ) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    mons = []
    for role in teuthology.get_mon_names(ctx):
        if not role.startswith('mon.'):
            # Cluster-qualified role ('<cluster>.mon.<id>'): drop the
            # cluster part, otherwise the id would come out as 'mon'.
            # NOTE(review): role format assumed -- confirm.
            role = role.partition('.')[2]
        mons.append(role.split('.')[1])
    log.info("mon ids = %s" % mons)

    manager.wait_for_mon_quorum_size(len(mons))

    log.info('verifying all monitors are in the quorum')
    for m in mons:
        s = manager.get_mon_status(m)
        assert s['state'] == 'leader' or s['state'] == 'peon'
        assert len(s['quorum']) == len(mons)

    log.info('restarting each monitor in turn')
    for m in mons:
        # stop a monitor
        manager.kill_mon(m)
        manager.wait_for_mon_quorum_size(len(mons) - 1)

        # restart
        manager.revive_mon(m)
        manager.wait_for_mon_quorum_size(len(mons))

    # in forward and reverse order,
    # The reverse ordering must be a copy: reversing an alias of ``mons``
    # in place made both passes run in the same (reversed) order.
    rmons = list(reversed(mons))
    for order in (mons, rmons):
        log.info('stopping all monitors')
        for m in order:
            manager.kill_mon(m)

        log.info('forming a minimal quorum for %s, then adding monitors' %
                 order)
        qnum = (len(order) // 2) + 1
        num = 0
        for m in order:
            manager.revive_mon(m)
            num += 1
            if num >= qnum:
                manager.wait_for_mon_quorum_size(num)

    # on both leader and non-leader ranks...
    # The previous loop used to leave ``mons`` bound to the reverse
    # ordering; keep indexing that ordering so the same monitors are picked.
    ranked = rmons
    for rank in [0, 1]:
        # take one out
        log.info('removing mon %s' % ranked[rank])
        manager.kill_mon(ranked[rank])
        manager.wait_for_mon_quorum_size(len(ranked) - 1)

        log.info('causing some monitor log activity')
        m = 30
        for n in range(1, m):
            manager.raw_cluster_cmd('log', '%d of %d' % (n, m))

        log.info('adding mon %s back in' % ranked[rank])
        manager.revive_mon(ranked[rank])
        manager.wait_for_mon_quorum_size(len(ranked))
示例#8
0
  def do_check(self):
    """
    Poll the monitors' timechecks for clock skews until told to stop.

    Each pass waits for the monitor quorum, reads the 'timechecks' section
    of the health report and records every monitor whose absolute skew
    exceeds self.max_skew.  The loop runs while self.stopping is false;
    with self.at_least_once set it keeps going after a stop request until
    one usable check was made, bounded by self.at_least_once_timeout when
    that value is positive.

    Asserts when the recorded skews contradict self.expect_skew, unless
    self.never_fail is set.
    """
    self.info('start checking for clock skews')
    found = {}
    checked_once = False
    timeout_start = None

    while not self.stopping or (self.at_least_once and not checked_once):

      if self.at_least_once and not checked_once and self.stopping:
        # Stop requested before any usable check: the first pass through
        # here starts the clock, later passes enforce the timeout.
        if timeout_start is None:
          self.info('kicking-off timeout (if any)')
          timeout_start = time.time()
        elif self.at_least_once_timeout > 0.0:
          waited = time.time() - timeout_start
          assert waited < self.at_least_once_timeout, \
              'failed to obtain a timecheck before timeout expired'

      expected_quorum = len(teuthology.get_mon_names(self.ctx))
      self.manager.wait_for_mon_quorum_size(expected_quorum)

      timechecks = self.manager.get_mon_health(True)['timechecks']
      round_status = timechecks['round_status']
      round_done = round_status == 'finished'

      if round_done:
        assert (timechecks['round'] % 2) == 0, \
            'timecheck marked as finished but round disagrees (r {r})'.format(
                r=timechecks['round'])
      else:
        assert round_status == 'on-going', \
            "timecheck status expected 'on-going' but found '{s}' instead".format(
                s=round_status)
        if not ('mons' in timechecks and len(timechecks['mons']) > 1):
          self.info('no timechecks available just yet')
          self.sleep_interval()
          continue
        self.info('round still on-going, but there are available reports')

      reports = timechecks['mons']
      assert len(reports) > 1, \
          'there are not enough reported timechecks; expected > 1 found {n}'.format(
              n=len(reports))

      for report in reports:
        skew = float(report['skew'])
        mon_health = report['health']
        mon_id = report['name']
        magnitude = abs(skew)
        if not (magnitude > self.max_skew):
          continue

        assert mon_health == 'HEALTH_WARN', \
            "mon.{id} health is '{health}' but skew {s} > max {ms}".format(
                id=mon_id, health=mon_health, s=magnitude, ms=self.max_skew)

        summary = 'mon.{id} with skew {s} > max {ms}'.format(
            id=mon_id, s=magnitude, ms=self.max_skew)

        # add to skew list (the signed value is what gets stored)
        found[mon_id] = {'skew': skew, 'details': report['details']}

        if self.expect_skew:
          self.info('expected skew: {str}'.format(str=summary))
        else:
          self.warn('unexpected skew: {str}'.format(str=summary))

      if round_done or (self.expect_skew and len(found) > 0):
        checked_once = True
        self.print_skews(found)
      self.sleep_interval()

    total = len(found)
    self.print_skews(found)

    failure = ''
    if self.expect_skew:
      if total == 0:
        failure = 'We were expecting a skew, but none was found!'
    elif total > 0:
      failure = 'We were not expecting a skew, but we did find it!'

    if failure:
      self.info(failure)
      if not self.never_fail:
        assert False, failure
示例#9
0
def _get_mons(ctx):
    """
    Return the ids of the monitors listed by teuthology.get_mon_names(ctx).

    'mon.a' gives 'a'.  A cluster-qualified role ('ceph.mon.a') loses its
    '<cluster>.' part first; a fixed four-character slice would otherwise
    leave '.mon.a'.
    """
    prefix = 'mon.'
    mons = []
    for name in teuthology.get_mon_names(ctx):
        if not name.startswith(prefix):
            # NOTE(review): assumes '<cluster>.mon.<id>' for roles that do
            # not start with 'mon.' -- confirm against get_mon_names().
            name = name.partition('.')[2]
        mons.append(name[len(prefix):])
    return mons
def task(ctx, config):
    """
    Test monitor recovery.

    Phases, each followed by a wait for the expected quorum size:

    1. check that every monitor reports 'leader' or 'peon' and sees a
       quorum as large as the monitor list
    2. kill and revive each monitor in turn
    3. kill all monitors and revive them one by one, first in forward and
       then in reverse order
    4. take one monitor out, generate some cluster log traffic and bring it
       back; done for the first two monitors of the reverse ordering

    ``config`` must be a dict or None (treated as an empty dict).
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    # keys() instead of iterkeys(): works on both Python 2 and Python 3.
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    mons = []
    for role in teuthology.get_mon_names(ctx):
        if not role.startswith('mon.'):
            # Cluster-qualified role ('<cluster>.mon.<id>'): drop the
            # cluster part, otherwise the id would come out as 'mon'.
            # NOTE(review): role format assumed -- confirm.
            role = role.partition('.')[2]
        mons.append(role.split('.')[1])
    log.info("mon ids = %s" % mons)

    manager.wait_for_mon_quorum_size(len(mons))

    log.info('verifying all monitors are in the quorum')
    for m in mons:
        s = manager.get_mon_status(m)
        assert s['state'] == 'leader' or s['state'] == 'peon'
        assert len(s['quorum']) == len(mons)

    log.info('restarting each monitor in turn')
    for m in mons:
        # stop a monitor
        manager.kill_mon(m)
        manager.wait_for_mon_quorum_size(len(mons) - 1)

        # restart
        manager.revive_mon(m)
        manager.wait_for_mon_quorum_size(len(mons))

    # in forward and reverse order,
    # The reverse ordering must be a copy: reversing an alias of ``mons``
    # in place made both passes run in the same (reversed) order.
    rmons = list(reversed(mons))
    for order in (mons, rmons):
        log.info('stopping all monitors')
        for m in order:
            manager.kill_mon(m)

        log.info('forming a minimal quorum for %s, then adding monitors' % order)
        # Floor division keeps the majority threshold an integer on
        # Python 3 as well ('/' would give e.g. 2.5 for three monitors).
        qnum = (len(order) // 2) + 1
        num = 0
        for m in order:
            manager.revive_mon(m)
            num += 1
            if num >= qnum:
                manager.wait_for_mon_quorum_size(num)

    # on both leader and non-leader ranks...
    # The previous loop used to leave ``mons`` bound to the reverse
    # ordering; keep indexing that ordering so the same monitors are picked.
    ranked = rmons
    for rank in [0, 1]:
        # take one out
        log.info('removing mon %s' % ranked[rank])
        manager.kill_mon(ranked[rank])
        manager.wait_for_mon_quorum_size(len(ranked) - 1)

        log.info('causing some monitor log activity')
        m = 30
        for n in range(1, m):
            manager.raw_cluster_cmd('log', '%d of %d' % (n, m))

        log.info('adding mon %s back in' % ranked[rank])
        manager.revive_mon(ranked[rank])
        manager.wait_for_mon_quorum_size(len(ranked))
示例#11
0
    def do_check(self):
        """
        Poll the monitors' timechecks for clock skews until told to stop.

        Each pass waits for the monitor quorum, reads the 'timechecks' list
        of the health report and records every monitor whose absolute skew
        exceeds self.max_skew.  The loop runs while self.stopping is false;
        with self.at_least_once set it keeps going after a stop request
        until a non-empty timecheck list was seen.  When
        self.at_least_once_timeout is positive, that extra wait is bounded
        by the timeout, measured from the start of this call.

        Raises AssertionError when the recorded skews contradict
        self.expect_skew, unless self.never_fail is set.
        """
        self.info("start checking for clock skews")
        skews = dict()
        ran_once = False
        started_on = time.time()

        while not self.stopping or (self.at_least_once and not ran_once):

            if self.at_least_once and not ran_once and self.stopping:
                if self.at_least_once_timeout > 0.0:
                    assert (
                        time.time() - started_on < self.at_least_once_timeout
                    ), "failed to obtain a timecheck before timeout expired"

            quorum_size = len(teuthology.get_mon_names(self.ctx))
            self.manager.wait_for_mon_quorum_size(quorum_size)

            health = self.manager.get_mon_health(True)
            timechecks = health["timechecks"]
            for timecheck in timechecks:
                mon_skew = float(timecheck["skew"])
                mon_health = timecheck["health"]
                mon_id = timecheck["name"]
                # Compare the magnitude: a negative skew is still a skew.
                skew_size = abs(mon_skew)
                if skew_size > self.max_skew:
                    # 'health' must be passed to format(); without it a
                    # failing check raised KeyError instead of showing
                    # this message.
                    assert mon_health == "HEALTH_WARN", (
                        "mon.{id} health is '{health}' but skew {s} > max {ms}".format(
                            id=mon_id, health=mon_health, s=skew_size, ms=self.max_skew
                        )
                    )

                    log_str = "mon.{id} with skew {s} > max {ms}".format(
                        id=mon_id, s=skew_size, ms=self.max_skew
                    )

                    # add to skew list (the signed value is what gets stored)
                    details = timecheck["details"]
                    skews[mon_id] = {"skew": mon_skew, "details": details}

                    if self.expect_skew:
                        self.info("expected skew: {str}".format(str=log_str))
                    else:
                        self.warn("unexpected skew: {str}".format(str=log_str))

            if not timechecks:
                self.info("no timechecks available just yet")
            else:
                ran_once = True

            if self.check_interval > 0.0:
                time.sleep(self.check_interval)

        total = len(skews)
        if total > 0:
            self.info("---------- found {n} skews ----------".format(n=total))
            # items() exists on both Python 2 and 3; iteritems() does not.
            for mon_id, values in skews.items():
                self.info("mon.{id}: {v}".format(id=mon_id, v=values))
            self.info("-------------------------------------")
        else:
            self.info("---------- no skews were found ----------")

        error_str = ""
        found_error = False

        if self.expect_skew:
            if total == 0:
                error_str = "We were expecting a skew, but none was found!"
                found_error = True
        else:
            if total > 0:
                error_str = "We were not expecting a skew, but we did find it!"
                found_error = True

        if found_error:
            self.info(error_str)
            if not self.never_fail:
                # Raised explicitly so the failure survives ``python -O``,
                # which strips assert statements.
                raise AssertionError(error_str)
示例#12
0
def _get_mons(ctx):
    """
    Return the ids of the monitors listed by teuthology.get_mon_names(ctx).

    'mon.a' gives 'a'.  A cluster-qualified role ('ceph.mon.a') loses its
    '<cluster>.' part first; a fixed four-character slice would otherwise
    leave '.mon.a'.
    """
    prefix = 'mon.'
    mons = []
    for name in teuthology.get_mon_names(ctx):
        if not name.startswith(prefix):
            # NOTE(review): assumes '<cluster>.mon.<id>' for roles that do
            # not start with 'mon.' -- confirm against get_mon_names().
            name = name.partition('.')[2]
        mons.append(name[len(prefix):])
    return mons
    def do_check(self):
        """
        Clock skew checker.  Loops until self.stopping becomes true.

        Every pass waits for the monitor quorum, fetches the 'timechecks'
        section of the health report and remembers each monitor whose
        absolute skew is above self.max_skew.  With self.at_least_once set
        the loop continues after a stop request until one usable check was
        made; self.at_least_once_timeout (when positive) bounds that wait.

        Asserts when the remembered skews contradict self.expect_skew,
        unless self.never_fail is set.
        """
        self.info('start checking for clock skews')
        skewed = {}
        have_result = False
        stop_seen_at = None

        while not self.stopping or (self.at_least_once and not have_result):

            if self.at_least_once and not have_result and self.stopping:
                # First pass after the stop request only records the time;
                # the timeout is enforced from the next pass on.
                if stop_seen_at is None:
                    self.info('kicking-off timeout (if any)')
                    stop_seen_at = time.time()
                elif self.at_least_once_timeout > 0.0:
                    elapsed = time.time() - stop_seen_at
                    assert elapsed < self.at_least_once_timeout, \
                        'failed to obtain a timecheck before timeout expired'

            mon_count = len(teuthology.get_mon_names(self.ctx))
            self.manager.wait_for_mon_quorum_size(mon_count)

            report = self.manager.get_mon_health(True)
            timechecks = report['timechecks']
            status = timechecks['round_status']

            finished = (status == 'finished')
            if finished:
                assert (timechecks['round'] % 2) == 0, (
                    'timecheck marked as finished but round '
                    'disagrees (r {r})'.format(r=timechecks['round']))
            else:
                assert status == 'on-going', (
                    "timecheck status expected 'on-going' "
                    "but found '{s}' instead".format(s=status))
                usable = 'mons' in timechecks and len(timechecks['mons']) > 1
                if not usable:
                    self.info('no timechecks available just yet')
                    self.sleep_interval()
                    continue
                self.info('round still on-going, but there are available reports')

            entries = timechecks['mons']
            assert len(entries) > 1, (
                'there are not enough reported timechecks; '
                'expected > 1 found {n}'.format(n=len(entries)))

            for entry in entries:
                signed_skew = float(entry['skew'])
                entry_health = entry['health']
                entry_name = entry['name']
                skew_size = abs(signed_skew)
                if skew_size > self.max_skew:
                    assert entry_health == 'HEALTH_WARN', (
                        "mon.{id} health is '{health}' but skew {s} > max {ms}".format(
                            id=entry_name, health=entry_health,
                            s=skew_size, ms=self.max_skew))

                    description = 'mon.{id} with skew {s} > max {ms}'.format(
                        id=entry_name, s=skew_size, ms=self.max_skew)

                    # add to skew list
                    skewed[entry_name] = {
                        'skew': signed_skew,
                        'details': entry['details'],
                    }

                    report_fn = self.info if self.expect_skew else self.warn
                    label = 'expected skew: {str}' if report_fn == self.info \
                        else 'unexpected skew: {str}'
                    report_fn(label.format(str=description))

            if finished or (self.expect_skew and len(skewed) > 0):
                have_result = True
                self.print_skews(skewed)
            self.sleep_interval()

        total = len(skewed)
        self.print_skews(skewed)

        if self.expect_skew:
            mismatch = (total == 0)
            error_str = 'We were expecting a skew, but none was found!'
        else:
            mismatch = (total > 0)
            error_str = 'We were not expecting a skew, but we did find it!'

        if mismatch:
            self.info(error_str)
            if not self.never_fail:
                assert False, error_str