Exemplo n.º 1
0
    def _safe_create(self, rsrc_id, path, data):
        """Create an ephemeral node in Zookeeper, tolerating an existing one.

        If the node already exists, its owner session id is compared with
        the session id of this client:

        - owned by another session: a watch is placed on the node through
          ``self._watch`` and the call fails;
        - owned by this session: the node content is rewritten when it
          differs from ``data``.

        :param rsrc_id:
            Resource id, forwarded to ``self._watch``.
        :param path:
            Zookeeper path of the node.
        :param data:
            Content to store in the node.
        :returns:
            ``True`` if the node was created or is owned by this session,
            ``False`` if it is owned by another session.
        """
        try:
            zkutils.create(self.zkclient, path, data, ephemeral=True)
        except kazoo.client.NodeExistsError:
            existing, stat = zkutils.get_with_metadata(self.zkclient, path)
            our_session, _pwd = self.zkclient.client_id
            owner = stat.owner_session_id

            if owner != our_session:
                # Somebody else holds the node: watch it and report failure.
                _LOGGER.info('Node exists, owned by other: %s - %s - %s', path,
                             existing, owner)
                self._watch(rsrc_id, path)
                return False

            # The node is ours; refresh it only if the payload changed.
            if existing != data:
                _LOGGER.info('Content different: %s - old: %s, new: %s', path,
                             existing, data)
                zkutils.update(self.zkclient, path, data)

            _LOGGER.info('Node is up to date: %s - %s', path, our_session)
        else:
            _LOGGER.info('Created node: %s', path)

        return True
Exemplo n.º 2
0
def _node_initialize(tm_env, runtime, zkclient, hostname,
                     zk_server_path, zk_presence_path):
    """Node initialization. Should only be done on a cold start.

    Collects the local node information (including detected traits), merges
    it over the data already stored at ``zk_server_path``, writes the result
    back and creates the ephemeral presence node at ``zk_presence_path``
    with a host ACL. On posix systems iptables is then initialized with the
    node's external IP.

    Any exception is logged and the Zookeeper client is stopped; the
    exception is not re-raised.
    """
    try:
        local_info = sysinfo.node_info(tm_env, runtime)
        local_info['traits'] = traits.detect(
            zkutils.get(zkclient, z.path.traits())
        )

        # Overlay the freshly collected data on top of the scheduler data.
        merged_info = zkutils.get(zkclient, zk_server_path)
        merged_info.update(local_info)
        _LOGGER.info('Registering node: %s: %s, %r',
                     zk_server_path, hostname, merged_info)
        zkutils.update(zkclient, zk_server_path, merged_info)

        presence_acl = zkutils.make_host_acl(hostname, 'rwcda')
        _LOGGER.debug('host_acl: %r', presence_acl)
        zkutils.put(
            zkclient,
            zk_presence_path,
            {'seen': False},
            acl=[presence_acl],
            ephemeral=True
        )

        # TODO: Fix the network initialization. Then the below can be part of
        # appenv.initialize()
        if os.name == 'posix':
            # It is assumed that only Treadmill manages the iptables nat and
            # mangle tables: flush all their rules and bulk load all the
            # Treadmill static rules.
            external_ip = merged_info['network']['external_ip']
            iptables.initialize(external_ip)

    except Exception:  # pylint: disable=W0703
        _LOGGER.exception('Node initialization failed')
        zkclient.stop()
Exemplo n.º 3
0
def _node_initialize(tm_env, runtime, zkclient, hostname, zk_server_path,
                     zk_presence_path):
    """Node initialization. Should only be done on a cold start.

    Collects the local node information, merges it over the data already
    stored at ``zk_server_path``, writes the result back, creates the
    ephemeral presence node at ``zk_presence_path`` with a host ACL and
    finally runs ``tm_env.initialize`` with the merged data.

    Any exception is logged and the Zookeeper client is stopped; the
    exception is not re-raised.
    """
    try:
        local_info = sysinfo.node_info(tm_env, runtime)

        # Overlay the freshly collected data on top of the scheduler data.
        merged_info = zkutils.get(zkclient, zk_server_path)
        merged_info.update(local_info)
        _LOGGER.info('Registering node: %s: %s, %r', zk_server_path, hostname,
                     merged_info)
        zkutils.update(zkclient, zk_server_path, merged_info)

        presence_acl = zkutils.make_host_acl(hostname, 'rwcda')
        _LOGGER.debug('host_acl: %r', presence_acl)
        zkutils.put(
            zkclient,
            zk_presence_path,
            {'seen': False},
            acl=[presence_acl],
            ephemeral=True
        )

        # Local node initialization runs with the merged node data.
        tm_env.initialize(merged_info)

    except Exception:  # pylint: disable=W0703
        _LOGGER.exception('Node initialization failed')
        zkclient.stop()
Exemplo n.º 4
0
    def test_update_check_content(self):
        """Verifies put/update with check_content=True."""
        client_cls = treadmill.zkutils.ZkClient
        client_cls.get.return_value = (b'aaa', {})
        client = client_cls()

        # Updating with the value already returned by get must not call set.
        zkutils.update(client, '/a', 'aaa', check_content=True)
        self.assertFalse(client_cls.set.called)

        # A different value must be written, as bytes.
        zkutils.update(client, '/a', 'bbb', check_content=True)
        client_cls.set.assert_called_with('/a', b'bbb')
Exemplo n.º 5
0
    def test_update_check_content(self):
        """Verifies put/update with check_content=True."""
        client_cls = kazoo.client.KazooClient
        client_cls.get.return_value = ('aaa', {})
        client = client_cls()

        # Updating with the value already returned by get must not call set.
        zkutils.update(client, '/a', 'aaa', check_content=True)
        self.assertFalse(client_cls.set.called)

        # A different value must be written.
        zkutils.update(client, '/a', 'bbb', check_content=True)
        client_cls.set.assert_called_with('/a', 'bbb')
Exemplo n.º 6
0
def register_server(zkclient, hostname, node_info):
    """Register server.

    Merges ``node_info`` over the data already stored in the server node of
    ``hostname``, writes the result back, and creates an ephemeral,
    sequential presence node protected by a host ACL.

    :returns:
        Whatever ``zkutils.put`` returns for the presence node.
    """
    node = z.path.server(hostname)

    merged = zkutils.get(zkclient, node)
    merged.update(node_info)
    _LOGGER.info('Registering server %s: %r', hostname, merged)
    zkutils.update(zkclient, node, merged)

    presence_acl = [zkutils.make_host_acl(hostname, 'rwcda')]
    # The trailing '#' separates the hostname from the sequence suffix.
    presence_node = z.path.server_presence(hostname + '#')
    return zkutils.put(
        zkclient,
        presence_node,
        {'seen': False},
        acl=presence_acl,
        ephemeral=True,
        sequence=True
    )
Exemplo n.º 7
0
def update_server_parent(zkclient, server_id, parent_id):
    """Update server parent.

    Stores ``parent_id`` under the ``'parent'`` key of the server node and
    creates a ``'servers'`` event only if the stored content changed.
    """
    server_node = z.path.server(server_id)
    server_data = zkutils.get(zkclient, server_node)
    server_data['parent'] = parent_id

    changed = zkutils.update(zkclient, server_node, server_data,
                             check_content=True)
    if not changed:
        return

    create_event(zkclient, 0, 'servers', [server_id])
Exemplo n.º 8
0
def update_server_features(zkclient, server_id, features):
    """Updates server features.

    Stores ``features`` under the ``'features'`` key of the server node and
    creates a ``'servers'`` event only if the stored content changed.
    """
    server_node = z.path.server(server_id)
    server_data = zkutils.get(zkclient, server_node)
    server_data['features'] = features

    changed = zkutils.update(zkclient, server_node, server_data,
                             check_content=True)
    if not changed:
        return

    create_event(zkclient, 0, 'servers', [server_id])
Exemplo n.º 9
0
def update_server_attrs(zkclient, server_id, partition):
    """Updates server partition.

    Stores ``partition`` under the ``'partition'`` key of the server node
    and creates a ``'servers'`` event only if the stored content changed.
    """
    server_node = z.path.server(server_id)
    server_data = zkutils.get(zkclient, server_node)
    server_data['partition'] = partition

    changed = zkutils.update(zkclient, server_node, server_data,
                             check_content=True)
    if not changed:
        return

    create_event(zkclient, 0, 'servers', [server_id])
Exemplo n.º 10
0
def _node_initialize(tm_env, zkclient, hostname, zk_server_path,
                     zk_presence_path):
    """Node initialization. Should only be done on a cold start.

    Initializes ``tm_env``, collects the local node information, merges it
    over the data already stored at ``zk_server_path``, writes the result
    back and creates the ephemeral presence node at ``zk_presence_path``
    with a host ACL.
    """
    tm_env.initialize()
    local_info = sysinfo.node_info(tm_env)

    # XXX: Why a get/update dance instead of set
    merged_info = zkutils.get(zkclient, zk_server_path)
    merged_info.update(local_info)
    _LOGGER.info('Registering node: %s: %s, %r', zk_server_path, hostname,
                 merged_info)
    zkutils.update(zkclient, zk_server_path, merged_info)

    presence_acl = zkutils.make_host_acl(hostname, 'rwcda')
    _LOGGER.debug('host_acl: %r', presence_acl)
    zkutils.put(
        zkclient,
        zk_presence_path,
        {'seen': False},
        acl=[presence_acl],
        ephemeral=True
    )
Exemplo n.º 11
0
def update_server_capacity(zkclient, server_id,
                           memory=None, cpu=None, disk=None):
    """Update server capacity.

    Only the capacity values that are truthy are written into the server
    node data; the others are left as stored. A ``'servers'`` event is
    created only if the stored content changed.
    """
    server_node = z.path.server(server_id)
    server_data = zkutils.get(zkclient, server_node)

    requested = (('memory', memory), ('cpu', cpu), ('disk', disk))
    for field, value in requested:
        if value:
            server_data[field] = value

    changed = zkutils.update(zkclient, server_node, server_data,
                             check_content=True)
    if not changed:
        return

    create_event(zkclient, 0, 'servers', [server_id])
Exemplo n.º 12
0
def update_app_priorities(zkclient, updates):
    """Updates app priority.

    :param zkclient:
        Zookeeper client.
    :param updates:
        Mapping of app id to the new priority; each priority is asserted
        to be within 0..100.

    Apps that do not exist are skipped. A single ``'apps'`` event listing
    the apps whose stored content changed is created, if there are any.
    """
    changed_apps = []
    for app_id, priority in six.iteritems(updates):
        assert 0 <= priority <= 100

        app = get_app(zkclient, app_id)
        if app is not None:
            app['priority'] = priority
            was_changed = zkutils.update(zkclient, _app_node(app_id), app,
                                         check_content=True)
            if was_changed:
                changed_apps.append(app_id)

    if not changed_apps:
        return

    create_event(zkclient, 1, 'apps', changed_apps)
Exemplo n.º 13
0
 def update(self, path, data, check_content=False):
     """Set data into ZK node.

     :raises backend.ObjectNotFoundError:
         If the node does not exist.
     """
     client = self.zkclient
     try:
         zkutils.update(client, path, data, check_content)
     except kazoo.client.NoNodeError:
         # Translate the Zookeeper error into the backend error.
         raise backend.ObjectNotFoundError()
Exemplo n.º 14
0
def reevaluate(api_url, alert_f, state, zkclient, last_waited):
    """Evaluate state and adjust app count based on monitor.

    For every monitor the available start budget is replenished, then the
    number of scheduled instances is compared with the monitor's target
    count: missing instances are requested through the REST API (limited by
    the available budget) and surplus instances are deleted according to the
    monitor's scale policy.

    :param api_url:
        API endpoint, passed to ``restclient.post`` as a one-element list.
    :param alert_f:
        Callable invoked as ``alert_f(name, message)`` when a monitor is
        suspended or rate limited, and as
        ``alert_f(name, message, status='clear')`` when it is active again.
    :param state:
        Mapping with the keys ``'scheduled'``, ``'monitors'`` and
        ``'suspended'``. ``state['suspended']`` is modified in place. The
        other two are shallow-copied, so the per-monitor configuration
        objects are still updated in place (``'available'`` and
        ``'last_update'``).
    :param zkclient:
        Zookeeper client handed to ``zkutils.update``.
    :param last_waited:
        Container of monitor names, tested with ``in``. Presumably the value
        returned by the previous call - verify against the caller.
    :returns:
        ``dict`` of monitor name to timestamp: the rate limited monitors
        with their estimated wait deadline, merged with all entries of the
        suspended dict.
    """
    # Disable too many branches/statements warning.
    #
    # pylint: disable=R0912
    # pylint: disable=R0915
    grouped = dict(state['scheduled'])
    monitors = dict(state['monitors'])

    # Do not create a copy, suspended is accessed by ref.
    suspended = state['suspended']
    waited = {}
    # Set whenever the inactive set changed (and after a successful delete);
    # it gates the write to Zookeeper at the end.
    modified = False

    now = time.time()

    # remove outdated information in suspended dict
    extra = six.viewkeys(suspended) - six.viewkeys(monitors)
    for name in extra:
        suspended.pop(name, None)
        modified = True

    # Increase available tokens.
    for name, conf in six.iteritems(monitors):

        if suspended.get(name, 0) > now:
            _LOGGER.debug('Ignoring app %s - suspended.', name)
            continue

        # Either app is not suspended or it is past-due - remove it from
        # suspended dict.
        if suspended.pop(name, None) is not None:
            alert_f(name, 'Monitor active again', status='clear')
            modified = True

        # Max value reached, nothing to do.
        # The budget is capped at twice the target count and grows by
        # rate * elapsed seconds since the last update.
        max_value = conf['count'] * 2
        available = conf['available']
        if available < max_value:
            delta = conf['rate'] * (now - conf['last_update'])
            conf['available'] = min(available + delta, max_value)

        conf['last_update'] = now

    # Second pass: reconcile the scheduled instance count with the target.
    for name, conf in six.iteritems(monitors):

        if suspended.get(name, 0) > now:
            _LOGGER.debug('Monitor is suspended for: %s.', name)
            continue

        count = conf['count']
        available = conf['available']

        current_count = len(grouped.get(name, []))
        _LOGGER.debug('App: %r current: %d, target %d', name, current_count,
                      count)

        if count == current_count:
            continue

        elif count > current_count:
            # Scale up, limited by the whole tokens currently available.
            needed = count - current_count
            allowed = int(min(needed, math.floor(available)))
            _LOGGER.debug('%s => need %d, allow %d', name, needed, allowed)
            if allowed <= 0:
                # needed >= 1 here, so allowed <= 0 means available < 1.
                # Estimated wait: now + seconds until one whole token is
                # available at the configured rate.
                waited[name] = now + int((1 - available) / conf['rate'])
                # new wait item, need modify
                if name not in last_waited:
                    alert_f(name, 'Monitor suspended: Rate limited')
                    modified = True

                continue

            try:
                # scheduled, remove app from waited list
                _scheduled = restclient.post(
                    [api_url],
                    '/instance/{}?count={}'.format(name, allowed),
                    payload={},
                    headers={'X-Treadmill-Trusted-Agent': 'monitor'})

                if name in last_waited:
                    # this means app jump out of wait, need to clear it from zk
                    alert_f(name, 'Monitor active again', status='clear')
                    modified = True

                # Tokens are only consumed when the request succeeded.
                conf['available'] -= allowed
            except restclient.NotFoundError:
                _LOGGER.info('App not configured: %s', name)
                suspended[name] = now + _DELAY_INTERVAL
                alert_f(name, 'Monitor suspended: App not configured')
                modified = True
            except restclient.BadRequestError:
                _LOGGER.exception('Unable to start: %s', name)
                suspended[name] = now + _DELAY_INTERVAL
                alert_f(name, 'Monitor suspended: Unable to start')
                modified = True
            except restclient.ValidationError:
                _LOGGER.exception('Invalid manifest: %s', name)
                suspended[name] = now + _DELAY_INTERVAL
                alert_f(name, 'Monitor suspended: Invalid manifest')
                modified = True
            except Exception:  # pylint: disable=W0703
                # Any other failure is only logged; the monitor is neither
                # suspended nor charged tokens.
                _LOGGER.exception('Unable to create instances: %s: %s', name,
                                  needed)

        elif count < current_count:
            # Scale down: pick the surplus instances according to the policy.
            extra = []
            policy = conf.get('policy')
            if policy is None:
                policy = 'fifo'

            if policy == 'fifo':
                # The first (current_count - count) entries of the list.
                extra = grouped[name][:current_count - count]
            elif policy == 'lifo':
                # Negative start index: the last (current_count - count)
                # entries of the list.
                extra = grouped[name][count - current_count:]
            else:
                _LOGGER.warning('Invalid scale policy: %s', policy)
                continue

            try:
                response = restclient.post(
                    [api_url],
                    '/instance/_bulk/delete',
                    payload=dict(instances=list(extra)),
                    headers={'X-Treadmill-Trusted-Agent': 'monitor'})
                _LOGGER.info('deleted: %r - %s', extra, response)

                # this means we reduce the count number, no need to wait
                modified = True

            except Exception:  # pylint: disable=W0703
                _LOGGER.exception('Unable to delete instances: %r', extra)

    # total inactive means
    # Suspended entries are merged in last, so they override a rate-limit
    # entry recorded under the same name.
    waited.update(suspended)
    if modified:
        _LOGGER.info('Updating suspended app monitors')
        zkutils.update(zkclient, z.path.appmonitor(), waited)

    return waited