class IKazooClient(metaclass=Singleton):

    # zk_client = None

    def __init__(self) -> None:
        super().__init__()
        self.zk_client = KazooClient(hosts=ZK_PATH)
        self.zk_client.start()

    def get_kazoo_client(self):
        """这里加异常捕捉 防止本服务不可以用"""
        while self.zk_client is None or (self.zk_client.state in [
                KeeperState.CLOSED, KeeperState.AUTH_FAILED, KazooState.LOST
        ]):
            try:
                if self.zk_client.state in [
                        KeeperState.CLOSED, KeeperState.AUTH_FAILED,
                        KazooState.LOST
                ]:
                    print("ERROR : zookeeper  尝试重连")
                    self.zk_client.restart()
                    # 重新注册服务
                    # re_register()
                time.sleep(1)
            except Exception:
                print(self.zk_client)
                if self.zk_client is not None:
                    print(self.zk_client.state)
                print(
                    "ERROR : zookeeper  连接失败  socket 节点选取将不可以用!!!!!!!!!!!!!!!!!!!"
                )
        return self.zk_client
示例#2
0
class zkUtils:
    """Zookeeper utility class implemented with Kazoo"""
    def __init__(self,
                 hosts=DEFAULT_ZK,
                 read_only=True,
                 logging_level=logging.INFO):
        self.zc = KazooClient(hosts=hosts, read_only=read_only)
        self.logger = get_logger(LOGGER_NAME, logging_level)
        self.logger.info(
            "Zookeeper hosts: {}, client util instantiated.".format(hosts))

    def get_path_data(self, path="/"):
        try:
            self.zc.restart()
            if self.zc.exists(path):
                return self.zc.get(path)[0]
            else:
                return None
        except Exception as e:
            self.logger.exception("Get path data exception: {}!".format(
                str(e)))
            return None

    def get_children_list(self, path="/"):
        try:
            self.zc.restart()
            if self.zc.exists(path):
                return self.zc.get_children(path)
            else:
                return None
        except Exception as e:
            self.logger.exception("Get path data exception: {}!".format(
                str(e)))
            return None
示例#3
0
class ZKBase(object):
    """
    scheduler需要定义以下函数:
    - callback(KazooState.*): 处理CONN: LOST,SUSPENDED
    - 
    """

    def __init__(self,conf,scheduler):
        self.conf = conf
        self.state = "STOPPED" 
        self.zk = KazooClient(self.conf.processor.zk_url, timeout=self.conf.processor.timeout)
        try:
            self.zk.start()
        except Exception:
            LOG.error("Start zk error: %s" % traceback.format_exc(limit=2))
        self.zk.add_listener(self.listen)
        self.scheduler = scheduler
        self.state = "RUNNING" # STOPPED
        self.state_changes = False
        
    def listen(self, state):
        if state == KazooState.LOST or state == KazooState.SUSPENDED:
            LOG.error("Session state change: %s" % state)
            if self.state == "STOPPED":
                return
            self.state_changes = True
            self.scheduler.callback(state)
        elif state == KazooState.CONNECTED:
            LOG.info("Connectied to ZK.")
        else:
            LOG.error("Session timeout. Cannot conect to ZK.")

    def reinit(self,state):
        while self.zk.state != KazooState.CONNECTED:
            LOG.info("Restart zk connection until connected.")
            try:
                self.zk.restart()
            except Exception:
                LOG.error("Reinit: %s" % traceback.format_exc(limit=1))
        self.state_changes = False

    def create(self,path, value='', acl=None, ephemeral=False, sequence=False, makepath=False):
        LOG.info("Create: path %s" % path)
        self.zk.create(path=path,
                        value=value.encode(),
                        ephemeral=ephemeral,
                        sequence=sequence,
                        makepath=makepath)

    def get(self,path,watch=None):
        self.zk.get(path,watch)

    def get_children(self,path,watch=None):
        return self.zk.get_children(path,watch=watch)

    def terminate(self):
        LOG.info("Terminate ZK connection.")
        self.state = "STOPPED"
        self.zk.stop()
        self.zk.close()
示例#4
0
class ZooKeeper(AbstractDCS):

    def __init__(self, config):
        super(ZooKeeper, self).__init__(config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        self._client = KazooClient(hosts, handler=PatroniSequentialThreadingHandler(config['retry_timeout']),
                                   timeout=config['ttl'], connection_retry={'max_delay': 1, 'max_tries': -1},
                                   command_retry={'deadline': config['retry_timeout'], 'max_delay': 1, 'max_tries': -1})
        self._client.add_listener(self.session_listener)

        self._my_member_data = None
        self._fetch_cluster = True
        self._last_leader_operation = 0

        self._orig_kazoo_connect = self._client._connection._connect
        self._client._connection._connect = self._kazoo_connect

        self._client.start()

    def _kazoo_connect(self, host, port):

        """Kazoo is using Ping's to determine health of connection to zookeeper. If there is no
        response on Ping after Ping interval (1/2 from read_timeout) it will consider current
        connection dead and try to connect to another node. Without this "magic" it was taking
        up to 2/3 from session timeout (ttl) to figure out that connection was dead and we had
        only small time for reconnect and retry.

        This method is needed to return different value of read_timeout, which is not calculated
        from negotiated session timeout but from value of `loop_wait`. And it is 2 sec smaller
        than loop_wait, because we can spend up to 2 seconds when calling `touch_member()` and
        `write_leader_optime()` methods, which also may hang..."""

        ret = self._orig_kazoo_connect(host, port)
        return max(self.loop_wait - 2, 2)*1000, ret[1]

    def session_listener(self, state):
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def cluster_watcher(self, event):
        self._fetch_cluster = True
        self.event.set()

    def reload_config(self, config):
        self.set_retry_timeout(config['retry_timeout'])

        loop_wait = config['loop_wait']

        loop_wait_changed = self._loop_wait != loop_wait
        self._loop_wait = loop_wait
        self._client.handler.set_connect_timeout(loop_wait)

        # We need to reestablish connection to zookeeper if we want to change
        # read_timeout (and Ping interval respectively), because read_timeout
        # is calculated in `_kazoo_connect` method. If we are changing ttl at
        # the same time, set_ttl method will reestablish connection and return
        # `!True`, otherwise we will close existing connection and let kazoo
        # open the new one.
        if not self.set_ttl(int(config['ttl'] * 1000)) and loop_wait_changed:
            self._client._connection._socket.close()

    def set_ttl(self, ttl):
        """It is not possible to change ttl (session_timeout) in zookeeper without
        destroying old session and creating the new one. This method returns `!True`
        if session_timeout has been changed (`restart()` has been called)."""
        if self._client._session_timeout != ttl:
            self._client._session_timeout = ttl
            self._client.restart()
            return True

    def set_retry_timeout(self, retry_timeout):
        self._client._retry.deadline = retry_timeout

    def get_node(self, key, watch=None):
        try:
            ret = self._client.get(key, watch)
            return (ret[0].decode('utf-8'), ret[1])
        except NoNodeError:
            return None

    @staticmethod
    def member(name, value, znode):
        return Member.from_node(znode.version, name, znode.ephemeralOwner, value)

    def get_children(self, key, watch=None):
        try:
            return self._client.get_children(key, watch)
        except NoNodeError:
            return []

    def load_members(self):
        members = []
        for member in self.get_children(self.members_path, self.cluster_watcher):
            data = self.get_node(self.members_path + member)
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        self._fetch_cluster = False
        self.event.clear()
        nodes = set(self.get_children(self.client_path(''), self.cluster_watcher))
        if not nodes:
            self._fetch_cluster = True

        # get initialize flag
        initialize = (self.get_node(self.initialize_path) or [None])[0] if self._INITIALIZE in nodes else None

        # get global dynamic configuration
        config = self.get_node(self.config_path, watch=self.cluster_watcher) if self._CONFIG in nodes else None
        config = config and ClusterConfig.from_node(config[1].version, config[0], config[1].mzxid)

        # get list of members
        members = self.load_members() if self._MEMBERS[:-1] in nodes else []

        # get leader
        leader = self.get_node(self.leader_path) if self._LEADER in nodes else None
        if leader:
            client_id = self._client.client_id
            if leader[0] == self._name and client_id is not None and client_id[0] != leader[1].ephemeralOwner:
                logger.info('I am leader but not owner of the session. Removing leader node')
                self._client.delete(self.leader_path)
                leader = None

            if leader:
                member = Member(-1, leader[0], None, {})
                member = ([m for m in members if m.name == leader[0]] or [member])[0]
                leader = Leader(leader[1].version, leader[1].ephemeralOwner, member)
                self._fetch_cluster = member.index == -1

        # failover key
        failover = self.get_node(self.failover_path, watch=self.cluster_watcher) if self._FAILOVER in nodes else None
        failover = failover and Failover.from_node(failover[1].version, failover[0])

        # get last leader operation
        optime = self.get_node(self.leader_optime_path) if self._OPTIME in nodes and self._fetch_cluster else None
        self._last_leader_operation = 0 if optime is None else int(optime[0])
        self._cluster = Cluster(initialize, config, leader, self._last_leader_operation, members, failover)

    def _load_cluster(self):
        if self._fetch_cluster or self._cluster is None:
            try:
                self._client.retry(self._inner_load_cluster)
            except Exception:
                logger.exception('get_cluster')
                self.cluster_watcher(None)
                raise ZooKeeperError('ZooKeeper in not responding properly')

    def _create(self, path, value, **kwargs):
        try:
            self._client.retry(self._client.create, path, value.encode('utf-8'), **kwargs)
            return True
        except:
            return False

    def attempt_to_acquire_leader(self, permanent=False):
        ret = self._create(self.leader_path, self._name, makepath=True, ephemeral=not permanent)
        if not ret:
            logger.info('Could not take out TTL lock')
        return ret

    def set_failover_value(self, value, index=None):
        try:
            self._client.retry(self._client.set, self.failover_path, value.encode('utf-8'), version=index or -1)
            return True
        except NoNodeError:
            return value == '' or (index is None and self._create(self.failover_path, value))
        except:
            logging.exception('set_failover_value')
            return False

    def set_config_value(self, value, index=None):
        try:
            self._client.retry(self._client.set, self.config_path, value.encode('utf-8'), version=index or -1)
            return True
        except NoNodeError:
            return index is None and self._create(self.config_path, value)
        except Exception:
            logging.exception('set_config_value')
            return False

    def initialize(self, create_new=True, sysid=""):
        return self._create(self.initialize_path, sysid, makepath=True) if create_new \
            else self._client.retry(self._client.set, self.initialize_path,  sysid.encode("utf-8"))

    def touch_member(self, data, ttl=None, permanent=False):
        cluster = self.cluster
        member = cluster and ([m for m in cluster.members if m.name == self._name] or [None])[0]
        data = data.encode('utf-8')
        if member and self._client.client_id is not None and member.session != self._client.client_id[0]:
            try:
                self._client.delete_async(self.member_path).get(timeout=1)
            except NoNodeError:
                pass
            except:
                return False
            member = None

        if member:
            if data == self._my_member_data:
                return True
        else:
            try:
                self._client.create_async(self.member_path, data, makepath=True, ephemeral=not permanent).get(timeout=1)
                self._my_member_data = data
                return True
            except Exception as e:
                if not isinstance(e, NodeExistsError):
                    logger.exception('touch_member')
                    return False
        try:
            self._client.set_async(self.member_path, data).get(timeout=1)
            self._my_member_data = data
            return True
        except:
            logger.exception('touch_member')

        return False

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    def write_leader_optime(self, last_operation):
        last_operation = last_operation.encode('utf-8')
        if last_operation != self._last_leader_operation:
            try:
                self._client.set_async(self.leader_optime_path, last_operation).get(timeout=1)
                self._last_leader_operation = last_operation
            except NoNodeError:
                try:
                    self._client.create_async(self.leader_optime_path, last_operation, makepath=True).get(timeout=1)
                    self._last_leader_operation = last_operation
                except:
                    logger.exception('Failed to create %s', self.leader_optime_path)
            except:
                logger.exception('Failed to update %s', self.leader_optime_path)

    def update_leader(self):
        return True

    def delete_leader(self):
        self._client.restart()
        self._my_member_data = None
        return True

    def _cancel_initialization(self):
        node = self.get_node(self.initialize_path)
        if node:
            self._client.delete(self.initialize_path, version=node[1].version)

    def cancel_initialization(self):
        try:
            self._client.retry(self._cancel_initialization)
        except:
            logger.exception("Unable to delete initialize key")

    def delete_cluster(self):
        try:
            return self._client.retry(self._client.delete, self.client_path(''), recursive=True)
        except NoNodeError:
            return True

    def watch(self, timeout):
        if super(ZooKeeper, self).watch(timeout):
            self._fetch_cluster = True
        return self._fetch_cluster
示例#5
0
class ZooKeeper(AbstractDCS):
    def __init__(self, config):
        super(ZooKeeper, self).__init__(config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        mapping = {
            'use_ssl': 'use_ssl',
            'verify': 'verify_certs',
            'cacert': 'ca',
            'cert': 'certfile',
            'key': 'keyfile',
            'key_password': '******'
        }
        kwargs = {v: config[k] for k, v in mapping.items() if k in config}

        self._client = KazooClient(
            hosts,
            handler=PatroniSequentialThreadingHandler(config['retry_timeout']),
            timeout=config['ttl'],
            connection_retry=KazooRetry(max_delay=1,
                                        max_tries=-1,
                                        sleep_func=time.sleep),
            command_retry=KazooRetry(deadline=config['retry_timeout'],
                                     max_delay=1,
                                     max_tries=-1,
                                     sleep_func=time.sleep),
            **kwargs)
        self._client.add_listener(self.session_listener)

        self._fetch_cluster = True
        self._fetch_optime = True

        self._orig_kazoo_connect = self._client._connection._connect
        self._client._connection._connect = self._kazoo_connect

        self._client.start()

    def _kazoo_connect(self, *args):
        """Kazoo is using Ping's to determine health of connection to zookeeper. If there is no
        response on Ping after Ping interval (1/2 from read_timeout) it will consider current
        connection dead and try to connect to another node. Without this "magic" it was taking
        up to 2/3 from session timeout (ttl) to figure out that connection was dead and we had
        only small time for reconnect and retry.

        This method is needed to return different value of read_timeout, which is not calculated
        from negotiated session timeout but from value of `loop_wait`. And it is 2 sec smaller
        than loop_wait, because we can spend up to 2 seconds when calling `touch_member()` and
        `write_leader_optime()` methods, which also may hang..."""

        ret = self._orig_kazoo_connect(*args)
        return max(self.loop_wait - 2, 2) * 1000, ret[1]

    def session_listener(self, state):
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def optime_watcher(self, event):
        self._fetch_optime = True
        self.event.set()

    def cluster_watcher(self, event):
        self._fetch_cluster = True
        self.optime_watcher(event)

    def reload_config(self, config):
        self.set_retry_timeout(config['retry_timeout'])

        loop_wait = config['loop_wait']

        loop_wait_changed = self._loop_wait != loop_wait
        self._loop_wait = loop_wait
        self._client.handler.set_connect_timeout(loop_wait)

        # We need to reestablish connection to zookeeper if we want to change
        # read_timeout (and Ping interval respectively), because read_timeout
        # is calculated in `_kazoo_connect` method. If we are changing ttl at
        # the same time, set_ttl method will reestablish connection and return
        # `!True`, otherwise we will close existing connection and let kazoo
        # open the new one.
        if not self.set_ttl(int(config['ttl'] * 1000)) and loop_wait_changed:
            self._client._connection._socket.close()

    def set_ttl(self, ttl):
        """It is not possible to change ttl (session_timeout) in zookeeper without
        destroying old session and creating the new one. This method returns `!True`
        if session_timeout has been changed (`restart()` has been called)."""
        if self._client._session_timeout != ttl:
            self._client._session_timeout = ttl
            self._client.restart()
            return True

    @property
    def ttl(self):
        return self._client._session_timeout

    def set_retry_timeout(self, retry_timeout):
        retry = self._client.retry if isinstance(
            self._client.retry, KazooRetry) else self._client._retry
        retry.deadline = retry_timeout

    def get_node(self, key, watch=None):
        try:
            ret = self._client.get(key, watch)
            return (ret[0].decode('utf-8'), ret[1])
        except NoNodeError:
            return None

    def get_leader_optime(self, leader):
        watch = self.optime_watcher if not leader or leader.name != self._name else None
        optime = self.get_node(self.leader_optime_path, watch)
        self._fetch_optime = False
        return optime and int(optime[0]) or 0

    @staticmethod
    def member(name, value, znode):
        return Member.from_node(znode.version, name, znode.ephemeralOwner,
                                value)

    def get_children(self, key, watch=None):
        try:
            return self._client.get_children(key, watch)
        except NoNodeError:
            return []

    def load_members(self, sync_standby):
        members = []
        for member in self.get_children(self.members_path,
                                        self.cluster_watcher):
            watch = member in sync_standby and self.cluster_watcher or None
            data = self.get_node(self.members_path + member, watch)
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        self._fetch_cluster = False
        self.event.clear()
        nodes = set(
            self.get_children(self.client_path(''), self.cluster_watcher))
        if not nodes:
            self._fetch_cluster = True

        # get initialize flag
        initialize = (self.get_node(self.initialize_path)
                      or [None])[0] if self._INITIALIZE in nodes else None

        # get global dynamic configuration
        config = self.get_node(
            self.config_path,
            watch=self.cluster_watcher) if self._CONFIG in nodes else None
        config = config and ClusterConfig.from_node(config[1].version,
                                                    config[0], config[1].mzxid)

        # get timeline history
        history = self.get_node(
            self.history_path,
            watch=self.cluster_watcher) if self._HISTORY in nodes else None
        history = history and TimelineHistory.from_node(
            history[1].mzxid, history[0])

        # get synchronization state
        sync = self.get_node(
            self.sync_path,
            watch=self.cluster_watcher) if self._SYNC in nodes else None
        sync = SyncState.from_node(sync and sync[1].version, sync and sync[0])

        # get list of members
        sync_standby = sync.leader == self._name and sync.members or []
        members = self.load_members(
            sync_standby) if self._MEMBERS[:-1] in nodes else []

        # get leader
        leader = self.get_node(
            self.leader_path) if self._LEADER in nodes else None
        if leader:
            client_id = self._client.client_id
            if not self._ctl and leader[0] == self._name and client_id is not None \
                    and client_id[0] != leader[1].ephemeralOwner:
                logger.info(
                    'I am leader but not owner of the session. Removing leader node'
                )
                self._client.delete(self.leader_path)
                leader = None

            if leader:
                member = Member(-1, leader[0], None, {})
                member = ([m for m in members if m.name == leader[0]]
                          or [member])[0]
                leader = Leader(leader[1].version, leader[1].ephemeralOwner,
                                member)
                self._fetch_cluster = member.index == -1

        # get last leader operation
        last_leader_operation = self._OPTIME in nodes and self.get_leader_optime(
            leader)

        # failover key
        failover = self.get_node(
            self.failover_path,
            watch=self.cluster_watcher) if self._FAILOVER in nodes else None
        failover = failover and Failover.from_node(failover[1].version,
                                                   failover[0])

        return Cluster(initialize, config, leader, last_leader_operation,
                       members, failover, sync, history)

    def _load_cluster(self):
        cluster = self.cluster
        if self._fetch_cluster or cluster is None:
            try:
                cluster = self._client.retry(self._inner_load_cluster)
            except Exception:
                logger.exception('get_cluster')
                self.cluster_watcher(None)
                raise ZooKeeperError('ZooKeeper in not responding properly')
        # Optime ZNode was updated or doesn't exist and we are not leader
        elif (self._fetch_optime and not self._fetch_cluster or not cluster.last_leader_operation) and\
                not (cluster.leader and cluster.leader.name == self._name):
            try:
                optime = self.get_leader_optime(cluster.leader)
                cluster = Cluster(cluster.initialize, cluster.config,
                                  cluster.leader, optime, cluster.members,
                                  cluster.failover, cluster.sync,
                                  cluster.history)
            except Exception:
                pass
        return cluster

    def _bypass_caches(self):
        self._fetch_cluster = True

    def _create(self, path, value, retry=False, ephemeral=False):
        try:
            if retry:
                self._client.retry(self._client.create,
                                   path,
                                   value,
                                   makepath=True,
                                   ephemeral=ephemeral)
            else:
                self._client.create_async(path,
                                          value,
                                          makepath=True,
                                          ephemeral=ephemeral).get(timeout=1)
            return True
        except Exception:
            logger.exception('Failed to create %s', path)
        return False

    def attempt_to_acquire_leader(self, permanent=False):
        ret = self._create(self.leader_path,
                           self._name.encode('utf-8'),
                           retry=True,
                           ephemeral=not permanent)
        if not ret:
            logger.info('Could not take out TTL lock')
        return ret

    def _set_or_create(self,
                       key,
                       value,
                       index=None,
                       retry=False,
                       do_not_create_empty=False):
        value = value.encode('utf-8')
        try:
            if retry:
                self._client.retry(self._client.set,
                                   key,
                                   value,
                                   version=index or -1)
            else:
                self._client.set_async(key, value, version=index
                                       or -1).get(timeout=1)
            return True
        except NoNodeError:
            if do_not_create_empty and not value:
                return True
            elif index is None:
                return self._create(key, value, retry)
            else:
                return False
        except Exception:
            logger.exception('Failed to update %s', key)
        return False

    def set_failover_value(self, value, index=None):
        return self._set_or_create(self.failover_path, value, index)

    def set_config_value(self, value, index=None):
        return self._set_or_create(self.config_path, value, index, retry=True)

    def initialize(self, create_new=True, sysid=""):
        sysid = sysid.encode('utf-8')
        return self._create(self.initialize_path, sysid, retry=True) if create_new \
            else self._client.retry(self._client.set, self.initialize_path, sysid)

    def touch_member(self, data, permanent=False):
        cluster = self.cluster
        member = cluster and cluster.get_member(self._name,
                                                fallback_to_leader=False)
        encoded_data = json.dumps(data, separators=(',', ':')).encode('utf-8')
        if member and (self._client.client_id is not None
                       and member.session != self._client.client_id[0] or
                       not (deep_compare(member.data.get('tags', {}),
                                         data.get('tags', {})) and
                            member.data.get('version') == data.get('version')
                            and member.data.get('checkpoint_after_promote')
                            == data.get('checkpoint_after_promote'))):
            try:
                self._client.delete_async(self.member_path).get(timeout=1)
            except NoNodeError:
                pass
            except Exception:
                return False
            member = None

        if member:
            if deep_compare(data, member.data):
                return True
        else:
            try:
                self._client.create_async(
                    self.member_path,
                    encoded_data,
                    makepath=True,
                    ephemeral=not permanent).get(timeout=1)
                return True
            except Exception as e:
                if not isinstance(e, NodeExistsError):
                    logger.exception('touch_member')
                    return False
        try:
            self._client.set_async(self.member_path,
                                   encoded_data).get(timeout=1)
            return True
        except Exception:
            logger.exception('touch_member')

        return False

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    def _write_leader_optime(self, last_operation):
        return self._set_or_create(self.leader_optime_path, last_operation)

    def _update_leader(self):
        return True

    def _delete_leader(self):
        self._client.restart()
        return True

    def _cancel_initialization(self):
        node = self.get_node(self.initialize_path)
        if node:
            self._client.delete(self.initialize_path, version=node[1].version)

    def cancel_initialization(self):
        try:
            self._client.retry(self._cancel_initialization)
        except Exception:
            logger.exception("Unable to delete initialize key")

    def delete_cluster(self):
        try:
            return self._client.retry(self._client.delete,
                                      self.client_path(''),
                                      recursive=True)
        except NoNodeError:
            return True

    def set_history_value(self, value):
        return self._set_or_create(self.history_path, value)

    def set_sync_state_value(self, value, index=None):
        return self._set_or_create(self.sync_path,
                                   value,
                                   index,
                                   retry=True,
                                   do_not_create_empty=True)

    def delete_sync_state(self, index=None):
        return self.set_sync_state_value("{}", index)

    def watch(self, leader_index, timeout):
        if super(ZooKeeper, self).watch(leader_index,
                                        timeout) and not self._fetch_optime:
            self._fetch_cluster = True
        return self._fetch_cluster
class ZooKeeper(AbstractDCS):

    def __init__(self, name, config):
        super(ZooKeeper, self).__init__(name, config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        self.exhibitor = None
        if 'exhibitor' in config:
            exhibitor = config['exhibitor']
            interval = exhibitor.get('poll_interval', 300)
            self.exhibitor = ExhibitorEnsembleProvider(exhibitor['hosts'], exhibitor['port'], poll_interval=interval)
            hosts = self.exhibitor.zookeeper_hosts

        self._client = KazooClient(hosts=hosts, timeout=(config.get('session_timeout') or 30),
                                   command_retry={'deadline': (config.get('reconnect_timeout') or 10),
                                                  'max_delay': 1, 'max_tries': -1},
                                   connection_retry={'max_delay': 1, 'max_tries': -1})
        self._client.add_listener(self.session_listener)

        self._my_member_data = None
        self._fetch_cluster = True
        self._last_leader_operation = 0

        self._client.start()

    def session_listener(self, state):
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def cluster_watcher(self, event):
        self._fetch_cluster = True
        self.event.set()

    def get_node(self, key, watch=None):
        try:
            ret = self._client.get(key, watch)
            return (ret[0].decode('utf-8'), ret[1])
        except NoNodeError:
            return None

    @staticmethod
    def member(name, value, znode):
        return Member.from_node(znode.version, name, znode.ephemeralOwner, value)

    def get_children(self, key, watch=None):
        try:
            return self._client.get_children(key, watch)
        except NoNodeError:
            return []

    def load_members(self):
        members = []
        for member in self.get_children(self.members_path, self.cluster_watcher):
            data = self.get_node(self.members_path + member)
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        self._fetch_cluster = False
        self.event.clear()
        nodes = set(self.get_children(self.client_path(''), self.cluster_watcher))
        if not nodes:
            self._fetch_cluster = True

        # get initialize flag
        initialize = (self.get_node(self.initialize_path) or [None])[0] if self._INITIALIZE in nodes else None

        # get list of members
        members = self.load_members() if self._MEMBERS[:-1] in nodes else []

        # get leader
        leader = self.get_node(self.leader_path) if self._LEADER in nodes else None
        if leader:
            client_id = self._client.client_id
            if leader[0] == self._name and client_id is not None and client_id[0] != leader[1].ephemeralOwner:
                logger.info('I am leader but not owner of the session. Removing leader node')
                self._client.delete(self.leader_path)
                leader = None

            if leader:
                member = Member(-1, leader[0], None, {})
                member = ([m for m in members if m.name == leader[0]] or [member])[0]
                leader = Leader(leader[1].version, leader[1].ephemeralOwner, member)
                self._fetch_cluster = member.index == -1

        # failover key
        failover = self.get_node(self.failover_path, watch=self.cluster_watcher) if self._FAILOVER in nodes else None
        if failover:
            failover = Failover.from_node(failover[1].version, failover[0])

        # get last leader operation
        optime = self.get_node(self.leader_optime_path) if self._OPTIME in nodes and self._fetch_cluster else None
        self._last_leader_operation = 0 if optime is None else int(optime[0])
        self._cluster = Cluster(initialize, leader, self._last_leader_operation, members, failover)

    def _load_cluster(self):
        if self.exhibitor and self.exhibitor.poll():
            self._client.set_hosts(self.exhibitor.zookeeper_hosts)

        if self._fetch_cluster or self._cluster is None:
            try:
                self._client.retry(self._inner_load_cluster)
            except:
                logger.exception('get_cluster')
                self.session_listener(KazooState.LOST)
                raise ZooKeeperError('ZooKeeper in not responding properly')

    def _create(self, path, value, **kwargs):
        try:
            self._client.retry(self._client.create, path, value.encode('utf-8'), **kwargs)
            return True
        except:
            return False

    def attempt_to_acquire_leader(self):
        ret = self._create(self.leader_path, self._name, makepath=True, ephemeral=True)
        if not ret:
            logger.info('Could not take out TTL lock')
        return ret

    def set_failover_value(self, value, index=None):
        try:
            self._client.retry(self._client.set, self.failover_path, value.encode('utf-8'), version=index or -1)
            return True
        except NoNodeError:
            return value == '' or (not index and self._create(self.failover_path, value))
        except:
            logging.exception('set_failover_value')
            return False

    def initialize(self, create_new=True, sysid=""):
        return self._create(self.initialize_path, sysid, makepath=True) if create_new \
            else self._client.retry(self._client.set, self.initialize_path,  sysid.encode("utf-8"))

    def touch_member(self, data, ttl=None):
        cluster = self.cluster
        member = cluster and ([m for m in cluster.members if m.name == self._name] or [None])[0]
        path = self.member_path
        data = data.encode('utf-8')
        if member and self._client.client_id is not None and member.session != self._client.client_id[0]:
            try:
                self._client.retry(self._client.delete, path)
            except NoNodeError:
                pass
            except:
                return False
            member = None

        if member and data == self._my_member_data:
            return True

        try:
            if member:
                self._client.retry(self._client.set, path, data)
            else:
                self._client.retry(self._client.create, path, data, makepath=True, ephemeral=True)
            self._my_member_data = data
            return True
        except NodeExistsError:
            try:
                self._client.retry(self._client.set, path, data)
                self._my_member_data = data
                return True
            except:
                logger.exception('touch_member')
        except:
            logger.exception('touch_member')
        return False

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    def write_leader_optime(self, last_operation):
        last_operation = last_operation.encode('utf-8')
        if last_operation != self._last_leader_operation:
            self._last_leader_operation = last_operation
            path = self.leader_optime_path
            try:
                self._client.retry(self._client.set, path, last_operation)
            except NoNodeError:
                try:
                    self._client.retry(self._client.create, path, last_operation, makepath=True)
                except:
                    logger.exception('Failed to create %s', path)
            except:
                logger.exception('Failed to update %s', path)

    def update_leader(self):
        return True

    def delete_leader(self):
        self._client.restart()
        self._my_member_data = None
        return True

    def _cancel_initialization(self):
        node = self.get_node(self.initialize_path)
        if node:
            self._client.delete(self.initialize_path, version=node[1].version)

    def cancel_initialization(self):
        try:
            self._client.retry(self._cancel_initialization)
        except:
            logger.exception("Unable to delete initialize key")

    def delete_cluster(self):
        try:
            return self._client.retry(self._client.delete, self.client_path(''), recursive=True)
        except NoNodeError:
            return True

    def watch(self, timeout):
        if super(ZooKeeper, self).watch(timeout):
            self._fetch_cluster = True
        return self._fetch_cluster
示例#7
0
class ServiceRegister(object):
    def __init__(self, hosts="127.0.0.1:2181", read_only=True, logger=None):
        """
        服务注册
        :param hosts: Zookeeper集群地址列表
        :param read_only: 是否只读
        :param logger: 日志对象
        """
        if not logger:
            import logging
            logging.basicConfig()
        self._zk = KazooClient(hosts, read_only=read_only, logger=logger)
        self._zk.start()

    def restart(self):
        self._zk.restart()

    def retry_get(self, path, watcher=None):
        """
        重读
        :param path: 节点路由
        :param watcher: 观察者回调函数
        :return: 成功:节点值,版本号;失败:异常信息,异常代码。
        """
        return self._zk.retry(self.get, path, watcher)

    def lock(self, path, identifier, timeout=None):
        """
        分布式锁
        :param path: 路由
        :param identifier: 锁标识
        :param timeout: 超时时间
        :return: 锁对象
        """
        return DLock(self._zk, path, identifier, timeout)

    def exist(self, path):
        """
        节点是否存在
        :param path: 路由
        :return: 存在返回True,不存在返回False。
        """
        state = self._zk.exists(path)
        return state is not None

    def create(self, path, value=""):
        """
        创建节点
        :param path: 节点路由
        :param value: 节点值
        :return: 节点路由
        """
        try:
            res_path = self._zk.create(path, value, makepath=True)
        except NodeExistsError:
            return path
        except NoNodeError as e:
            return e.message
        except ZookeeperError as e:
            return e.message
        else:
            return res_path

    def get(self, path, watcher=None):
        """
        查节点值
        :param path: 节点路由
        :param watcher: 观察者回调函数
        :return: 成功:节点值,版本号;失败:异常信息,异常代码。
        """
        try:
            data, state = self._zk.get(path)
            self._zk.DataWatch(path, watcher)
        except NoNodeError as e:
            return e.message, -2
        except ZookeeperError as e:
            return e.message, -3
        else:
            return data, state.version

    def get_children(self, path, watcher=None):
        """
        查子节点列表
        :param path: 节点路由
        :param watcher: 观察者回调函数
        :return: 子节点列表
        """
        try:
            data = self._zk.get_children(path)
            self._zk.DataWatch(path, watcher)
        except NoNodeError as e:
            return [], -2
        except ZookeeperError as e:
            return [], -3
        else:
            return data, 0

    def set(self, path, value, version=-1):
        """
        改节点值
        :param path: 节点路由
        :param value: 节点值
        :param version: 成功:版本号;失败:异常信息。
        """
        try:
            state = self._zk.set(path, value, version)
        except BadVersionError as e:
            return e.message
        except NoNodeError as e:
            return e.message
        except ZookeeperError as e:
            return e.message
        else:
            return state.version
    def run(self):
        server_info = self.mac_and_ip(os.environ.get('MONITOR_INTERFACE', 'eth0'))

        zk_path = '/remote/alive/workstation/%s' % server_info['mac']
        zk = KazooClient(hosts=os.environ.get('ZOOKEEPER', 'zookeeper_server:2181'))
        sleep_time = 10
        while not zk.connected:
            try:
                zk.start()
            except:
                pass
            time.sleep(sleep_time)
        web_keyname, has_error, device_loop, loop = 'api', False, int(10 / sleep_time), 0
        server_info[web_keyname] = {}
        while True:
            try:
                if not zk.connected:
                    zk.restart()
                    logger.info('Restart zk connection!')
                if not zk.exists(zk_path):
                    value = json.dumps(server_info)
                    zk.create(zk_path, value, ephemeral=True, makepath=True)
                    logger.info('Create ZK %s: %s' % (zk_path, value))
                server_info[web_keyname].update({'port': int(os.environ.get('MONITOR_PORT', 80)), 'path': '/api'})
                url = 'http://%s:%d' % (server_info['ip'], server_info[web_keyname]['port'])

                try:
                    r = requests.get('%s/api/ping' % url)
                    if r.status_code == 200:
                        server_info[web_keyname]['status'] = 'up'
                    else:
                        logger.error("%s --- %s" % (r.url, r.text))
                except Error as e:
                    logger.error(e)
                    server_info[web_keyname]['status'] = 'down'
                    server_info[web_keyname]['devices'] = {}
                    server_info[web_keyname]['jobs'] = []
                else:
                    if loop == 0:  # we want not to query devices so frequently, so...
                        devices = requests.get('%s/api/0/devices' % url)
                        if devices.status_code == 200:
                            server_info[web_keyname]['devices'] = devices.json()
                        else:
                            logger.error("%s --- %s" % (devices.url, devices.text))
                            server_info[web_keyname]['devices'] = {}

                    jobs = requests.get('%s/api/0/jobs' % url, params={'all': False})
                    if jobs.status_code == 200:
                        server_info[web_keyname]['jobs'] = jobs.json()['jobs']
                    else:
                        logger.error("%s --- %s" % (jobs.url, jobs.text))
                        server_info[web_keyname]['jobs'] = []

                data, stat = zk.get(zk_path)
                if data and json.loads(data) != server_info:
                    value = json.dumps(server_info)
                    zk.set(zk_path, value)
                    logger.info('Update ZK %s: %s' % (zk_path, value))
            except:
                loop = 0
                if not has_error:
                    logger.error("Connection error!")
                    has_error = True
            else:
                has_error = False
                loop = (loop + 1) % device_loop
            time.sleep(sleep_time)